
API Examples

This guide provides practical examples of common use cases for the ActiCrawl API. Each example includes a complete, ready-to-run code snippet in Python or JavaScript.

Basic Web Scraping

Extract Article Content

Scrape a blog post or news article and get clean, readable content:

python
import requests

api_key = 'YOUR_API_KEY'

response = requests.post(
    'https://www.acticrawl.com/api/v1/scrape',
    headers={'Authorization': f'Bearer {api_key}'},
    json={
        'url': 'https://techcrunch.com/2024/01/15/openai-gpt-5/',
        'output_format': 'markdown',
        'extract_main_content': True
    }
)

article = response.json()
print(article['data']['content'])

Generate Screenshots

Capture a screenshot of any webpage:

javascript
const axios = require('axios');

async function captureScreenshot(url) {
  const response = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: url,
    output_format: 'screenshot',
    wait_for: 3000  // Wait 3 seconds for page to load
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  // Save screenshot to file (if available)
  if (response.data.data.content) {
    const fs = require('fs');
    const buffer = Buffer.from(response.data.data.content, 'base64');
    fs.writeFileSync('screenshot.png', buffer);
  }
}

captureScreenshot('https://stripe.com');

E-commerce Data Extraction

Scrape Product Information

Extract structured product data from e-commerce sites:

python
import requests
import json

def scrape_product(product_url):
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': product_url,
            'output_format': 'json',
            'execute_js': True,
            'wait_for': 5000,  # Wait for dynamic content
            'include_only_tags': 'h1,.price,.description,.product-image,.reviews'
        }
    )

    return response.json()

# Example usage
product = scrape_product('https://amazon.com/dp/B08N5WRWNW')
print(f"Title: {product['data']['metadata']['title']}")
if product['data']['content']:
    print(f"Content: {product['data']['content']}")

Monitor Price Changes

Track product prices over time:

javascript
const cron = require('node-cron');
const axios = require('axios');
const fs = require('fs');

async function checkPrice(url) {
  const response = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: url,
    output_format: 'json',
    include_only_tags: '.price,.product-title'
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  const data = response.data.data;
  const content = data.content; // JSON object
  const timestamp = new Date().toISOString();

  // Log price to CSV
  if (content && content.content) {
    fs.appendFileSync('prices.csv', `${timestamp},${content.content}\n`);
  }

  return content;
}

// Check price every hour
cron.schedule('0 * * * *', () => {
  checkPrice('https://example.com/product');
});

News and Content Aggregation

Scrape Multiple News Articles

Collect articles from a news website:

python
import requests
from datetime import datetime

def scrape_news_article(article_url):
    # Scrape individual article with markdown output
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': article_url,
            'output_format': 'markdown',
            'extract_main_content': True
        }
    )

    return response.json()

def scrape_news_links(base_url):
    # First get all links from the homepage
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': base_url,
            'output_format': 'links'
        }
    )

    links = response.json()['data']['content']
    article_links = [link for link in links if 'article' in link['url'] or 'story' in link['url']]

    return article_links

# Example usage
links = scrape_news_links('https://news.ycombinator.com')
for link in links[:5]:  # Process first 5 articles
    article = scrape_news_article(link['url'])
    print(f"Title: {article['data']['metadata']['title']}")
    print(f"URL: {link['url']}")
    print(f"Content: {article['data']['content'][:200]}...")
    print("-" * 80)

Create RSS Feed from Any Website

Convert any website into an RSS feed:

javascript
const axios = require('axios');
const RSS = require('rss');

async function websiteToRSS(siteUrl) {
  // Get all links from the blog page
  const linksResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: `${siteUrl}/blog`,
    output_format: 'links'
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  const blogLinks = linksResponse.data.data.content.filter(link => 
    link.url.includes('/blog/') && !link.url.endsWith('/blog/')
  ).slice(0, 20);

  // Create RSS feed
  const feed = new RSS({
    title: `${siteUrl} RSS Feed`,
    description: `Latest posts from ${siteUrl}`,
    feed_url: `${siteUrl}/rss`,
    site_url: siteUrl,
  });

  // Scrape each blog post
  for (const link of blogLinks) {
    const articleResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
      url: link.url,
      output_format: 'markdown',
      extract_main_content: true
    }, {
      headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
    });

    const article = articleResponse.data.data;
    feed.item({
      title: article.metadata.title || link.text,
      description: article.content.substring(0, 300) + '...',
      url: link.url,
      date: new Date(article.metadata.scraped_at)
    });
  }

  return feed.xml();
}
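
A minimal usage sketch (the site URL is a placeholder): generate the feed and write the XML to a local file so it can be served or inspected.

javascript
const fs = require('fs');

websiteToRSS('https://example.com')
  .then(xml => {
    // Persist the generated feed locally
    fs.writeFileSync('feed.xml', xml);
    console.log('RSS feed written to feed.xml');
  })
  .catch(err => console.error(`Feed generation failed: ${err.message}`));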

Research and Data Collection

Academic Paper Collection

Collect research papers and their metadata:

python
import requests
import pandas as pd

def collect_research_papers(search_url, query):
    # Note: This example demonstrates how to scrape search results
    # The actual search endpoint would depend on the site's structure

    # First, scrape the search results page
    search_query_url = f"{search_url}/search?q={query.replace(' ', '+')}"

    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': search_query_url,
            'output_format': 'links',
            'execute_js': True,  # Many search pages use JavaScript
            'wait_for': 3000
        }
    )

    links = response.json()['data']['content']

    # Filter for paper links (adjust based on site structure)
    paper_links = [link for link in links if '/abs/' in link['url'] or '/paper/' in link['url']][:50]

    # Scrape each paper for details
    papers = []
    for link in paper_links:
        paper_response = requests.post(
            'https://www.acticrawl.com/api/v1/scrape',
            headers={'Authorization': 'Bearer YOUR_API_KEY'},
            json={
                'url': link['url'],
                'output_format': 'json',
                'extract_main_content': True
            }
        )

        paper_data = paper_response.json()['data']
        papers.append({
            'title': paper_data['metadata']['title'],
            'url': link['url'],
            'content': paper_data['content']['content'][:500] if paper_data['content'] else '',
            'scraped_at': paper_data['metadata']['scraped_at']
        })

    # Convert to DataFrame for analysis
    df = pd.DataFrame(papers)

    # Save to CSV
    df.to_csv(f'research_{query.replace(" ", "_")}.csv', index=False)

    return df

# Example usage
papers = collect_research_papers(
    'https://arxiv.org',
    'machine learning transformers'
)

Website Change Detection

Monitor websites for changes:

javascript
const axios = require('axios');
const crypto = require('crypto');
const nodemailer = require('nodemailer');

async function checkForChanges(url, selector) {
  // Scrape current content
  const response = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: url,
    output_format: 'html_cleaned',
    include_only_tags: selector
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  const content = response.data.data.content;
  const hash = crypto.createHash('md5').update(content).digest('hex');

  // Compare with stored hash
  const fs = require('fs');
  const hashFile = 'content-hashes.json';
  let hashes = {};

  if (fs.existsSync(hashFile)) {
    hashes = JSON.parse(fs.readFileSync(hashFile));
  }

  if (hashes[url] && hashes[url] !== hash) {
    // Content changed!
    await sendNotification(url, content);
  }

  // Update hash
  hashes[url] = hash;
  fs.writeFileSync(hashFile, JSON.stringify(hashes));
}

async function sendNotification(url, newContent) {
  // Send email notification
  const transporter = nodemailer.createTransport({
    // Configure your email service
  });

  await transporter.sendMail({
    to: 'your-email@example.com',
    subject: `Website Changed: ${url}`,
    html: `The website has been updated:<br><br>${newContent}`
  });
}
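
To run the check on a schedule, you can reuse the node-cron pattern from the price-monitoring example above. A minimal sketch, assuming checkForChanges is defined as shown; the URL and CSS selector are placeholders:

javascript
const cron = require('node-cron');

// Check the monitored page every 30 minutes
cron.schedule('*/30 * * * *', () => {
  checkForChanges('https://example.com/pricing', 'main').catch(err => {
    console.error(`Change check failed: ${err.message}`);
  });
});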

SEO and Marketing

Competitor Analysis

Analyze competitor websites:

python
import requests
from collections import Counter
import re

def analyze_competitor(competitor_url):
    # First get all links from the homepage
    links_response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': competitor_url,
            'output_format': 'links'
        }
    )

    all_links = links_response.json()['data']['content']

    # Filter internal links only
    internal_links = [link for link in all_links if competitor_url in link['url']][:100]

    # Analyze a sample of pages
    keywords = []
    page_structures = {}

    for link in internal_links[:10]:
        page_response = requests.post(
            'https://www.acticrawl.com/api/v1/scrape',
            headers={'Authorization': 'Bearer YOUR_API_KEY'},
            json={
                'url': link['url'],
                'output_format': 'json',
                'extract_main_content': True
            }
        )

        data = page_response.json()['data']

        # Extract keywords from content
        if data['content'] and 'content' in data['content']:
            # Simple keyword extraction (in production, use NLP libraries)
            words = re.findall(r'\b\w+\b', data['content']['content'].lower())
            # Filter common words and short words
            meaningful_words = [w for w in words if len(w) > 4]
            keywords.extend(meaningful_words)

        # Track page structure
        if '/product' in link['url']:
            page_type = 'product'
        elif '/blog' in link['url']:
            page_type = 'blog'
        elif '/category' in link['url']:
            page_type = 'category'
        else:
            page_type = 'other'
        page_structures[page_type] = page_structures.get(page_type, 0) + 1

    # Analyze keywords
    keyword_freq = Counter(keywords)

    return {
        'total_pages': len(internal_links),
        'page_types': page_structures,
        'top_keywords': keyword_freq.most_common(20)
    }

# Example usage
analysis = analyze_competitor('https://competitor.com')
print(f"Total pages: {analysis['total_pages']}")
print(f"Page types: {analysis['page_types']}")
print(f"Top keywords: {analysis['top_keywords']}")

Generate Content Ideas

Find trending topics in your industry:

javascript
const axios = require('axios');

async function findTrendingTopics(industrySites) {
  const allArticles = [];

  // Scrape recent articles from multiple sites
  for (const site of industrySites) {
    // Get links from each site
    const linksResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
      url: site,
      output_format: 'links'
    }, {
      headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
    });

    // Filter for article/blog links
    const articleLinks = linksResponse.data.data.content.filter(link => 
      link.url.includes('/article') || 
      link.url.includes('/blog') || 
      link.url.includes('/post')
    ).slice(0, 10);

    // Scrape each article
    for (const link of articleLinks) {
      const articleResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
        url: link.url,
        output_format: 'markdown',
        extract_main_content: true
      }, {
        headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
      });

      allArticles.push({
        url: link.url,
        title: articleResponse.data.data.metadata.title,
        content: articleResponse.data.data.content
      });
    }
  }

  // Extract topics and keywords
  const topics = {};

  allArticles.forEach(article => {
    // Simple keyword extraction from title and content
    const text = `${article.title} ${article.content}`.toLowerCase();
    const words = text.match(/\b\w{5,}\b/g) || [];

    // Count word frequency
    words.forEach(word => {
      topics[word] = (topics[word] || 0) + 1;
    });
  });

  // Sort by frequency
  const trending = Object.entries(topics)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10);

  return trending;
}

// Example usage
const sites = [
  'https://techcrunch.com',
  'https://theverge.com',
  'https://wired.com'
];

findTrendingTopics(sites).then(trends => {
  console.log('Trending topics:');
  trends.forEach(([topic, count]) => {
    console.log(`- ${topic}: mentioned ${count} times`);
  });
});

Error Handling and Best Practices

Robust Error Handling

Always implement proper error handling:

python
import requests
import time
from requests.exceptions import RequestException

def scrape_with_retry(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.post(
                'https://www.acticrawl.com/api/v1/scrape',
                headers={'Authorization': 'Bearer YOUR_API_KEY'},
                json={'url': url, 'output_format': 'markdown'},
                timeout=30
            )

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                # Rate limited - wait and retry
                reset_at = response.headers.get('X-RateLimit-Reset')
                print(f"Rate limited (resets at {reset_at}). Waiting 60 seconds before retrying...")
                time.sleep(60)
            else:
                print(f"Error {response.status_code}: {response.text}")

        except RequestException as e:
            print(f"Request failed: {e}")

        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)  # Exponential backoff

    return None
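
The same retry pattern translates directly to JavaScript. A minimal sketch built on axios that mirrors the Python logic above (a fixed 60-second wait on 429 responses and exponential backoff between attempts):

javascript
const axios = require('axios');

async function scrapeWithRetry(url, maxRetries = 3) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const response = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
        url: url,
        output_format: 'markdown'
      }, {
        headers: { 'Authorization': 'Bearer YOUR_API_KEY' },
        timeout: 30000
      });
      return response.data;
    } catch (err) {
      if (err.response && err.response.status === 429) {
        // Rate limited - wait 60 seconds before the next attempt
        await new Promise(resolve => setTimeout(resolve, 60000));
      } else if (err.response) {
        console.error(`Error ${err.response.status}: ${err.response.statusText}`);
      } else {
        console.error(`Request failed: ${err.message}`);
      }
    }

    if (attempt < maxRetries - 1) {
      // Exponential backoff between attempts
      await new Promise(resolve => setTimeout(resolve, 2 ** attempt * 1000));
    }
  }

  return null;
}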

Batch Processing

Process multiple URLs efficiently:

javascript
const axios = require('axios');

async function batchScrape(urls, batchSize = 5) {
  const results = [];

  // Process URLs in batches
  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls.slice(i, i + batchSize);

    const promises = batch.map(url => 
      axios.post('https://www.acticrawl.com/api/v1/scrape', {
        url: url,
        output_format: 'markdown',
        extract_main_content: true
      }, {
        headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
      }).catch(err => ({ error: err.message, url }))
    );

    const batchResults = await Promise.all(promises);
    results.push(...batchResults);

    // Respect rate limits
    if (i + batchSize < urls.length) {
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
  }

  return results;
}
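
A minimal usage sketch with placeholder URLs. Successful entries are full axios responses (the scraped data lives under result.data.data), while failed requests come back as { error, url } objects thanks to the catch above:

javascript
const urls = [
  'https://example.com/page-1',
  'https://example.com/page-2',
  'https://example.com/page-3'
];

batchScrape(urls).then(results => {
  results.forEach(result => {
    if (result.error) {
      console.error(`Failed: ${result.url} (${result.error})`);
    } else {
      console.log(`Scraped: ${result.data.data.metadata.title || 'untitled page'}`);
    }
  });
});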

Next Steps

These examples demonstrate just a few of the many possibilities with the ActiCrawl API. For more advanced use cases:

  1. Explore our API Reference for complete endpoint documentation
  2. Check out our SDKs and Libraries for your programming language
  3. Join our Developer Community for support and discussions
  4. Contact our Enterprise Sales team for custom solutions

Happy scraping! 🚀