
API Examples

This guide provides practical examples of common use cases for the ActiCrawl API. Each example includes a complete, ready-to-run code snippet in Python or JavaScript.

Basic Web Scraping

Extract Article Content

Scrape a blog post or news article and get clean, readable content:

python
import requests

api_key = 'YOUR_API_KEY'

response = requests.post(
    'https://api.acticrawl.com/v1/scrape',
    headers={'X-API-Key': api_key},
    json={
        'url': 'https://techcrunch.com/2024/01/15/openai-gpt-5/',
        'formats': ['markdown'],
        'options': {
            'onlyMainContent': True
        }
    }
)

article = response.json()
print(article['data']['markdown'])

Generate Screenshots

Capture a screenshot of any webpage:

javascript
const axios = require('axios');
const fs = require('fs');

async function captureScreenshot(url) {
  const response = await axios.post('https://api.acticrawl.com/v1/scrape', {
    url: url,
    formats: ['screenshot'],
    options: {
      waitFor: 3000  // Wait 3 seconds for page to load
    }
  }, {
    headers: { 'X-API-Key': 'YOUR_API_KEY' }
  });

  // Save screenshot to file
  const buffer = Buffer.from(response.data.data.screenshot, 'base64');
  fs.writeFileSync('screenshot.png', buffer);
}

captureScreenshot('https://stripe.com');

E-commerce Data Extraction

Scrape Product Information

Extract structured product data from e-commerce sites:

python
import requests
import json

def scrape_product(product_url):
    response = requests.post(
        'https://api.acticrawl.com/v1/scrape',
        headers={'X-API-Key': 'YOUR_API_KEY'},
        json={
            'url': product_url,
            'formats': ['json', 'screenshot'],
            'options': {
                'waitFor': 5000,  # Wait for dynamic content
                'onlyIncludeTags': [
                    'h1',  # Product title
                    '.price',  # Price
                    '.description',  # Description
                    '.product-image',  # Images
                    '.reviews'  # Reviews
                ]
            }
        }
    )

    return response.json()

# Example usage
product = scrape_product('https://amazon.com/dp/B08N5WRWNW')
print(f"Title: {product['data']['metadata']['title']}")
print(f"Price: {product['data']['json']['price']}")

Monitor Price Changes

Track product prices over time:

javascript
const cron = require('node-cron');
const axios = require('axios');
const fs = require('fs');

async function checkPrice(url) {
  const response = await axios.post('https://api.acticrawl.com/v1/scrape', {
    url: url,
    formats: ['json'],
    options: {
      onlyIncludeTags: ['.price', '.product-title']
    }
  }, {
    headers: { 'X-API-Key': 'YOUR_API_KEY' }
  });

  const data = response.data.data;
  const price = data.json.price;
  const timestamp = new Date().toISOString();

  // Log price to CSV
  fs.appendFileSync('prices.csv', `${timestamp},${price}\n`);

  return price;
}

// Check price every hour
cron.schedule('0 * * * *', () => {
  checkPrice('https://example.com/product');
});
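
Each run appends a timestamp,price row to prices.csv, so the price history can be charted or diffed later. Swap the example URL for the product page you actually want to track.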

News and Content Aggregation

Scrape Multiple News Articles

Collect articles from a news website:

python
import requests
from datetime import datetime

def scrape_news_site(base_url, max_articles=10):
    # Crawl from the homepage, following links that match the article pattern
    response = requests.post(
        'https://api.acticrawl.com/v1/crawl',
        headers={'X-API-Key': 'YOUR_API_KEY'},
        json={
            'url': base_url,
            'match': f'{base_url}/article/**',
            'selector': 'article a[href]',
            'limit': max_articles,
            'depth': 1,
            'formats': ['markdown'],
            'options': {
                'onlyMainContent': True
            }
        }
    )

    articles = response.json()['data']

    # Process articles
    for article in articles:
        print(f"Title: {article['metadata']['title']}")
        print(f"URL: {article['url']}")
        print(f"Published: {article['metadata']['publishedTime']}")
        print(f"Content: {article['markdown'][:200]}...")
        print("-" * 80)

    return articles

# Example usage (the target site must expose links matching the /article/ pattern)
news = scrape_news_site('https://news.ycombinator.com', max_articles=5)

Create RSS Feed from Any Website

Convert any website into an RSS feed:

javascript
const axios = require('axios');
const RSS = require('rss');

async function websiteToRSS(siteUrl) {
  // Crawl website for recent posts
  const response = await axios.post('https://api.acticrawl.com/v1/crawl', {
    url: siteUrl,
    match: `${siteUrl}/blog/**`,
    limit: 20,
    depth: 1,
    formats: ['markdown'],
    options: {
      onlyMainContent: true
    }
  }, {
    headers: { 'X-API-Key': 'YOUR_API_KEY' }
  });

  // Create RSS feed
  const feed = new RSS({
    title: `${siteUrl} RSS Feed`,
    description: `Latest posts from ${siteUrl}`,
    feed_url: `${siteUrl}/rss`,
    site_url: siteUrl,
  });

  // Add items to feed
  response.data.data.forEach(article => {
    feed.item({
      title: article.metadata.title,
      description: article.markdown.substring(0, 300) + '...',
      url: article.url,
      date: article.metadata.publishedTime || new Date()
    });
  });

  return feed.xml();
}
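
The function returns the feed as an XML string, so the simplest next step is to write it to disk (or serve it from an HTTP route). A minimal usage sketch; the target URL and output filename are placeholders:

javascript
const fs = require('fs');

websiteToRSS('https://example.com')
  .then(xml => fs.writeFileSync('feed.xml', xml))
  .catch(err => console.error('Feed generation failed:', err.message));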

Research and Data Collection

Academic Paper Collection

Collect research papers and their metadata:

python
import requests
import pandas as pd

def collect_research_papers(search_url, query):
    response = requests.post(
        'https://api.acticrawl.com/v1/search',
        headers={'X-API-Key': 'YOUR_API_KEY'},
        json={
            'url': search_url,
            'query': query,
            'limit': 50,
            'depth': 2
        }
    )

    papers = response.json()['data']['results']

    # Convert to DataFrame for analysis
    df = pd.DataFrame([{
        'title': p['title'],
        'url': p['url'],
        'excerpt': p['excerpt'],
        'relevance': p['relevance_score']
    } for p in papers])

    # Save to CSV
    df.to_csv(f'research_{query}.csv', index=False)

    return df

# Example usage
papers = collect_research_papers(
    'https://arxiv.org',
    'machine learning transformers'
)

Website Change Detection

Monitor websites for changes:

javascript
const axios = require('axios');
const crypto = require('crypto');
const fs = require('fs');
const nodemailer = require('nodemailer');

async function checkForChanges(url, selector) {
  // Scrape current content
  const response = await axios.post('https://api.acticrawl.com/v1/scrape', {
    url: url,
    formats: ['html'],
    options: {
      onlyIncludeTags: [selector]
    }
  }, {
    headers: { 'X-API-Key': 'YOUR_API_KEY' }
  });

  const content = response.data.data.html;
  const hash = crypto.createHash('md5').update(content).digest('hex');

  // Compare with stored hash
  const hashFile = 'content-hashes.json';
  let hashes = {};

  if (fs.existsSync(hashFile)) {
    hashes = JSON.parse(fs.readFileSync(hashFile));
  }

  if (hashes[url] && hashes[url] !== hash) {
    // Content changed!
    await sendNotification(url, content);
  }

  // Update hash
  hashes[url] = hash;
  fs.writeFileSync(hashFile, JSON.stringify(hashes));
}

async function sendNotification(url, newContent) {
  // Send email notification
  const transporter = nodemailer.createTransport({
    // Configure your email service
  });

  await transporter.sendMail({
    to: 'your-email@example.com',
    subject: `Website Changed: ${url}`,
    html: `The website has been updated:<br><br>${newContent}`
  });
}
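
To run the check on a schedule, the same node-cron approach used in the price-monitoring example applies here. A sketch, where the URL, selector, and interval are placeholders:

javascript
const cron = require('node-cron');

// Check the page every 30 minutes (URL and selector are examples)
cron.schedule('*/30 * * * *', () => {
  checkForChanges('https://example.com/pricing', '.pricing-table')
    .catch(err => console.error('Change check failed:', err.message));
});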

SEO and Marketing

Competitor Analysis

Analyze competitor websites:

python
import requests
from collections import Counter

def analyze_competitor(competitor_url):
    # Get site structure
    map_response = requests.post(
        'https://api.acticrawl.com/v1/map',
        headers={'X-API-Key': 'YOUR_API_KEY'},
        json={
            'url': competitor_url,
            'limit': 100,
            'depth': 3
        }
    )

    site_structure = map_response.json()['data']

    # Analyze a sample of pages
    sample_urls = site_structure['urls'][:10]
    keywords = []

    for url in sample_urls:
        page_response = requests.post(
            'https://api.acticrawl.com/v1/scrape',
            headers={'X-API-Key': 'YOUR_API_KEY'},
            json={
                'url': url,
                'formats': ['json'],
                'options': {
                    'onlyMainContent': True
                }
            }
        )

        metadata = page_response.json()['data']['metadata']
        if metadata.get('keywords'):
            keywords.extend(metadata['keywords'])

    # Analyze keywords
    keyword_freq = Counter(keywords)

    return {
        'total_pages': len(site_structure['urls']),
        'site_structure': site_structure['structure'],
        'top_keywords': keyword_freq.most_common(20)
    }

# Example usage
analysis = analyze_competitor('https://competitor.com')
print(f"Total pages: {analysis['total_pages']}")
print(f"Top keywords: {analysis['top_keywords']}")

Generate Content Ideas

Find trending topics in your industry:

javascript
const axios = require('axios');

async function findTrendingTopics(industrySites) {
  const allArticles = [];

  // Scrape recent articles from multiple sites
  for (const site of industrySites) {
    const response = await axios.post('https://api.acticrawl.com/v1/crawl', {
      url: site,
      match: `${site}/**`,
      selector: 'article a, .post a',
      limit: 20,
      depth: 1,
      formats: ['markdown'],
      options: {
        onlyMainContent: true
      }
    }, {
      headers: { 'X-API-Key': 'YOUR_API_KEY' }
    });

    allArticles.push(...response.data.data);
  }

  // Extract topics and keywords
  const topics = {};

  allArticles.forEach(article => {
    const keywords = article.metadata.keywords || [];
    keywords.forEach(keyword => {
      topics[keyword] = (topics[keyword] || 0) + 1;
    });
  });

  // Sort by frequency
  const trending = Object.entries(topics)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10);

  return trending;
}

// Example usage
const sites = [
  'https://techcrunch.com',
  'https://theverge.com',
  'https://wired.com'
];

findTrendingTopics(sites).then(trends => {
  console.log('Trending topics:');
  trends.forEach(([topic, count]) => {
    console.log(`- ${topic}: mentioned ${count} times`);
  });
});

Error Handling and Best Practices

Robust Error Handling

Always implement proper error handling:

python
import requests
import time
from requests.exceptions import RequestException

def scrape_with_retry(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.post(
                'https://api.acticrawl.com/v1/scrape',
                headers={'X-API-Key': 'YOUR_API_KEY'},
                json={'url': url, 'formats': ['markdown']},
                timeout=30
            )

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                # Rate limited - wait and retry
                retry_after = int(response.headers.get('Retry-After', 60))
                print(f"Rate limited. Waiting {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                print(f"Error {response.status_code}: {response.text}")

        except RequestException as e:
            print(f"Request failed: {e}")

        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)  # Exponential backoff

    return None
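
The helper returns None after exhausting its retries, backs off exponentially between attempts, and honors the Retry-After header when the API responds with 429, so rate limits are respected instead of retried immediately.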

Batch Processing

Process multiple URLs efficiently:

javascript
const axios = require('axios');

async function batchScrape(urls, batchSize = 5) {
  const results = [];

  // Process URLs in batches
  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls.slice(i, i + batchSize);

    const promises = batch.map(url => 
      axios.post('https://api.acticrawl.com/v1/scrape', {
        url: url,
        formats: ['markdown', 'metadata']
      }, {
        headers: { 'X-API-Key': 'YOUR_API_KEY' }
      }).catch(err => ({ error: err.message, url }))
    );

    const batchResults = await Promise.all(promises);
    results.push(...batchResults);

    // Respect rate limits
    if (i + batchSize < urls.length) {
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
  }

  return results;
}
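
A usage sketch with placeholder URLs; because failed requests resolve to { error, url } objects instead of throwing, they can be separated from successful results afterwards:

javascript
const urls = [
  'https://example.com/page-1',
  'https://example.com/page-2',
  'https://example.com/page-3'
];

batchScrape(urls).then(results => {
  const failed = results.filter(r => r.error);
  console.log(`Scraped ${results.length - failed.length} of ${urls.length} pages`);
  failed.forEach(f => console.log(`Failed: ${f.url} (${f.error})`));
});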

Next Steps

These examples demonstrate just a few of the many possibilities with the ActiCrawl API. For more advanced use cases:

  1. Explore our API Reference for complete endpoint documentation
  2. Check out our SDKs and Libraries for your programming language
  3. Join our Developer Community for support and discussions
  4. Contact our Enterprise Sales team for custom solutions

Happy scraping! 🚀