API Examples
This guide provides practical examples of common use cases for the ActiCrawl API. Each example is a complete, runnable snippet in Python or Node.js that you can adapt to your own project.
Basic Web Scraping
Extract Article Content
Scrape a blog post or news article and get clean, readable content:
import requests

api_key = 'YOUR_API_KEY'

response = requests.post(
    'https://www.acticrawl.com/api/v1/scrape',
    headers={'Authorization': f'Bearer {api_key}'},
    json={
        'url': 'https://techcrunch.com/2024/01/15/openai-gpt-5/',
        'output_format': 'markdown',
        'extract_main_content': True
    }
)

article = response.json()
print(article['data']['content'])
Generate Screenshots
Capture a screenshot of any webpage:
const axios = require('axios');

async function captureScreenshot(url) {
  const response = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: url,
    output_format: 'screenshot',
    wait_for: 3000 // Wait 3 seconds for the page to load
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  // Save the screenshot to a file (if available)
  if (response.data.data.content) {
    const fs = require('fs');
    const buffer = Buffer.from(response.data.data.content, 'base64');
    fs.writeFileSync('screenshot.png', buffer);
  }
}

captureScreenshot('https://stripe.com');
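The same request works from Python as well. Here is a minimal sketch using requests, assuming (as the Node.js example above does) that the screenshot comes back base64-encoded in data.content:

import base64
import requests

response = requests.post(
    'https://www.acticrawl.com/api/v1/scrape',
    headers={'Authorization': 'Bearer YOUR_API_KEY'},
    json={
        'url': 'https://stripe.com',
        'output_format': 'screenshot',
        'wait_for': 3000  # Wait 3 seconds for the page to load
    }
)

content = response.json()['data']['content']
if content:
    # Decode the base64 payload and write it to disk
    with open('screenshot.png', 'wb') as f:
        f.write(base64.b64decode(content))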
E-commerce Data Extraction
Scrape Product Information
Extract structured product data from e-commerce sites:
import requests

def scrape_product(product_url):
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': product_url,
            'output_format': 'json',
            'execute_js': True,
            'wait_for': 5000,  # Wait for dynamic content
            'include_only_tags': 'h1,.price,.description,.product-image,.reviews'
        }
    )
    return response.json()

# Example usage
product = scrape_product('https://amazon.com/dp/B08N5WRWNW')
print(f"Title: {product['data']['metadata']['title']}")
if product['data']['content']:
    print(f"Content: {product['data']['content']}")
Monitor Price Changes
Track product prices over time:
const cron = require('node-cron');
const axios = require('axios');
const fs = require('fs');

async function checkPrice(url) {
  const response = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: url,
    output_format: 'json',
    include_only_tags: '.price,.product-title'
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  const data = response.data.data;
  const content = data.content; // JSON object
  const timestamp = new Date().toISOString();

  // Log the price to a CSV file
  if (content && content.content) {
    fs.appendFileSync('prices.csv', `${timestamp},${content.content}\n`);
  }

  return content;
}

// Check the price every hour
cron.schedule('0 * * * *', () => {
  checkPrice('https://example.com/product');
});
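If your scheduler runs in Python instead, the same check fits in a plain loop. This is a sketch only: the check_price name and the hourly sleep are illustrative, and the selector assumptions match the Node.js version above.

import csv
import time
from datetime import datetime, timezone

import requests

def check_price(url):
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': url,
            'output_format': 'json',
            'include_only_tags': '.price,.product-title'
        }
    )
    content = response.json()['data']['content']
    timestamp = datetime.now(timezone.utc).isoformat()

    # Append the scraped price snippet to a CSV log
    if content and content.get('content'):
        with open('prices.csv', 'a', newline='') as f:
            csv.writer(f).writerow([timestamp, content['content']])
    return content

# Check the price every hour
while True:
    check_price('https://example.com/product')
    time.sleep(3600)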
News and Content Aggregation
Scrape Multiple News Articles
Collect articles from a news website:
import requests

def scrape_news_article(article_url):
    # Scrape an individual article with markdown output
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': article_url,
            'output_format': 'markdown',
            'extract_main_content': True
        }
    )
    return response.json()

def scrape_news_links(base_url):
    # First get all links from the homepage
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': base_url,
            'output_format': 'links'
        }
    )
    links = response.json()['data']['content']
    article_links = [link for link in links if 'article' in link['url'] or 'story' in link['url']]
    return article_links

# Example usage
links = scrape_news_links('https://news.ycombinator.com')
for link in links[:5]:  # Process the first 5 articles
    article = scrape_news_article(link['url'])
    print(f"Title: {article['data']['metadata']['title']}")
    print(f"URL: {link['url']}")
    print(f"Content: {article['data']['content'][:200]}...")
    print("-" * 80)
Create RSS Feed from Any Website
Convert any website into an RSS feed:
const axios = require('axios');
const RSS = require('rss');

async function websiteToRSS(siteUrl) {
  // Get all links from the blog page
  const linksResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: `${siteUrl}/blog`,
    output_format: 'links'
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  const blogLinks = linksResponse.data.data.content.filter(link =>
    link.url.includes('/blog/') && !link.url.endsWith('/blog/')
  ).slice(0, 20);

  // Create the RSS feed
  const feed = new RSS({
    title: `${siteUrl} RSS Feed`,
    description: `Latest posts from ${siteUrl}`,
    feed_url: `${siteUrl}/rss`,
    site_url: siteUrl,
  });

  // Scrape each blog post and add it as a feed item
  for (const link of blogLinks) {
    const articleResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
      url: link.url,
      output_format: 'markdown',
      extract_main_content: true
    }, {
      headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
    });

    const article = articleResponse.data.data;
    feed.item({
      title: article.metadata.title || link.text,
      description: article.content.substring(0, 300) + '...',
      url: link.url,
      date: new Date(article.metadata.scraped_at)
    });
  }

  return feed.xml();
}
Research and Data Collection
Academic Paper Collection
Collect research papers and their metadata:
import requests
import pandas as pd

def collect_research_papers(search_url, query):
    # Note: this example demonstrates how to scrape search results;
    # the actual search URL pattern depends on the site's structure.

    # First, scrape the search results page
    search_query_url = f"{search_url}/search?q={query.replace(' ', '+')}"
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': search_query_url,
            'output_format': 'links',
            'execute_js': True,  # Many search pages rely on JavaScript
            'wait_for': 3000
        }
    )
    links = response.json()['data']['content']

    # Filter for paper links (adjust based on site structure)
    paper_links = [link for link in links if '/abs/' in link['url'] or '/paper/' in link['url']][:50]

    # Scrape each paper for details
    papers = []
    for link in paper_links:
        paper_response = requests.post(
            'https://www.acticrawl.com/api/v1/scrape',
            headers={'Authorization': 'Bearer YOUR_API_KEY'},
            json={
                'url': link['url'],
                'output_format': 'json',
                'extract_main_content': True
            }
        )
        paper_data = paper_response.json()['data']
        papers.append({
            'title': paper_data['metadata']['title'],
            'url': link['url'],
            'content': paper_data['content']['content'][:500] if paper_data['content'] else '',
            'scraped_at': paper_data['metadata']['scraped_at']
        })

    # Convert to a DataFrame for analysis
    df = pd.DataFrame(papers)

    # Save to CSV
    df.to_csv(f'research_{query.replace(" ", "_")}.csv', index=False)
    return df

# Example usage
papers = collect_research_papers(
    'https://arxiv.org',
    'machine learning transformers'
)
Website Change Detection
Monitor websites for changes:
const axios = require('axios');
const crypto = require('crypto');
const fs = require('fs');
const nodemailer = require('nodemailer');

async function checkForChanges(url, selector) {
  // Scrape the current content
  const response = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
    url: url,
    output_format: 'html_cleaned',
    include_only_tags: selector
  }, {
    headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
  });

  const content = response.data.data.content;
  const hash = crypto.createHash('md5').update(content).digest('hex');

  // Compare with the stored hash
  const hashFile = 'content-hashes.json';
  let hashes = {};
  if (fs.existsSync(hashFile)) {
    hashes = JSON.parse(fs.readFileSync(hashFile));
  }

  if (hashes[url] && hashes[url] !== hash) {
    // Content changed - send a notification
    await sendNotification(url, content);
  }

  // Update the stored hash
  hashes[url] = hash;
  fs.writeFileSync(hashFile, JSON.stringify(hashes));
}

async function sendNotification(url, newContent) {
  // Send an email notification
  const transporter = nodemailer.createTransport({
    // Configure your email service here
  });

  await transporter.sendMail({
    to: 'your-email@example.com',
    subject: `Website Changed: ${url}`,
    html: `The website has been updated:<br><br>${newContent}`
  });
}
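The same change check translates directly to Python with hashlib. Below is a minimal sketch that keeps hashes in the same JSON file as the Node.js version; notification delivery is left out, and check_for_changes is an illustrative name.

import hashlib
import json
import os

import requests

HASH_FILE = 'content-hashes.json'

def check_for_changes(url, selector):
    response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': url,
            'output_format': 'html_cleaned',
            'include_only_tags': selector
        }
    )
    content = response.json()['data']['content']
    digest = hashlib.md5(content.encode('utf-8')).hexdigest()

    # Load previously stored hashes, if any
    hashes = {}
    if os.path.exists(HASH_FILE):
        with open(HASH_FILE) as f:
            hashes = json.load(f)

    changed = url in hashes and hashes[url] != digest

    # Store the new hash for the next run
    hashes[url] = digest
    with open(HASH_FILE, 'w') as f:
        json.dump(hashes, f)

    return changed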
SEO and Marketing
Competitor Analysis
Analyze competitor websites:
import requests
from collections import Counter
import re

def analyze_competitor(competitor_url):
    # First get all links from the homepage
    links_response = requests.post(
        'https://www.acticrawl.com/api/v1/scrape',
        headers={'Authorization': 'Bearer YOUR_API_KEY'},
        json={
            'url': competitor_url,
            'output_format': 'links'
        }
    )
    all_links = links_response.json()['data']['content']

    # Keep internal links only
    internal_links = [link for link in all_links if competitor_url in link['url']][:100]

    # Analyze a sample of pages
    keywords = []
    page_structures = {}

    for link in internal_links[:10]:
        page_response = requests.post(
            'https://www.acticrawl.com/api/v1/scrape',
            headers={'Authorization': 'Bearer YOUR_API_KEY'},
            json={
                'url': link['url'],
                'output_format': 'json',
                'extract_main_content': True
            }
        )
        data = page_response.json()['data']

        # Extract keywords from the content
        if data['content'] and 'content' in data['content']:
            # Simple keyword extraction (in production, use an NLP library)
            words = re.findall(r'\b\w+\b', data['content']['content'].lower())
            # Keep only longer words to skip most common terms
            meaningful_words = [w for w in words if len(w) > 4]
            keywords.extend(meaningful_words)

        # Track the page structure
        page_type = 'product' if '/product' in link['url'] else \
                    'blog' if '/blog' in link['url'] else \
                    'category' if '/category' in link['url'] else 'other'
        page_structures[page_type] = page_structures.get(page_type, 0) + 1

    # Analyze keyword frequency
    keyword_freq = Counter(keywords)

    return {
        'total_pages': len(internal_links),
        'page_types': page_structures,
        'top_keywords': keyword_freq.most_common(20)
    }

# Example usage
analysis = analyze_competitor('https://competitor.com')
print(f"Total pages: {analysis['total_pages']}")
print(f"Page types: {analysis['page_types']}")
print(f"Top keywords: {analysis['top_keywords']}")
Generate Content Ideas
Find trending topics in your industry:
const axios = require('axios');

async function findTrendingTopics(industrySites) {
  const allArticles = [];

  // Scrape recent articles from multiple sites
  for (const site of industrySites) {
    // Get links from each site
    const linksResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
      url: site,
      output_format: 'links'
    }, {
      headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
    });

    // Filter for article/blog links
    const articleLinks = linksResponse.data.data.content.filter(link =>
      link.url.includes('/article') ||
      link.url.includes('/blog') ||
      link.url.includes('/post')
    ).slice(0, 10);

    // Scrape each article
    for (const link of articleLinks) {
      const articleResponse = await axios.post('https://www.acticrawl.com/api/v1/scrape', {
        url: link.url,
        output_format: 'markdown',
        extract_main_content: true
      }, {
        headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
      });

      allArticles.push({
        url: link.url,
        title: articleResponse.data.data.metadata.title,
        content: articleResponse.data.data.content
      });
    }
  }

  // Extract topics and keywords
  const topics = {};
  allArticles.forEach(article => {
    // Simple keyword extraction from the title and content
    const text = `${article.title} ${article.content}`.toLowerCase();
    const words = text.match(/\b\w{5,}\b/g) || [];

    // Count word frequency
    words.forEach(word => {
      topics[word] = (topics[word] || 0) + 1;
    });
  });

  // Sort by frequency and keep the top 10
  const trending = Object.entries(topics)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10);

  return trending;
}

// Example usage
const sites = [
  'https://techcrunch.com',
  'https://theverge.com',
  'https://wired.com'
];

findTrendingTopics(sites).then(trends => {
  console.log('Trending topics:');
  trends.forEach(([topic, count]) => {
    console.log(`- ${topic}: mentioned ${count} times`);
  });
});
Error Handling and Best Practices
Robust Error Handling
Always implement proper error handling:
import requests
import time
from requests.exceptions import RequestException

def scrape_with_retry(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.post(
                'https://www.acticrawl.com/api/v1/scrape',
                headers={'Authorization': 'Bearer YOUR_API_KEY'},
                json={'url': url, 'output_format': 'markdown'},
                timeout=30
            )

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                # Rate limited - wait before retrying
                retry_after = int(response.headers.get('X-RateLimit-Reset', 60))
                print(f"Rate limited. Waiting {retry_after} seconds before retrying...")
                time.sleep(retry_after)
            else:
                print(f"Error {response.status_code}: {response.text}")

        except RequestException as e:
            print(f"Request failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff

    return None
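Usage is then identical to a plain call, with None signalling that every retry failed:

result = scrape_with_retry('https://example.com/article')
if result:
    print(result['data']['content'][:200])
else:
    print("Scrape failed after all retries")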
Batch Processing
Process multiple URLs efficiently:
const axios = require('axios');

async function batchScrape(urls, batchSize = 5) {
  const results = [];

  // Process URLs in batches
  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls.slice(i, i + batchSize);

    const promises = batch.map(url =>
      axios.post('https://www.acticrawl.com/api/v1/scrape', {
        url: url,
        output_format: 'markdown',
        extract_main_content: true
      }, {
        headers: { 'Authorization': 'Bearer YOUR_API_KEY' }
      }).catch(err => ({ error: err.message, url }))
    );

    const batchResults = await Promise.all(promises);
    results.push(...batchResults);

    // Respect rate limits between batches
    if (i + batchSize < urls.length) {
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
  }

  return results;
}
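A comparable pattern in Python is a thread pool capped at the batch size. The sketch below (batch_scrape and scrape_one are illustrative names) pauses a second between batches to respect rate limits, mirroring the Node.js version:

import time
from concurrent.futures import ThreadPoolExecutor

import requests

def scrape_one(url):
    try:
        response = requests.post(
            'https://www.acticrawl.com/api/v1/scrape',
            headers={'Authorization': 'Bearer YOUR_API_KEY'},
            json={'url': url, 'output_format': 'markdown', 'extract_main_content': True},
            timeout=30
        )
        return response.json()
    except requests.RequestException as exc:
        return {'error': str(exc), 'url': url}

def batch_scrape(urls, batch_size=5):
    results = []
    with ThreadPoolExecutor(max_workers=batch_size) as pool:
        for i in range(0, len(urls), batch_size):
            batch = urls[i:i + batch_size]
            results.extend(pool.map(scrape_one, batch))
            # Brief pause between batches to respect rate limits
            if i + batch_size < len(urls):
                time.sleep(1)
    return results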
Next Steps
These examples demonstrate just a few of the many possibilities with the ActiCrawl API. For more advanced use cases:
- Explore our API Reference for complete endpoint documentation
- Check out our SDKs and Libraries for your programming language
- Join our Developer Community for support and discussions
- Contact our Enterprise Sales team for custom solutions
Happy scraping! 🚀