Rate Limiting
Rate limiting is crucial for responsible web scraping. ActiCrawl provides sophisticated rate limiting features to help you respect website resources while maximizing data collection efficiency.
Understanding Rate Limiting
Why Rate Limiting Matters
- Respect server resources: prevent overwhelming target websites
- Avoid IP blocks: maintain access to data sources
- Comply with robots.txt: follow website scraping policies
- Maintain data quality: ensure complete page loads
- Legal compliance: adhere to terms of service
Types of Rate Limits
# Request-based limiting
rate_limit :requests_per_second, 10
# Time-based limiting
rate_limit :delay_between_requests, 1.second
# Concurrent request limiting
rate_limit :max_concurrent_requests, 5
# Domain-specific limiting
rate_limit :per_domain, {
"example.com" => 5.requests_per_second,
"api.example.com" => 100.requests_per_minute
}
Configuration
Global Rate Limiting
Configure global rate limits in your ActiCrawl settings:
ActiCrawl.configure do |config|
# Basic rate limiting
config.rate_limit = {
requests_per_second: 10,
burst_size: 20,
cooldown_period: 5.seconds
}
# Advanced configuration
config.rate_limiter = ActiCrawl::RateLimiter.new(
strategy: :token_bucket,
capacity: 100,
refill_rate: 10,
refill_interval: 1.second
)
end
Per-Domain Configuration
Set specific limits for different domains:
class DomainRateLimiter
DOMAIN_LIMITS = {
"api.github.com" => {
authenticated: 5000.per_hour,
unauthenticated: 60.per_hour
},
"twitter.com" => {
tweets: 900.per_15_minutes,
users: 900.per_15_minutes
},
"default" => {
requests: 10.per_second
}
}
def limit_for(domain, auth_status = :unauthenticated)
limits = DOMAIN_LIMITS[domain] || DOMAIN_LIMITS["default"]
limits[auth_status] || limits[:requests]
end
end
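A minimal usage sketch for the lookup above; the domains mirror the DOMAIN_LIMITS keys and the return values are whatever those entries hold:

limiter = DomainRateLimiter.new

limiter.limit_for("api.github.com", :authenticated) # => 5000.per_hour
limiter.limit_for("api.github.com")                 # => 60.per_hour (unauthenticated)
limiter.limit_for("blog.example.com")               # => 10.per_second (falls back to "default")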
Dynamic Rate Limiting
Adjust rate limits based on server responses:
require 'time' # needed for Time.parse on HTTP-date Retry-After values

class AdaptiveRateLimiter
def initialize
@current_delay = 0.1 # 100ms initial delay
@min_delay = 0.1
@max_delay = 5.0
end
def execute_with_limit
sleep(@current_delay)
response = yield
adjust_delay(response)
response
end
private
def adjust_delay(response)
case response.code
when 429 # Too Many Requests
# Exponential backoff
@current_delay = [@current_delay * 2, @max_delay].min
wait_time = parse_retry_after(response)
sleep(wait_time) if wait_time
when 200
# Gradually decrease delay on success
@current_delay = [@current_delay * 0.9, @min_delay].max
when 503 # Service Unavailable
@current_delay = [@current_delay * 1.5, @max_delay].min
end
end
def parse_retry_after(response)
retry_after = response.headers['Retry-After']
return nil unless retry_after
# Handle both seconds and HTTP date formats
if retry_after =~ /^\d+$/
retry_after.to_i
else
[Time.parse(retry_after) - Time.now, 0].max # clamp in case the date is already in the past
end
end
end
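A usage sketch, where `urls` and `crawler.fetch` are placeholders for your own URL list and HTTP client; the block's return value must respond to #code (and #headers when Retry-After is present):

limiter = AdaptiveRateLimiter.new

responses = urls.map do |url|
  # The delay adapts automatically to 200, 429 and 503 responses
  limiter.execute_with_limit { crawler.fetch(url) }
end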
Rate Limiting Strategies
1. Token Bucket Algorithm
Allows burst traffic while maintaining average rate:
class TokenBucket
attr_reader :capacity, :tokens, :refill_rate
def initialize(capacity:, refill_rate:, refill_interval: 1.second)
@capacity = capacity
@tokens = capacity
@refill_rate = refill_rate
@refill_interval = refill_interval
@last_refill = Time.now
@mutex = Mutex.new
end
def consume(tokens = 1)
@mutex.synchronize do
refill!
if @tokens >= tokens
@tokens -= tokens
true
else
false
end
end
end
def wait_for_tokens(tokens = 1)
loop do
return if consume(tokens)
sleep(0.1)
end
end
private
def refill!
now = Time.now
elapsed = now - @last_refill
tokens_to_add = (elapsed / @refill_interval) * @refill_rate
@tokens = [@tokens + tokens_to_add, @capacity].min
@last_refill = now
end
end
# Usage
bucket = TokenBucket.new(capacity: 100, refill_rate: 10)
crawler.before_request do
bucket.wait_for_tokens(1)
end
2. Sliding Window
Track requests in a rolling time window:
class SlidingWindowLimiter
def initialize(window_size:, max_requests:)
@window_size = window_size
@max_requests = max_requests
@requests = []
@mutex = Mutex.new
end
def allow_request?
@mutex.synchronize do
now = Time.now
# Remove old requests outside the window
@requests.reject! { |time| now - time > @window_size }
if @requests.size < @max_requests
@requests << now
true
else
false
end
end
end
def time_until_next_request
return 0 if @requests.empty?
oldest_request = @requests.min
wait_time = @window_size - (Time.now - oldest_request)
[wait_time, 0].max
end
end
# Usage
limiter = SlidingWindowLimiter.new(
window_size: 60.seconds,
max_requests: 100
)
loop do
  if limiter.allow_request?
    crawler.fetch(url)
    break
  else
    sleep(limiter.time_until_next_request)
  end
end
3. Leaky Bucket
Process requests at a constant rate:
class LeakyBucket
def initialize(capacity:, leak_rate:)
@capacity = capacity
@leak_rate = leak_rate
@queue = Queue.new
@mutex = Mutex.new
start_leak_thread
end
def add_request(request)
@mutex.synchronize do
if @queue.size < @capacity
@queue.push(request)
true
else
false # Bucket overflow
end
end
end
private
def start_leak_thread
Thread.new do
loop do
sleep(1.0 / @leak_rate)
request = @queue.pop(true) rescue nil
request&.call if request
end
end
end
end
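A brief sketch of feeding the bucket, where each request is passed as a callable; `crawler.fetch(url)` stands in for your own fetch logic:

bucket = LeakyBucket.new(capacity: 50, leak_rate: 5) # drains ~5 requests per second

accepted = bucket.add_request(-> { crawler.fetch(url) })
# add_request returns false when the bucket is full; re-queue or drop the URL as appropriate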
4. Distributed Rate Limiting
For multi-instance deployments:
class RedisRateLimiter
def initialize(redis:, key_prefix: "rate_limit")
@redis = redis
@key_prefix = key_prefix
end
def allow_request?(identifier, limit:, window:)
key = "#{@key_prefix}:#{identifier}:#{window_key(window)}"
@redis.multi do |multi|
multi.incr(key)
multi.expire(key, window)
end.first <= limit
end
def rate_limit_status(identifier, limit:, window:)
key = "#{@key_prefix}:#{identifier}:#{window_key(window)}"
current = @redis.get(key).to_i
{
limit: limit,
remaining: [limit - current, 0].max,
reset_at: Time.now + @redis.ttl(key)
}
end
private
def window_key(window)
(Time.now.to_i / window) * window
end
end
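A sketch of guarding fetches with a shared 100-requests-per-60-seconds budget; it assumes the redis gem is available and `crawler.fetch` is your own client:

require 'redis'

limiter = RedisRateLimiter.new(redis: Redis.new)

if limiter.allow_request?("example.com", limit: 100, window: 60)
  crawler.fetch(url)
else
  status = limiter.rate_limit_status("example.com", limit: 100, window: 60)
  sleep([status[:reset_at] - Time.now, 0].max)
end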
Handling Rate Limit Responses
Detecting Rate Limits
class RateLimitDetector
RATE_LIMIT_INDICATORS = {
status_codes: [429, 503],
headers: ['X-RateLimit-Remaining', 'Retry-After'],
body_patterns: [
/rate limit exceeded/i,
/too many requests/i,
/throttled/i
]
}
def rate_limited?(response)
# Check status code
return true if RATE_LIMIT_INDICATORS[:status_codes].include?(response.code)
# Check headers
return true if response.headers.keys.any? { |h|
h.match?(/rate.?limit/i) && response.headers[h].to_i == 0
}
# Check response body
return true if RATE_LIMIT_INDICATORS[:body_patterns].any? { |pattern|
response.body.match?(pattern)
}
false
end
def extract_retry_info(response)
{
retry_after: response.headers['Retry-After'],
limit: response.headers['X-RateLimit-Limit'],
remaining: response.headers['X-RateLimit-Remaining'],
reset: response.headers['X-RateLimit-Reset']
}
end
end
Retry Strategies
class RateLimitRetryHandler
  # Assumes #extract_retry_info, #parse_retry_after and #logger are available
  # in the including context (see RateLimitDetector and AdaptiveRateLimiter above)
  def handle_rate_limit(response)
retry_info = extract_retry_info(response)
if retry_info[:retry_after]
wait_time = parse_retry_after(retry_info[:retry_after])
logger.info "Rate limited. Waiting #{wait_time}s"
sleep(wait_time)
elsif retry_info[:reset]
wait_until_reset(retry_info[:reset])
else
exponential_backoff
end
end
private
def wait_until_reset(reset_time)
reset_at = Time.at(reset_time.to_i)
wait_time = [reset_at - Time.now, 0].max
logger.info "Rate limit resets at #{reset_at}. Waiting #{wait_time}s"
sleep(wait_time + 1) # Add 1 second buffer
end
def exponential_backoff
@retry_count ||= 0
@retry_count += 1
wait_time = [2 ** @retry_count, 300].min # Max 5 minutes
logger.info "Exponential backoff: waiting #{wait_time}s"
sleep(wait_time)
end
end
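Combining the detector and retry handler in a fetch loop might look like the sketch below; `crawler.fetch` and the three-attempt cap are illustrative choices:

detector = RateLimitDetector.new
handler  = RateLimitRetryHandler.new

response = nil
3.times do
  response = crawler.fetch(url)
  break unless detector.rate_limited?(response)
  handler.handle_rate_limit(response) # waits, then the loop tries again
end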
Monitoring and Analytics
Rate Limit Metrics
class RateLimitMetrics
def initialize
@metrics = Hash.new { |h, k| h[k] = { requests: 0, limited: 0 } }
end
def record_request(domain, limited: false)
@metrics[domain][:requests] += 1
@metrics[domain][:limited] += 1 if limited
end
def rate_limit_ratio(domain)
stats = @metrics[domain]
return 0 if stats[:requests].zero?
(stats[:limited].to_f / stats[:requests] * 100).round(2)
end
def report
@metrics.map do |domain, stats|
{
domain: domain,
total_requests: stats[:requests],
rate_limited: stats[:limited],
limit_ratio: rate_limit_ratio(domain)
}
end
end
end
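For example, recording two requests for one domain, one of which was throttled:

metrics = RateLimitMetrics.new

metrics.record_request("example.com")
metrics.record_request("example.com", limited: true)

metrics.rate_limit_ratio("example.com")
# => 50.0
metrics.report
# => [{ domain: "example.com", total_requests: 2, rate_limited: 1, limit_ratio: 50.0 }]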
Performance Impact Analysis
class RateLimitPerformanceAnalyzer
  # Expects arrays of per-request samples (hashes with at least a :timestamp key);
  # only the throughput helpers are shown below
  def analyze_impact(with_limits:, without_limits:)
{
throughput: {
with_limits: calculate_throughput(with_limits),
without_limits: calculate_throughput(without_limits),
impact: throughput_impact(with_limits, without_limits)
},
response_time: {
with_limits: average_response_time(with_limits),
without_limits: average_response_time(without_limits),
overhead: response_time_overhead(with_limits, without_limits)
},
success_rate: {
with_limits: success_rate(with_limits),
without_limits: success_rate(without_limits)
}
}
end
private
def calculate_throughput(data)
total_time = data.last[:timestamp] - data.first[:timestamp]
data.size / total_time.to_f
end
def throughput_impact(with_limits, without_limits)
with_throughput = calculate_throughput(with_limits)
without_throughput = calculate_throughput(without_limits)
((without_throughput - with_throughput) / without_throughput * 100).round(2)
end
end
Best Practices
1. Respect robots.txt
class RobotsTxtCompliance
def initialize
@robots_cache = {}
end
def can_fetch?(url)
uri = URI.parse(url)
robots = fetch_robots_txt(uri)
robots.allowed?(url, "ActiCrawl")
end
def crawl_delay(url)
uri = URI.parse(url)
robots = fetch_robots_txt(uri)
robots.crawl_delay("ActiCrawl") || 0
end
private
def fetch_robots_txt(uri)
robots_url = "#{uri.scheme}://#{uri.host}/robots.txt"
@robots_cache[robots_url] ||= begin
response = Net::HTTP.get_response(URI.parse(robots_url))
Robotstxt.parse(response.body, robots_url)
rescue
Robotstxt.parse("", robots_url) # Empty robots.txt on error
end
end
end
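A usage sketch, assuming the robots.txt parsing library the class relies on is installed and `crawler.fetch` is your own client:

compliance = RobotsTxtCompliance.new
url = "https://example.com/products"

if compliance.can_fetch?(url)
  sleep(compliance.crawl_delay(url)) # honour any Crawl-delay directive
  crawler.fetch(url)
end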
2. Implement Request Queuing
class RequestQueue
def initialize(rate_limiter:)
@queue = Queue.new
@rate_limiter = rate_limiter
@workers = []
start_workers
end
def enqueue(request)
@queue.push(request)
end
def shutdown
@workers.each { |w| w[:stop] = true }
@workers.each { |w| w[:thread].join }
end
private
def start_workers
5.times do
worker = { stop: false }
worker[:thread] = Thread.new do
until worker[:stop]
request = @queue.pop(true) rescue nil
if request
@rate_limiter.wait_for_tokens # matches the TokenBucket API defined above
process_request(request)
else
sleep(0.1)
end
end
end
@workers << worker
end
end
def process_request(request)
request.call
rescue => e
logger.error "Request failed: #{e.message}"
end
end
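A sketch of pairing the queue with the TokenBucket from earlier; each enqueued item is a callable and `crawler.fetch` is a placeholder:

queue = RequestQueue.new(rate_limiter: TokenBucket.new(capacity: 100, refill_rate: 10))

urls.each do |url|
  queue.enqueue(-> { crawler.fetch(url) })
end

queue.shutdown # stop the workers and wait for them to exit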
3. Use Connection Pooling
require 'timeout'

class ConnectionPool
def initialize(size: 10, timeout: 5)
@pool = Queue.new
@size = size
@timeout = timeout
size.times { @pool.push(create_connection) }
end
def with_connection
connection = checkout
yield connection
ensure
checkin(connection)
end
private
def checkout
  # Block for up to @timeout seconds waiting for a free connection
  Timeout.timeout(@timeout) { @pool.pop }
rescue Timeout::Error
  raise "Connection pool exhausted"
end
def checkin(connection)
@pool.push(connection)
end
def create_connection
# Create your HTTP client connection
HTTP.persistent("https://example.com")
end
end
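A usage sketch; the host baked into create_connection and the request path below are only examples:

pool = ConnectionPool.new(size: 5)

pool.with_connection do |http|
  # http is the persistent client created by create_connection
  http.get("/products")
end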
4. Monitor and Alert
class RateLimitMonitor
  # Assumes #rate_limit_ratio, #queue_size, #avg_wait_time and #send_alerts
  # are implemented against your metrics store (see RateLimitMetrics above)
  ALERT_THRESHOLDS = {
rate_limit_ratio: 20, # 20% of requests rate limited
queue_size: 1000,
avg_wait_time: 10 # seconds
}
def check_health
alerts = []
if rate_limit_ratio > ALERT_THRESHOLDS[:rate_limit_ratio]
alerts << "High rate limit ratio: #{rate_limit_ratio}%"
end
if queue_size > ALERT_THRESHOLDS[:queue_size]
alerts << "Large request queue: #{queue_size} requests"
end
if avg_wait_time > ALERT_THRESHOLDS[:avg_wait_time]
alerts << "High average wait time: #{avg_wait_time}s"
end
send_alerts(alerts) if alerts.any?
end
end
Testing Rate Limits
require 'test_helper'
class RateLimiterTest < ActiveSupport::TestCase
def setup
@limiter = TokenBucket.new(capacity: 10, refill_rate: 5)
end
test "respects token capacity" do
10.times { assert @limiter.consume }
assert_not @limiter.consume
end
test "refills tokens over time" do
10.times { @limiter.consume }
sleep(1.1)
assert_equal 5, @limiter.tokens
end
test "handles burst traffic" do
requests = []
20.times do |i|
if @limiter.consume
requests << { time: Time.now, index: i }
end
end
assert_equal 10, requests.size
end
end
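A similar check for the sliding-window limiter could sit alongside it; this test is illustrative rather than part of the shipped suite:

class SlidingWindowLimiterTest < ActiveSupport::TestCase
  test "blocks requests beyond the window budget" do
    limiter = SlidingWindowLimiter.new(window_size: 60, max_requests: 3)

    3.times { assert limiter.allow_request? }
    assert_not limiter.allow_request?
    assert limiter.time_until_next_request > 0
  end
end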
Summary
Effective rate limiting is essential for sustainable web scraping:
- Choose appropriate strategies based on your use case
- Respect server resources and robots.txt rules
- Implement adaptive rate limiting that responds to server feedback
- Monitor rate limit metrics to optimize performance
- Use distributed rate limiting for scaled deployments
- Handle rate limit responses gracefully
- Test your rate limiting thoroughly
- Be a good citizen of the web
Remember: The goal is to collect data efficiently while being respectful to the websites you're scraping.