Rate Limiting

Rate limiting is crucial for responsible web scraping. ActiCrawl provides sophisticated rate limiting features to help you respect website resources while maximizing data collection efficiency.

Understanding Rate Limiting

Why Rate Limiting Matters

  1. Respect server resources - Prevent overwhelming target websites
  2. Avoid IP blocks - Maintain access to data sources
  3. Comply with robots.txt - Follow website scraping policies
  4. Maintain data quality - Ensure complete page loads
  5. Legal compliance - Adhere to terms of service

Types of Rate Limits

ruby
# Request-based limiting
rate_limit :requests_per_second, 10

# Time-based limiting  
rate_limit :delay_between_requests, 1.second

# Concurrent request limiting
rate_limit :max_concurrent_requests, 5

# Domain-specific limiting
rate_limit :per_domain, {
  "example.com" => 5.requests_per_second,
  "api.example.com" => 100.requests_per_minute
}

Configuration

Global Rate Limiting

Configure global rate limits in your ActiCrawl settings:

ruby
ActiCrawl.configure do |config|
  # Basic rate limiting
  config.rate_limit = {
    requests_per_second: 10,
    burst_size: 20,
    cooldown_period: 5.seconds
  }

  # Advanced configuration
  config.rate_limiter = ActiCrawl::RateLimiter.new(
    strategy: :token_bucket,
    capacity: 100,
    refill_rate: 10,
    refill_interval: 1.second
  )
end

Per-Domain Configuration

Set specific limits for different domains:

ruby
class DomainRateLimiter
  DOMAIN_LIMITS = {
    "api.github.com" => {
      authenticated: 5000.per_hour,
      unauthenticated: 60.per_hour
    },
    "twitter.com" => {
      tweets: 900.per_15_minutes,
      users: 900.per_15_minutes
    },
    "default" => {
      requests: 10.per_second
    }
  }

  def limit_for(domain, auth_status = :unauthenticated)
    limits = DOMAIN_LIMITS[domain] || DOMAIN_LIMITS["default"]
    limits[auth_status] || limits[:requests]
  end
end
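
A quick illustration of the lookup against the table above (the return values shown are the illustrative limits from `DOMAIN_LIMITS`, not real quota objects):

ruby
limiter = DomainRateLimiter.new

limiter.limit_for("api.github.com", :authenticated) # => 5000.per_hour
limiter.limit_for("unknown-site.com")               # => 10.per_second (default)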

Dynamic Rate Limiting

Adjust rate limits based on server responses:

ruby
class AdaptiveRateLimiter
  def initialize
    @current_delay = 0.1 # 100ms initial delay
    @min_delay = 0.1
    @max_delay = 5.0
  end

  def execute_with_limit
    sleep(@current_delay)

    response = yield
    adjust_delay(response)

    response
  end

  private

  def adjust_delay(response)
    case response.code
    when 429 # Too Many Requests
      # Exponential backoff
      @current_delay = [@current_delay * 2, @max_delay].min
      wait_time = parse_retry_after(response)
      sleep(wait_time) if wait_time
    when 200
      # Gradually decrease delay on success
      @current_delay = [@current_delay * 0.9, @min_delay].max
    when 503 # Service Unavailable
      @current_delay = [@current_delay * 1.5, @max_delay].min
    end
  end

  def parse_retry_after(response)
    retry_after = response.headers['Retry-After']
    return nil unless retry_after

    # Handle both delta-seconds and HTTP-date formats; never return a negative wait
    if retry_after =~ /^\d+$/
      retry_after.to_i
    else
      [Time.parse(retry_after) - Time.now, 0].max
    end
  end
end
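
A minimal usage sketch, assuming `urls` is your list of targets and `crawler.fetch(url)` stands in for whatever request call you use:

ruby
limiter = AdaptiveRateLimiter.new

urls.each do |url|
  # The block's return value (the response) is passed back to the caller
  response = limiter.execute_with_limit { crawler.fetch(url) }
end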

Rate Limiting Strategies

1. Token Bucket Algorithm

Allows burst traffic while maintaining average rate:

ruby
class TokenBucket
  attr_reader :capacity, :tokens, :refill_rate

  def initialize(capacity:, refill_rate:, refill_interval: 1.second)
    @capacity = capacity
    @tokens = capacity
    @refill_rate = refill_rate
    @refill_interval = refill_interval
    @last_refill = Time.now
    @mutex = Mutex.new
  end

  def consume(tokens = 1)
    @mutex.synchronize do
      refill!

      if @tokens >= tokens
        @tokens -= tokens
        true
      else
        false
      end
    end
  end

  def wait_for_tokens(tokens = 1)
    loop do
      return if consume(tokens)
      sleep(0.1)
    end
  end

  private

  def refill!
    now = Time.now
    elapsed = now - @last_refill

    tokens_to_add = (elapsed / @refill_interval) * @refill_rate
    @tokens = [@tokens + tokens_to_add, @capacity].min
    @last_refill = now
  end
end

# Usage
bucket = TokenBucket.new(capacity: 100, refill_rate: 10)

crawler.before_request do
  bucket.wait_for_tokens(1)
end

2. Sliding Window

Track requests in a rolling time window:

ruby
class SlidingWindowLimiter
  def initialize(window_size:, max_requests:)
    @window_size = window_size
    @max_requests = max_requests
    @requests = []
    @mutex = Mutex.new
  end

  def allow_request?
    @mutex.synchronize do
      now = Time.now
      # Remove old requests outside the window
      @requests.reject! { |time| now - time > @window_size }

      if @requests.size < @max_requests
        @requests << now
        true
      else
        false
      end
    end
  end

  def time_until_next_request
    return 0 if @requests.empty?

    oldest_request = @requests.min
    wait_time = @window_size - (Time.now - oldest_request)
    [wait_time, 0].max
  end
end

# Usage
limiter = SlidingWindowLimiter.new(
  window_size: 60.seconds,
  max_requests: 100
)

until limiter.allow_request?
  sleep(limiter.time_until_next_request)
end

crawler.fetch(url)

3. Leaky Bucket

Process requests at a constant rate:

ruby
class LeakyBucket
  def initialize(capacity:, leak_rate:)
    @capacity = capacity
    @leak_rate = leak_rate
    @queue = Queue.new
    @mutex = Mutex.new
    start_leak_thread
  end

  def add_request(request)
    @mutex.synchronize do
      if @queue.size < @capacity
        @queue.push(request)
        true
      else
        false # Bucket overflow
      end
    end
  end

  private

  def start_leak_thread
    Thread.new do
      loop do
        sleep(1.0 / @leak_rate)

        request = @queue.pop(true) rescue nil
        request&.call
      end
    end
  end
end
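
A usage sketch, again assuming `crawler.fetch` as the request call; requests that would overflow the bucket are dropped:

ruby
bucket = LeakyBucket.new(capacity: 50, leak_rate: 5) # drains 5 requests per second

urls.each do |url|
  accepted = bucket.add_request(-> { crawler.fetch(url) })
  puts "Bucket full, dropping #{url}" unless accepted
end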

4. Distributed Rate Limiting

For multi-instance deployments:

ruby
class RedisRateLimiter
  def initialize(redis:, key_prefix: "rate_limit")
    @redis = redis
    @key_prefix = key_prefix
  end

  def allow_request?(identifier, limit:, window:)
    key = "#{@key_prefix}:#{identifier}:#{window_key(window)}"

    @redis.multi do |multi|
      multi.incr(key)
      multi.expire(key, window)
    end.first <= limit
  end

  def rate_limit_status(identifier, limit:, window:)
    key = "#{@key_prefix}:#{identifier}:#{window_key(window)}"
    current = @redis.get(key).to_i

    {
      limit: limit,
      remaining: [limit - current, 0].max,
      reset_at: Time.now + @redis.ttl(key)
    }
  end

  private

  def window_key(window)
    (Time.now.to_i / window) * window
  end
end
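
A sketch of using it with the `redis` gem, keyed by the target domain:

ruby
require 'redis'

limiter = RedisRateLimiter.new(redis: Redis.new)

if limiter.allow_request?("example.com", limit: 100, window: 60)
  crawler.fetch(url)
else
  status = limiter.rate_limit_status("example.com", limit: 100, window: 60)
  sleep([status[:reset_at] - Time.now, 0].max)
end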

Handling Rate Limit Responses

Detecting Rate Limits

ruby
class RateLimitDetector
  RATE_LIMIT_INDICATORS = {
    status_codes: [429, 503],
    headers: ['X-RateLimit-Remaining', 'Retry-After'],
    body_patterns: [
      /rate limit exceeded/i,
      /too many requests/i,
      /throttled/i
    ]
  }

  def rate_limited?(response)
    # Check status code
    return true if RATE_LIMIT_INDICATORS[:status_codes].include?(response.code)

    # Check headers
    return true if response.headers.keys.any? { |h| 
      h.match?(/rate.?limit/i) && response.headers[h].to_i == 0
    }

    # Check response body
    return true if RATE_LIMIT_INDICATORS[:body_patterns].any? { |pattern|
      response.body.match?(pattern)
    }

    false
  end

  def extract_retry_info(response)
    {
      retry_after: response.headers['Retry-After'],
      limit: response.headers['X-RateLimit-Limit'],
      remaining: response.headers['X-RateLimit-Remaining'],
      reset: response.headers['X-RateLimit-Reset']
    }
  end
end

Retry Strategies

ruby
class RateLimitRetryHandler
  # Reuses the header extraction from RateLimitDetector defined above
  def initialize(detector: RateLimitDetector.new)
    @detector = detector
  end

  def handle_rate_limit(response)
    retry_info = @detector.extract_retry_info(response)

    if retry_info[:retry_after]
      wait_time = parse_retry_after(retry_info[:retry_after])
      logger.info "Rate limited. Waiting #{wait_time}s"
      sleep(wait_time)
    elsif retry_info[:reset]
      wait_until_reset(retry_info[:reset])
    else
      exponential_backoff
    end
  end

  private

  def parse_retry_after(value)
    # Retry-After is either delta-seconds or an HTTP date
    value =~ /^\d+$/ ? value.to_i : [Time.parse(value) - Time.now, 0].max
  end

  def wait_until_reset(reset_time)
    reset_at = Time.at(reset_time.to_i)
    wait_time = [reset_at - Time.now, 0].max

    logger.info "Rate limit resets at #{reset_at}. Waiting #{wait_time}s"
    sleep(wait_time + 1) # Add 1 second buffer
  end

  def exponential_backoff
    @retry_count ||= 0
    @retry_count += 1

    wait_time = [2 ** @retry_count, 300].min # Max 5 minutes
    logger.info "Exponential backoff: waiting #{wait_time}s"
    sleep(wait_time)
  end
end
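
A sketch of wiring the detector and retry handler together around a single request (`crawler.fetch` is a stand-in for your HTTP call):

ruby
detector = RateLimitDetector.new
handler  = RateLimitRetryHandler.new

response = crawler.fetch(url)

if detector.rate_limited?(response)
  handler.handle_rate_limit(response) # waits based on the response headers
  response = crawler.fetch(url)       # then retries the request
end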

Monitoring and Analytics

Rate Limit Metrics

ruby
class RateLimitMetrics
  def initialize
    @metrics = Hash.new { |h, k| h[k] = { requests: 0, limited: 0 } }
  end

  def record_request(domain, limited: false)
    @metrics[domain][:requests] += 1
    @metrics[domain][:limited] += 1 if limited
  end

  def rate_limit_ratio(domain)
    stats = @metrics[domain]
    return 0 if stats[:requests].zero?

    (stats[:limited].to_f / stats[:requests] * 100).round(2)
  end

  def report
    @metrics.map do |domain, stats|
      {
        domain: domain,
        total_requests: stats[:requests],
        rate_limited: stats[:limited],
        limit_ratio: rate_limit_ratio(domain)
      }
    end
  end
end
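
Recording and reporting is straightforward; here is a minimal example using the class above:

ruby
metrics = RateLimitMetrics.new

metrics.record_request("example.com")
metrics.record_request("example.com", limited: true)

metrics.report
# => [{ domain: "example.com", total_requests: 2, rate_limited: 1, limit_ratio: 50.0 }]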

Performance Impact Analysis

ruby
class RateLimitPerformanceAnalyzer
  def analyze_impact(with_limits:, without_limits:)
    {
      throughput: {
        with_limits: calculate_throughput(with_limits),
        without_limits: calculate_throughput(without_limits),
        impact: throughput_impact(with_limits, without_limits)
      },
      response_time: {
        with_limits: average_response_time(with_limits),
        without_limits: average_response_time(without_limits),
        overhead: response_time_overhead(with_limits, without_limits)
      },
      success_rate: {
        with_limits: success_rate(with_limits),
        without_limits: success_rate(without_limits)
      }
    }
  end

  private

  def calculate_throughput(data)
    total_time = data.last[:timestamp] - data.first[:timestamp]
    data.size / total_time.to_f
  end

  def throughput_impact(with_limits, without_limits)
    with_throughput = calculate_throughput(with_limits)
    without_throughput = calculate_throughput(without_limits)

    ((without_throughput - with_throughput) / without_throughput * 100).round(2)
  end
end

Best Practices

1. Respect robots.txt

ruby
require 'net/http'
require 'uri'

class RobotsTxtCompliance
  def initialize
    @robots_cache = {}
  end

  def can_fetch?(url)
    uri = URI.parse(url)
    robots = fetch_robots_txt(uri)

    robots.allowed?(url, "ActiCrawl")
  end

  def crawl_delay(url)
    uri = URI.parse(url)
    robots = fetch_robots_txt(uri)

    robots.crawl_delay("ActiCrawl") || 0
  end

  private

  def fetch_robots_txt(uri)
    robots_url = "#{uri.scheme}://#{uri.host}/robots.txt"

    @robots_cache[robots_url] ||= begin
      response = Net::HTTP.get_response(URI.parse(robots_url))
      Robotstxt.parse(response.body, robots_url)
    rescue
      Robotstxt.parse("", robots_url) # Empty robots.txt on error
    end
  end
end
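
A sketch of combining the compliance check with request pacing (`crawler.fetch` and `logger` are stand-ins for your own request call and logger):

ruby
compliance = RobotsTxtCompliance.new

if compliance.can_fetch?(url)
  sleep(compliance.crawl_delay(url)) # honor Crawl-delay when the site declares one
  crawler.fetch(url)
else
  logger.info "Skipping #{url}: disallowed by robots.txt"
end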

2. Implement Request Queuing

ruby
class RequestQueue
  def initialize(rate_limiter:)
    @queue = Queue.new
    @rate_limiter = rate_limiter
    @workers = []
    start_workers
  end

  def enqueue(request)
    @queue.push(request)
  end

  def shutdown
    @workers.each { |w| w[:stop] = true }
    @workers.each { |w| w[:thread].join }
  end

  private

  def start_workers
    5.times do
      worker = { stop: false }
      worker[:thread] = Thread.new do
        until worker[:stop]
          request = @queue.pop(true) rescue nil

          if request
            @rate_limiter.wait_for_tokens
            process_request(request)
          else
            sleep(0.1)
          end
        end
      end

      @workers << worker
    end
  end

  def process_request(request)
    request.call
  rescue => e
    logger.error "Request failed: #{e.message}"
  end
end
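
A sketch of feeding the queue from the token bucket defined earlier; each enqueued request is a callable:

ruby
bucket = TokenBucket.new(capacity: 100, refill_rate: 10)
queue  = RequestQueue.new(rate_limiter: bucket)

urls.each do |url|
  queue.enqueue(-> { crawler.fetch(url) })
end

# Once all work has been processed, stop the worker threads
queue.shutdown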

3. Use Connection Pooling

ruby
class ConnectionPool
  def initialize(size: 10, timeout: 5)
    @pool = Queue.new
    @size = size
    @timeout = timeout
    @created = 0
    @mutex = Mutex.new
  end

  def with_connection
    connection = checkout
    yield connection
  ensure
    checkin(connection) if connection
  end

  private

  def checkout
    @pool.pop(true)
  rescue ThreadError
    # Lazily create connections until the configured pool size is reached
    @mutex.synchronize do
      if @created < @size
        @created += 1
        return create_connection
      end
    end

    raise "Connection pool exhausted"
  end

  def checkin(connection)
    @pool.push(connection)
  end

  def create_connection
    # Create your HTTP client connection
    HTTP.persistent("https://example.com")
  end
end
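
Usage sketch, assuming the http.rb gem shown in `create_connection`:

ruby
pool = ConnectionPool.new(size: 5)

body = pool.with_connection do |http|
  # Read the body before returning so the persistent connection can be reused
  http.get("/products").to_s
end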

4. Monitor and Alert

ruby
class RateLimitMonitor
  ALERT_THRESHOLDS = {
    rate_limit_ratio: 20, # 20% of requests rate limited
    queue_size: 1000,
    avg_wait_time: 10 # seconds
  }

  # rate_limit_ratio, queue_size, avg_wait_time and send_alerts are assumed to be
  # provided by your metrics store and alerting integration
  def check_health
    alerts = []

    if rate_limit_ratio > ALERT_THRESHOLDS[:rate_limit_ratio]
      alerts << "High rate limit ratio: #{rate_limit_ratio}%"
    end

    if queue_size > ALERT_THRESHOLDS[:queue_size]
      alerts << "Large request queue: #{queue_size} requests"
    end

    if avg_wait_time > ALERT_THRESHOLDS[:avg_wait_time]
      alerts << "High average wait time: #{avg_wait_time}s"
    end

    send_alerts(alerts) if alerts.any?
  end
end

Testing Rate Limits

ruby
require 'test_helper'

class RateLimiterTest < ActiveSupport::TestCase
  def setup
    @limiter = TokenBucket.new(capacity: 10, refill_rate: 5)
  end

  test "respects token capacity" do
    10.times { assert @limiter.consume }
    assert_not @limiter.consume
  end

  test "refills tokens over time" do
    10.times { @limiter.consume }
    sleep(1.1)

    assert_equal 5, @limiter.tokens
  end

  test "handles burst traffic" do
    requests = []

    20.times do |i|
      if @limiter.consume
        requests << { time: Time.now, index: i }
      end
    end

    assert_equal 10, requests.size
  end
end

Summary

Effective rate limiting is essential for sustainable web scraping:

  1. Choose appropriate strategies based on your use case
  2. Respect server resources and robots.txt rules
  3. Implement adaptive rate limiting that responds to server feedback
  4. Monitor rate limit metrics to optimize performance
  5. Use distributed rate limiting for scaled deployments
  6. Handle rate limit responses gracefully
  7. Test your rate limiting thoroughly
  8. Be a good citizen of the web

Remember: The goal is to collect data efficiently while being respectful to the websites you're scraping.