Rate Limiting

Rate limiting is crucial for responsible web scraping. ActiCrawl provides sophisticated rate limiting features to help you respect website resources while maximizing data collection efficiency.

Understanding Rate Limiting

Why Rate Limiting Matters

  1. Respect server resources - Prevent overwhelming target websites
  2. Avoid IP blocks - Maintain access to data sources
  3. Comply with robots.txt - Follow website scraping policies
  4. Maintain data quality - Ensure complete page loads
  5. Legal compliance - Adhere to terms of service

Types of Rate Limits

ruby
# Request-based limiting
rate_limit :requests_per_second, 10

# Time-based limiting  
rate_limit :delay_between_requests, 1.second

# Concurrent request limiting
rate_limit :max_concurrent_requests, 5

# Domain-specific limiting
rate_limit :per_domain, {
  "example.com" => 5.requests_per_second,
  "api.example.com" => 100.requests_per_minute
}

Configuration

Global Rate Limiting

Configure global rate limits in your ActiCrawl settings:

ruby
ActiCrawl.configure do |config|
  # Basic rate limiting
  config.rate_limit = {
    requests_per_second: 10,
    burst_size: 20,
    cooldown_period: 5.seconds
  }

  # Advanced configuration
  config.rate_limiter = ActiCrawl::RateLimiter.new(
    strategy: :token_bucket,
    capacity: 100,
    refill_rate: 10,
    refill_interval: 1.second
  )
end

Per-Domain Configuration

Set specific limits for different domains:

ruby
class DomainRateLimiter
  DOMAIN_LIMITS = {
    "api.github.com" => {
      authenticated: 5000.per_hour,
      unauthenticated: 60.per_hour
    },
    "twitter.com" => {
      tweets: 900.per_15_minutes,
      users: 900.per_15_minutes
    },
    "default" => {
      requests: 10.per_second
    }
  }

  def limit_for(domain, auth_status = :unauthenticated)
    limits = DOMAIN_LIMITS[domain] || DOMAIN_LIMITS["default"]
    limits[auth_status] || limits[:requests]
  end
end
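
A quick illustration of the lookup against the table above (the return values shown are the illustrative limits from `DOMAIN_LIMITS`, not real quota objects):

ruby
limiter = DomainRateLimiter.new

limiter.limit_for("api.github.com", :authenticated) # => 5000.per_hour
limiter.limit_for("unknown-site.com")               # => 10.per_second (default)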

Dynamic Rate Limiting

Adjust rate limits based on server responses:

ruby
class AdaptiveRateLimiter
  def initialize
    @current_delay = 0.1 # 100ms initial delay
    @min_delay = 0.1
    @max_delay = 5.0
  end

  def execute_with_limit
    sleep(@current_delay)

    response = yield
    adjust_delay(response)

    response
  end

  private

  def adjust_delay(response)
    case response.code
    when 429 # Too Many Requests
      # Exponential backoff
      @current_delay = [@current_delay * 2, @max_delay].min
      wait_time = parse_retry_after(response)
      sleep(wait_time) if wait_time
    when 200
      # Gradually decrease delay on success
      @current_delay = [@current_delay * 0.9, @min_delay].max
    when 503 # Service Unavailable
      @current_delay = [@current_delay * 1.5, @max_delay].min
    end
  end

  def parse_retry_after(response)
    retry_after = response.headers['Retry-After']
    return nil unless retry_after

    # Handle both delta-seconds and HTTP-date formats; never return a negative wait
    if retry_after =~ /^\d+$/
      retry_after.to_i
    else
      [Time.parse(retry_after) - Time.now, 0].max
    end
  end
end
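
A minimal usage sketch, assuming `urls` is your list of targets and `crawler.fetch(url)` stands in for whatever request call you use:

ruby
limiter = AdaptiveRateLimiter.new

urls.each do |url|
  # The block's return value (the response) is passed back to the caller
  response = limiter.execute_with_limit { crawler.fetch(url) }
end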

Rate Limiting Strategies

1. Token Bucket Algorithm

Allows burst traffic while maintaining average rate:

ruby
class TokenBucket
  attr_reader :capacity, :tokens, :refill_rate

  def initialize(capacity:, refill_rate:, refill_interval: 1.second)
    @capacity = capacity
    @tokens = capacity
    @refill_rate = refill_rate
    @refill_interval = refill_interval
    @last_refill = Time.now
    @mutex = Mutex.new
  end

  def consume(tokens = 1)
    @mutex.synchronize do
      refill!

      if @tokens >= tokens
        @tokens -= tokens
        true
      else
        false
      end
    end
  end

  def wait_for_tokens(tokens = 1)
    loop do
      return if consume(tokens)
      sleep(0.1)
    end
  end

  private

  def refill!
    now = Time.now
    elapsed = now - @last_refill

    tokens_to_add = (elapsed / @refill_interval) * @refill_rate
    @tokens = [@tokens + tokens_to_add, @capacity].min
    @last_refill = now
  end
end

# Usage
bucket = TokenBucket.new(capacity: 100, refill_rate: 10)

crawler.before_request do
  bucket.wait_for_tokens(1)
end

2. Sliding Window

Track requests in a rolling time window:

ruby
class SlidingWindowLimiter
  def initialize(window_size:, max_requests:)
    @window_size = window_size
    @max_requests = max_requests
    @requests = []
    @mutex = Mutex.new
  end

  def allow_request?
    @mutex.synchronize do
      now = Time.now
      # Remove old requests outside the window
      @requests.reject! { |time| now - time > @window_size }

      if @requests.size < @max_requests
        @requests << now
        true
      else
        false
      end
    end
  end

  def time_until_next_request
    return 0 if @requests.empty?

    oldest_request = @requests.min
    wait_time = @window_size - (Time.now - oldest_request)
    [wait_time, 0].max
  end
end

# Usage
limiter = SlidingWindowLimiter.new(
  window_size: 60.seconds,
  max_requests: 100
)

until limiter.allow_request?
  sleep(limiter.time_until_next_request)
end

crawler.fetch(url)

3. Leaky Bucket

Process requests at a constant rate:

ruby
class LeakyBucket
  def initialize(capacity:, leak_rate:)
    @capacity = capacity
    @leak_rate = leak_rate
    @queue = Queue.new
    @mutex = Mutex.new
    start_leak_thread
  end

  def add_request(request)
    @mutex.synchronize do
      if @queue.size < @capacity
        @queue.push(request)
        true
      else
        false # Bucket overflow
      end
    end
  end

  private

  def start_leak_thread
    Thread.new do
      loop do
        sleep(1.0 / @leak_rate)

        request = @queue.pop(true) rescue nil
        request&.call
      end
    end
  end
end
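
A usage sketch, again assuming `crawler.fetch` as the request call; requests that would overflow the bucket are dropped:

ruby
bucket = LeakyBucket.new(capacity: 50, leak_rate: 5) # drains 5 requests per second

urls.each do |url|
  accepted = bucket.add_request(-> { crawler.fetch(url) })
  puts "Bucket full, dropping #{url}" unless accepted
end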

4. Distributed Rate Limiting

For multi-instance deployments:

ruby
class RedisRateLimiter
  def initialize(redis:, key_prefix: "rate_limit")
    @redis = redis
    @key_prefix = key_prefix
  end

  def allow_request?(identifier, limit:, window:)
    key = "#{@key_prefix}:#{identifier}:#{window_key(window)}"

    @redis.multi do |multi|
      multi.incr(key)
      multi.expire(key, window)
    end.first <= limit
  end

  def rate_limit_status(identifier, limit:, window:)
    key = "#{@key_prefix}:#{identifier}:#{window_key(window)}"
    current = @redis.get(key).to_i

    {
      limit: limit,
      remaining: [limit - current, 0].max,
      reset_at: Time.now + @redis.ttl(key)
    }
  end

  private

  def window_key(window)
    (Time.now.to_i / window) * window
  end
end
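
A sketch of using it with the `redis` gem, keyed by the target domain:

ruby
require 'redis'

limiter = RedisRateLimiter.new(redis: Redis.new)

if limiter.allow_request?("example.com", limit: 100, window: 60)
  crawler.fetch(url)
else
  status = limiter.rate_limit_status("example.com", limit: 100, window: 60)
  sleep([status[:reset_at] - Time.now, 0].max)
end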

Handling Rate Limit Responses

Detecting Rate Limits

ruby
class RateLimitDetector
  RATE_LIMIT_INDICATORS = {
    status_codes: [429, 503],
    headers: ['X-RateLimit-Remaining', 'Retry-After'],
    body_patterns: [
      /rate limit exceeded/i,
      /too many requests/i,
      /throttled/i
    ]
  }

  def rate_limited?(response)
    # Check status code
    return true if RATE_LIMIT_INDICATORS[:status_codes].include?(response.code)

    # Check headers
    return true if response.headers.keys.any? { |h| 
      h.match?(/rate.?limit/i) && response.headers[h].to_i == 0
    }

    # Check response body
    return true if RATE_LIMIT_INDICATORS[:body_patterns].any? { |pattern|
      response.body.match?(pattern)
    }

    false
  end

  def extract_retry_info(response)
    {
      retry_after: response.headers['Retry-After'],
      limit: response.headers['X-RateLimit-Limit'],
      remaining: response.headers['X-RateLimit-Remaining'],
      reset: response.headers['X-RateLimit-Reset']
    }
  end
end

Retry Strategies

ruby
class RateLimitRetryHandler
  # Reuses the header extraction from RateLimitDetector defined above
  def initialize(detector: RateLimitDetector.new)
    @detector = detector
  end

  def handle_rate_limit(response)
    retry_info = @detector.extract_retry_info(response)

    if retry_info[:retry_after]
      wait_time = parse_retry_after(retry_info[:retry_after])
      logger.info "Rate limited. Waiting #{wait_time}s"
      sleep(wait_time)
    elsif retry_info[:reset]
      wait_until_reset(retry_info[:reset])
    else
      exponential_backoff
    end
  end

  private

  def parse_retry_after(value)
    # Retry-After is either delta-seconds or an HTTP date
    value =~ /^\d+$/ ? value.to_i : [Time.parse(value) - Time.now, 0].max
  end

  def wait_until_reset(reset_time)
    reset_at = Time.at(reset_time.to_i)
    wait_time = [reset_at - Time.now, 0].max

    logger.info "Rate limit resets at #{reset_at}. Waiting #{wait_time}s"
    sleep(wait_time + 1) # Add 1 second buffer
  end

  def exponential_backoff
    @retry_count ||= 0
    @retry_count += 1

    wait_time = [2 ** @retry_count, 300].min # Max 5 minutes
    logger.info "Exponential backoff: waiting #{wait_time}s"
    sleep(wait_time)
  end
end
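
A sketch of wiring the detector and retry handler together around a single request (`crawler.fetch` is a stand-in for your HTTP call):

ruby
detector = RateLimitDetector.new
handler  = RateLimitRetryHandler.new

response = crawler.fetch(url)

if detector.rate_limited?(response)
  handler.handle_rate_limit(response) # waits based on the response headers
  response = crawler.fetch(url)       # then retries the request
end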

Monitoring and Analytics

Rate Limit Metrics

ruby
class RateLimitMetrics
  def initialize
    @metrics = Hash.new { |h, k| h[k] = { requests: 0, limited: 0 } }
  end

  def record_request(domain, limited: false)
    @metrics[domain][:requests] += 1
    @metrics[domain][:limited] += 1 if limited
  end

  def rate_limit_ratio(domain)
    stats = @metrics[domain]
    return 0 if stats[:requests].zero?

    (stats[:limited].to_f / stats[:requests] * 100).round(2)
  end

  def report
    @metrics.map do |domain, stats|
      {
        domain: domain,
        total_requests: stats[:requests],
        rate_limited: stats[:limited],
        limit_ratio: rate_limit_ratio(domain)
      }
    end
  end
end
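
Recording and reporting is straightforward; here is a minimal example using the class above:

ruby
metrics = RateLimitMetrics.new

metrics.record_request("example.com")
metrics.record_request("example.com", limited: true)

metrics.report
# => [{ domain: "example.com", total_requests: 2, rate_limited: 1, limit_ratio: 50.0 }]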

Performance Impact Analysis

ruby
class RateLimitPerformanceAnalyzer
  def analyze_impact(with_limits:, without_limits:)
    {
      throughput: {
        with_limits: calculate_throughput(with_limits),
        without_limits: calculate_throughput(without_limits),
        impact: throughput_impact(with_limits, without_limits)
      },
      response_time: {
        with_limits: average_response_time(with_limits),
        without_limits: average_response_time(without_limits),
        overhead: response_time_overhead(with_limits, without_limits)
      },
      success_rate: {
        with_limits: success_rate(with_limits),
        without_limits: success_rate(without_limits)
      }
    }
  end

  private

  def calculate_throughput(data)
    total_time = data.last[:timestamp] - data.first[:timestamp]
    data.size / total_time.to_f
  end

  def throughput_impact(with_limits, without_limits)
    with_throughput = calculate_throughput(with_limits)
    without_throughput = calculate_throughput(without_limits)

    ((without_throughput - with_throughput) / without_throughput * 100).round(2)
  end
end

Best Practices

1. Respect robots.txt

ruby
require 'net/http'
require 'uri'

class RobotsTxtCompliance
  def initialize
    @robots_cache = {}
  end

  def can_fetch?(url)
    uri = URI.parse(url)
    robots = fetch_robots_txt(uri)

    robots.allowed?(url, "ActiCrawl")
  end

  def crawl_delay(url)
    uri = URI.parse(url)
    robots = fetch_robots_txt(uri)

    robots.crawl_delay("ActiCrawl") || 0
  end

  private

  def fetch_robots_txt(uri)
    robots_url = "#{uri.scheme}://#{uri.host}/robots.txt"

    @robots_cache[robots_url] ||= begin
      response = Net::HTTP.get_response(URI.parse(robots_url))
      Robotstxt.parse(response.body, robots_url)
    rescue
      Robotstxt.parse("", robots_url) # Empty robots.txt on error
    end
  end
end
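
A sketch of combining the compliance check with request pacing (`crawler.fetch` and `logger` are stand-ins for your own request call and logger):

ruby
compliance = RobotsTxtCompliance.new

if compliance.can_fetch?(url)
  sleep(compliance.crawl_delay(url)) # honor Crawl-delay when the site declares one
  crawler.fetch(url)
else
  logger.info "Skipping #{url}: disallowed by robots.txt"
end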

2. Implement Request Queuing

ruby
class RequestQueue
  def initialize(rate_limiter:)
    @queue = Queue.new
    @rate_limiter = rate_limiter
    @workers = []
    start_workers
  end

  def enqueue(request)
    @queue.push(request)
  end

  def shutdown
    @workers.each { |w| w[:stop] = true }
    @workers.each { |w| w[:thread].join }
  end

  private

  def start_workers
    5.times do
      worker = { stop: false }
      worker[:thread] = Thread.new do
        until worker[:stop]
          request = @queue.pop(true) rescue nil

          if request
            @rate_limiter.wait_for_tokens
            process_request(request)
          else
            sleep(0.1)
          end
        end
      end

      @workers << worker
    end
  end

  def process_request(request)
    request.call
  rescue => e
    logger.error "Request failed: #{e.message}"
  end
end
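
A sketch of feeding the queue from the token bucket defined earlier; each enqueued request is a callable:

ruby
bucket = TokenBucket.new(capacity: 100, refill_rate: 10)
queue  = RequestQueue.new(rate_limiter: bucket)

urls.each do |url|
  queue.enqueue(-> { crawler.fetch(url) })
end

# Once all work has been processed, stop the worker threads
queue.shutdown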

3. Use Connection Pooling

ruby
class ConnectionPool
  def initialize(size: 10, timeout: 5)
    @pool = Queue.new
    @size = size
    @timeout = timeout
    @created = 0
    @mutex = Mutex.new
  end

  def with_connection
    connection = checkout
    yield connection
  ensure
    checkin(connection) if connection
  end

  private

  def checkout
    @pool.pop(true)
  rescue ThreadError
    # Lazily create connections until the configured pool size is reached
    @mutex.synchronize do
      if @created < @size
        @created += 1
        return create_connection
      end
    end

    raise "Connection pool exhausted"
  end

  def checkin(connection)
    @pool.push(connection)
  end

  def create_connection
    # Create your HTTP client connection
    HTTP.persistent("https://example.com")
  end
end
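
Usage sketch, assuming the http.rb gem shown in `create_connection`:

ruby
pool = ConnectionPool.new(size: 5)

body = pool.with_connection do |http|
  # Read the body before returning so the persistent connection can be reused
  http.get("/products").to_s
end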

4. Monitor and Alert

ruby
class RateLimitMonitor
  ALERT_THRESHOLDS = {
    rate_limit_ratio: 20, # 20% of requests rate limited
    queue_size: 1000,
    avg_wait_time: 10 # seconds
  }

  # rate_limit_ratio, queue_size, avg_wait_time and send_alerts are assumed to be
  # provided by your metrics store and alerting integration
  def check_health
    alerts = []

    if rate_limit_ratio > ALERT_THRESHOLDS[:rate_limit_ratio]
      alerts << "High rate limit ratio: #{rate_limit_ratio}%"
    end

    if queue_size > ALERT_THRESHOLDS[:queue_size]
      alerts << "Large request queue: #{queue_size} requests"
    end

    if avg_wait_time > ALERT_THRESHOLDS[:avg_wait_time]
      alerts << "High average wait time: #{avg_wait_time}s"
    end

    send_alerts(alerts) if alerts.any?
  end
end

Testing Rate Limits

ruby
require 'test_helper'

class RateLimiterTest < ActiveSupport::TestCase
  def setup
    @limiter = TokenBucket.new(capacity: 10, refill_rate: 5)
  end

  test "respects token capacity" do
    10.times { assert @limiter.consume }
    assert_not @limiter.consume
  end

  test "refills tokens over time" do
    10.times { @limiter.consume }
    sleep(1.1)

    assert_equal 5, @limiter.tokens
  end

  test "handles burst traffic" do
    requests = []

    20.times do |i|
      if @limiter.consume
        requests << { time: Time.now, index: i }
      end
    end

    assert_equal 10, requests.size
  end
end

Summary

Effective rate limiting is essential for sustainable web scraping:

  1. Choose appropriate strategies based on your use case
  2. Respect server resources and robots.txt rules
  3. Implement adaptive rate limiting that responds to server feedback
  4. Monitor rate limit metrics to optimize performance
  5. Use distributed rate limiting for scaled deployments
  6. Handle rate limit responses gracefully
  7. Test your rate limiting thoroughly
  8. Be a good citizen of the web

Remember: The goal is to collect data efficiently while being respectful to the websites you're scraping.