---
# Moltbook-analysis production configuration.
#
# Settings tuned for production crawling. This file inherits from the default
# configuration and overrides specific values.
# Load order: default.yaml -> production.yaml
#
# Values written as ${NAME:default} are environment placeholders with an
# inline fallback. They are left unquoted so the type seen by the consumer is
# unchanged.
# NOTE(review): presumably resolved by the config loader - confirm whether
# substitution happens before or after YAML parsing.

# Redis configuration - production cluster
redis:
  host: ${MOLTBOOK_ANALYSIS_REDIS_HOST:redis}
  port: ${MOLTBOOK_ANALYSIS_REDIS_PORT:6379}
  # Empty fallback: no password unless the environment provides one.
  password: ${MOLTBOOK_ANALYSIS_REDIS_PASSWORD:}
  db: 0
  max_connections: 50
  socket_timeout: 10.0
  key_prefix: "moltbook-analysis:prod:"

# Crawler engine - production settings
crawler:
  # Higher concurrency for production
  max_concurrent: 20
  requests_per_second: 2.0

  # Longer timeouts for reliability
  timeout: 60
  max_retries: 5
  retry_delay: 2.0

  # Production user agent
  user_agent: "Moltbook-analysis/1.0 (Research Crawler; Contact: research@example.com)"

  # Strict robots.txt compliance
  respect_robots_txt: true
  robots_cache_ttl: 3600  # 1 hour (more frequent updates)

  # Content settings
  max_content_size: 20971520  # 20 MB
  allowed_content_types:
    - "text/html"
    - "application/xhtml+xml"
    - "application/json"

  # Stricter error handling
  max_errors_per_domain: 5
  error_backoff_factor: 3.0

# URL frontier - production scale
frontier:
  bloom_filter_capacity: 100000000  # 100 million URLs
  bloom_filter_error_rate: 0.0001  # 0.01% false positive rate
  max_queue_size: 10000000
  default_priority: 5

  # Conservative crawl delays
  min_crawl_delay: 1.0
  max_crawl_delay: 120.0

  normalize_urls: true
  strip_fragments: true
  strip_tracking_params: true

# Storage - production settings
storage:
  base_path: /data

  parquet:
    batch_size: 5000
    compression: zstd
    compression_level: 5  # Higher compression
    row_group_size: 500000

  jsonl:
    buffer_size: 500
    compression: zstd
    max_file_size_mb: 500

  raw:
    enabled: true
    compression: zstd
    max_file_size_mb: 1000

  # NOTE(review): placed as a sibling of parquet/jsonl/raw - confirm this
  # nesting against default.yaml.
  rotation:
    enabled: true
    max_files_per_partition: 1000

# Content deduplication - tuned for large scale
dedup:
  simhash_threshold: 3
  simhash_bits: 64
  min_content_length: 200
  fingerprint_cache_size: 1000000

# Anonymization - strict for production
anonymization:
  default_strategy: hash
  hash_algorithm: sha256
  hash_prefix_length: 12  # Longer prefix for uniqueness

  pii_types:
    - EMAIL
    - PHONE
    - IP_ADDRESS
    - API_KEY
    - MENTION
    - USERNAME
    - NAME
    - LOCATION
    - CREDIT_CARD
    - SSN

  use_presidio: true
  presidio_score_threshold: 0.3  # Lower threshold = more sensitive

# Checkpoint - more frequent in production
checkpoint:
  enabled: true
  interval: 60  # 1 minute
  max_checkpoints: 50
  path: /data/checkpoints

# Logging - production format
logging:
  level: INFO
  format: json

  file:
    enabled: true
    path: /data/logs/moltbook-analysis.log
    max_size_mb: 500
    backup_count: 10

# Monitoring - enabled for production
monitoring:
  prometheus:
    enabled: true
    port: 9090

  health_check:
    enabled: true
    interval: 15
    timeout: 10

# Additional production-specific settings
# NOTE(review): the keys below are placed at top level - confirm against the
# schema the loader expects.

# Rate limiting per domain (overrides)
domain_rate_limits:
  moltbook.com: 0.5  # Be extra conservative with target site
  api.moltbook.com: 0.1

# Blocked domains (never crawl)
blocked_domains:
  - localhost
  - "127.0.0.1"
  - "*.local"

# Allowed domains (only crawl these if set)
# NOTE(review): api.moltbook.com has a rate limit above but is not listed
# here - confirm whether matching is exact or also covers subdomains.
allowed_domains:
  - moltbook.com
  - www.moltbook.com

# Seed URLs for production
seed_urls:
  - https://moltbook.com/m
  - https://moltbook.com/m/all
  - https://moltbook.com/m/popular

# Alerts configuration
alerts:
  enabled: true
  # Error rate threshold (percentage)
  error_rate_threshold: 10
  # Stall detection (seconds without progress)
  stall_threshold: 300
  # Notification webhook (optional); empty fallback when the variable is unset
  webhook_url: ${MOLTBOOK_ANALYSIS_ALERT_WEBHOOK:}