---
# Moltbook-analysis Default Configuration
# This is the base configuration for development and testing
#
# NOTE(review): this file had been flattened onto a single comment line, so no
# key was being parsed. The nesting below is reconstructed from the key names
# and section comments - confirm it against the config loader's schema.

# Redis configuration for URL frontier
redis:
  host: localhost
  port: 6379
  db: 0
  password: null

  # Connection pool settings
  max_connections: 10
  socket_timeout: 5.0

  # Key prefixes for namespacing
  key_prefix: "moltbook-analysis:"

# Crawler engine settings
crawler:
  # Concurrency settings
  max_concurrent: 5
  requests_per_second: 1.0

  # HTTP client settings
  timeout: 30
  max_retries: 3
  retry_delay: 1.0

  # User agent for requests
  user_agent: "Moltbook-analysis/1.0 (Research Crawler; +https://github.com/moltbook-analysis)"

  # robots.txt compliance
  respect_robots_txt: true
  robots_cache_ttl: 86400  # 24 hours

  # Content settings
  max_content_size: 10485760  # 10 MB
  allowed_content_types:
    - "text/html"
    - "application/xhtml+xml"

  # Error handling
  max_errors_per_domain: 10
  error_backoff_factor: 2.0

# URL frontier settings
frontier:
  # Bloom filter for URL deduplication
  bloom_filter_capacity: 10000000  # 10 million URLs
  bloom_filter_error_rate: 0.001  # 0.1% false positive rate

  # Priority queue settings
  max_queue_size: 1000000
  default_priority: 5

  # Domain scheduling
  min_crawl_delay: 1.0
  max_crawl_delay: 60.0

  # URL normalization
  normalize_urls: true
  strip_fragments: true
  strip_tracking_params: true
  tracking_params:
    - utm_source
    - utm_medium
    - utm_campaign
    - utm_content
    - utm_term
    - ref
    - source

# Storage settings
storage:
  base_path: ./data

  # Parquet writer settings
  parquet:
    batch_size: 1000
    compression: zstd
    compression_level: 3
    row_group_size: 100000

  # JSONL writer settings
  jsonl:
    buffer_size: 100
    compression: zstd
    max_file_size_mb: 100

  # Raw response storage
  raw:
    enabled: true
    compression: zstd
    max_file_size_mb: 500

  # File rotation
  rotation:
    enabled: true
    max_files_per_partition: 100

# Content deduplication
dedup:
  # SimHash settings
  simhash_threshold: 3  # Hamming distance threshold
  simhash_bits: 64

  # Content fingerprinting
  min_content_length: 100

  # Cache settings
  fingerprint_cache_size: 100000

# NOTE(review): the sections below had been flattened onto a single comment
# line, so no key was being parsed. The nesting is reconstructed from the key
# names and section comments - confirm it against the config loader's schema.

# Anonymization settings
anonymization:
  # Default redaction strategy
  default_strategy: hash

  # Hash settings
  hash_algorithm: sha256
  hash_prefix_length: 8

  # PII detection
  pii_types:
    - EMAIL
    - PHONE
    - IP_ADDRESS
    - API_KEY
    - MENTION
    - USERNAME

  # Presidio settings
  use_presidio: true
  presidio_score_threshold: 0.5

  # Custom patterns (regex)
  custom_patterns: {}

# Checkpoint settings
checkpoint:
  enabled: true
  interval: 300  # 5 minutes
  max_checkpoints: 10
  path: ./data/checkpoints

# Logging settings
logging:
  level: INFO
  format: json

  # File logging
  file:
    enabled: false
    path: ./logs/moltbook-analysis.log
    max_size_mb: 100
    backup_count: 5

# Monitoring settings
monitoring:
  # Prometheus metrics
  prometheus:
    enabled: false
    port: 9090

  # Health checks
  health_check:
    enabled: true
    interval: 30
    timeout: 5