# Moltbook-analysis Production Configuration
# Optimized settings for production crawling
#
# Inherit from default and override specific values
# Load order: default.yaml -> production.yaml

# Redis configuration - production cluster
# NOTE(review): the ${NAME:default} values look like environment-variable
# placeholders with a fallback after the colon - confirm against the loader.
# They are left unquoted on purpose so the substituted value keeps its type.
redis:
  host: ${MOLTBOOK_ANALYSIS_REDIS_HOST:redis}
  port: ${MOLTBOOK_ANALYSIS_REDIS_PORT:6379}
  # Empty fallback: no password unless the variable is set.
  password: ${MOLTBOOK_ANALYSIS_REDIS_PASSWORD:}
  db: 0
  max_connections: 50
  socket_timeout: 10.0
  key_prefix: "moltbook-analysis:prod:"

# Crawler engine - production settings
crawler:
  # Higher concurrency for production
  max_concurrent: 20
  requests_per_second: 2.0

  # Longer timeouts for reliability
  timeout: 60
  max_retries: 5
  retry_delay: 2.0

  # Production user agent
  user_agent: "Moltbook-analysis/1.0 (Research Crawler; Contact: research@example.com)"

  # Strict robots.txt compliance
  respect_robots_txt: true
  robots_cache_ttl: 3600  # 1 hour (more frequent updates)

  # Content settings
  max_content_size: 20971520  # 20 MB
  allowed_content_types:
    - "text/html"
    - "application/xhtml+xml"
    - "application/json"

  # Stricter error handling
  max_errors_per_domain: 5
  error_backoff_factor: 3.0

# URL frontier - production scale
frontier:
  bloom_filter_capacity: 100000000  # 100 million URLs
  bloom_filter_error_rate: 0.0001  # 0.01% false positive rate
  max_queue_size: 10000000

  default_priority: 5

  # Conservative crawl delays
  min_crawl_delay: 1.0
  max_crawl_delay: 120.0

  normalize_urls: true
  strip_fragments: true
  strip_tracking_params: true

# Storage - production settings
# Each output format has its own sub-mapping; keys such as `compression`,
# `enabled` and `max_file_size_mb` repeat per format and must stay nested
# so they do not collide.
storage:
  base_path: /data

  parquet:
    batch_size: 5000
    compression: zstd
    compression_level: 5  # Higher compression
    row_group_size: 500000

  jsonl:
    buffer_size: 500
    compression: zstd
    max_file_size_mb: 500

  raw:
    enabled: true
    compression: zstd
    max_file_size_mb: 1000

  rotation:
    enabled: true
    max_files_per_partition: 1000

# Content deduplication - tuned for large scale
dedup:
  simhash_threshold: 3
  simhash_bits: 64
  min_content_length: 200
  fingerprint_cache_size: 1000000

# Anonymization - strict for production
anonymization:
  default_strategy: hash
  hash_algorithm: sha256
  hash_prefix_length: 12  # Longer prefix for uniqueness

  # PII categories listed for anonymization
  pii_types:
    - EMAIL
    - PHONE
    - IP_ADDRESS
    - API_KEY
    - MENTION
    - USERNAME
    - NAME
    - LOCATION
    - CREDIT_CARD
    - SSN

  use_presidio: true
  presidio_score_threshold: 0.3  # Lower threshold = more sensitive

# Checkpoint - more frequent in production
checkpoint:
  enabled: true
  interval: 60  # 1 minute
  max_checkpoints: 50
  path: /data/checkpoints

# Logging - production format
logging:
  level: INFO
  format: json
  # File sink settings
  file:
    enabled: true
    path: /data/logs/moltbook-analysis.log
    max_size_mb: 500
    backup_count: 10

# Monitoring - enabled for production
monitoring:
  prometheus:
    enabled: true
    port: 9090

  health_check:
    enabled: true
    # NOTE(review): units for interval/timeout are presumably seconds - confirm.
    interval: 15
    timeout: 10

# Additional production-specific settings

# Rate limiting per domain (overrides)
# NOTE(review): values are presumably requests per second, matching
# crawler.requests_per_second - confirm against the consumer.
domain_rate_limits:
  moltbook.com: 0.5  # Be extra conservative with target site
  api.moltbook.com: 0.1

# Blocked domains (never crawl)
blocked_domains:
  - localhost
  # Quoted so the address is unambiguously a string, like the glob below.
  - "127.0.0.1"
  - "*.local"

# Allowed domains (only crawl these if set)
allowed_domains:
  - moltbook.com
  - www.moltbook.com

# Seed URLs for production
seed_urls:
  - https://moltbook.com/m
  - https://moltbook.com/m/all
  - https://moltbook.com/m/popular

# Alerts configuration
alerts:
  enabled: true
  # Error rate threshold (percentage)
  error_rate_threshold: 10
  # Stall detection (seconds without progress)
  stall_threshold: 300
  # Notification webhook (optional)
  # Empty fallback: no webhook unless the variable is set.
  webhook_url: ${MOLTBOOK_ANALYSIS_ALERT_WEBHOOK:}