# File: montana/Русский/Разведка/Moltbook/github/moltbook-analysis/config/production.yaml
# Viewer metadata: 175 lines, 3.7 KiB, YAML

# Moltbook-analysis Production Configuration
# Optimized settings for production crawling
# Inherits from default.yaml and overrides specific values
# Load order: default.yaml -> production.yaml
# Redis configuration - production cluster
# NOTE(review): the ${VAR:default} placeholders are presumably expanded by the
# config loader (environment variable, else the text after the colon) —
# confirm the exact syntax against the loader.
redis:
  host: ${MOLTBOOK_ANALYSIS_REDIS_HOST:redis}
  port: ${MOLTBOOK_ANALYSIS_REDIS_PORT:6379}
  # Nothing follows the colon, so the default is empty.
  password: ${MOLTBOOK_ANALYSIS_REDIS_PASSWORD:}
  db: 0
  max_connections: 50
  socket_timeout: 10.0
  # The prefix carries a "prod" segment, presumably to namespace production keys.
  key_prefix: "moltbook-analysis:prod:"
# Crawler engine - production settings
crawler:
  # Higher concurrency for production
  max_concurrent: 20
  requests_per_second: 2.0

  # Longer timeouts for reliability
  timeout: 60
  max_retries: 5
  retry_delay: 2.0

  # Production user agent
  user_agent: "Moltbook-analysis/1.0 (Research Crawler; Contact: research@example.com)"

  # Strict robots.txt compliance
  respect_robots_txt: true
  robots_cache_ttl: 3600  # 1 hour (more frequent updates)

  # Content settings
  max_content_size: 20971520  # 20 MB (20 * 1024 * 1024)
  allowed_content_types:
    - "text/html"
    - "application/xhtml+xml"
    - "application/json"

  # Stricter error handling
  max_errors_per_domain: 5
  error_backoff_factor: 3.0
# URL frontier - production scale
frontier:
  bloom_filter_capacity: 100000000  # 100 million URLs
  bloom_filter_error_rate: 0.0001  # 0.01% false positive rate
  max_queue_size: 10000000
  default_priority: 5

  # Conservative crawl delays
  min_crawl_delay: 1.0
  max_crawl_delay: 120.0

  # URL canonicalization switches
  normalize_urls: true
  strip_fragments: true
  strip_tracking_params: true
# Storage - production settings
# Each output format gets its own sub-mapping; without the nesting the
# repeated keys (compression, enabled, max_file_size_mb) would collide.
storage:
  base_path: /data

  parquet:
    batch_size: 5000
    compression: zstd
    compression_level: 5  # Higher compression
    row_group_size: 500000

  jsonl:
    buffer_size: 500
    compression: zstd
    max_file_size_mb: 500

  raw:
    enabled: true
    compression: zstd
    max_file_size_mb: 1000

  # NOTE(review): rotation is placed as a sibling of parquet/jsonl/raw; the
  # flattened source does not show whether it belonged under raw instead —
  # confirm against default.yaml.
  rotation:
    enabled: true
    max_files_per_partition: 1000
# Content deduplication - tuned for large scale
dedup:
  simhash_threshold: 3
  simhash_bits: 64
  min_content_length: 200
  fingerprint_cache_size: 1000000
# Anonymization - strict for production
anonymization:
  default_strategy: hash
  hash_algorithm: sha256
  hash_prefix_length: 12  # Longer prefix for uniqueness

  # PII categories listed for anonymization
  pii_types:
    - EMAIL
    - PHONE
    - IP_ADDRESS
    - API_KEY
    - MENTION
    - USERNAME
    - NAME
    - LOCATION
    - CREDIT_CARD
    - SSN

  use_presidio: true
  presidio_score_threshold: 0.3  # Lower threshold = more sensitive
# Checkpoint - more frequent in production
checkpoint:
  enabled: true
  interval: 60  # 1 minute
  max_checkpoints: 50
  path: /data/checkpoints
# Logging - production format
logging:
  level: INFO
  format: json

  # File output settings
  file:
    enabled: true
    path: /data/logs/moltbook-analysis.log
    max_size_mb: 500
    backup_count: 10
# Monitoring - enabled for production
monitoring:
  prometheus:
    enabled: true
    port: 9090

  # NOTE(review): health_check is nested under monitoring here; the flattened
  # source does not show its original depth — confirm against default.yaml.
  health_check:
    enabled: true
    interval: 15
    timeout: 10
# Additional production-specific settings
# Rate limiting per domain (overrides)
# NOTE(review): api.moltbook.com is not listed under allowed_domains; confirm
# whether this limit can ever apply.
domain_rate_limits:
  moltbook.com: 0.5  # Be extra conservative with target site
  api.moltbook.com: 0.1
# Domains that must never be crawled
blocked_domains:
  - "localhost"
  - "127.0.0.1"
  - "*.local"
# Allow-list: when set, only these domains are crawled
allowed_domains:
  - "moltbook.com"
  - "www.moltbook.com"
# Production seed URLs
seed_urls:
  - "https://moltbook.com/m"
  - "https://moltbook.com/m/all"
  - "https://moltbook.com/m/popular"
# Alerts configuration
alerts:
  enabled: true

  # Error rate threshold (percentage)
  error_rate_threshold: 10

  # Stall detection (seconds without progress)
  stall_threshold: 300

  # Notification webhook (optional); nothing follows the colon in the
  # placeholder, so the default is empty.
  webhook_url: ${MOLTBOOK_ANALYSIS_ALERT_WEBHOOK:}