# Moltbook-analysis Default Configuration
# This is the base configuration for development and testing

# Redis configuration for URL frontier
redis:
  host: localhost
  port: 6379
  db: 0
  # No password is configured in this base file.
  password: null
  # Connection pool settings
  max_connections: 10
  # NOTE(review): unit is not stated here; presumably seconds - confirm.
  socket_timeout: 5.0
  # Key prefixes for namespacing
  # Quoted because the value ends with a colon.
  key_prefix: "moltbook-analysis:"

# Crawler engine settings
crawler:
  # Concurrency settings
  max_concurrent: 5
  requests_per_second: 1.0

  # HTTP client settings
  # NOTE(review): units for timeout / retry_delay are not stated here;
  # presumably seconds - confirm.
  timeout: 30
  max_retries: 3
  retry_delay: 1.0

  # User agent for requests
  user_agent: "Moltbook-analysis/1.0 (Research Crawler; +https://github.com/moltbook-analysis)"

  # robots.txt compliance
  respect_robots_txt: true
  robots_cache_ttl: 86400  # 24 hours

  # Content settings
  max_content_size: 10485760  # 10 MB (10 * 1024 * 1024)
  allowed_content_types:
    - "text/html"
    - "application/xhtml+xml"

  # Error handling
  max_errors_per_domain: 10
  error_backoff_factor: 2.0

# URL frontier settings
frontier:
  # Bloom filter for URL deduplication
  bloom_filter_capacity: 10000000  # 10 million URLs
  bloom_filter_error_rate: 0.001  # 0.1% false positive rate

  # Priority queue settings
  max_queue_size: 1000000
  default_priority: 5

  # Domain scheduling
  # NOTE(review): delay units are not stated here; presumably seconds - confirm.
  min_crawl_delay: 1.0
  max_crawl_delay: 60.0

  # URL normalization
  normalize_urls: true
  strip_fragments: true
  strip_tracking_params: true
  # Query parameter names treated as tracking parameters.
  tracking_params:
    - utm_source
    - utm_medium
    - utm_campaign
    - utm_content
    - utm_term
    - ref
    - source

# Storage settings
# NOTE(review): parquet / jsonl / raw / rotation are placed under `storage`
# based on the section comments; verify against the config loader's schema.
storage:
  base_path: ./data

  # Parquet writer settings
  parquet:
    batch_size: 1000
    compression: zstd
    compression_level: 3
    row_group_size: 100000

  # JSONL writer settings
  jsonl:
    buffer_size: 100
    compression: zstd
    max_file_size_mb: 100

  # Raw response storage
  raw:
    enabled: true
    compression: zstd
    max_file_size_mb: 500

  # File rotation
  rotation:
    enabled: true
    max_files_per_partition: 100

# Content deduplication
dedup:
  # SimHash settings
  simhash_threshold: 3  # Hamming distance threshold
  simhash_bits: 64

  # Content fingerprinting
  # NOTE(review): unit (characters vs. bytes) is not stated here - confirm.
  min_content_length: 100

  # Cache settings
  fingerprint_cache_size: 100000

# Anonymization settings
anonymization:
  # Default redaction strategy
  default_strategy: hash

  # Hash settings
  hash_algorithm: sha256
  hash_prefix_length: 8

  # PII detection
  pii_types:
    - EMAIL
    - PHONE
    - IP_ADDRESS
    - API_KEY
    - MENTION
    - USERNAME

  # Presidio settings
  use_presidio: true
  presidio_score_threshold: 0.5

  # Custom patterns (regex)
  # Explicit empty mapping: no custom patterns are defined in this base file.
  custom_patterns: {}

# Checkpoint settings
checkpoint:
  enabled: true
  interval: 300  # 5 minutes
  max_checkpoints: 10
  path: ./data/checkpoints

# Logging settings
logging:
  level: INFO
  format: json
  # File logging
  file:
    enabled: false
    path: ./logs/moltbook-analysis.log
    max_size_mb: 100
    backup_count: 5

# Monitoring settings
monitoring:
  # Prometheus metrics
  prometheus:
    enabled: false
    port: 9090

  # Health checks
  # NOTE(review): interval / timeout units are not stated here;
  # presumably seconds - confirm.
  health_check:
    enabled: true
    interval: 30
    timeout: 5