montana/Русский/Разведка/Moltbook/github/moltbook-analysis/config/default.yaml

166 lines
3.1 KiB
YAML

# Moltbook-analysis Default Configuration
# This is the base configuration for development and testing
# Redis configuration for URL frontier
redis:
  host: localhost
  port: 6379
  db: 0
  # null means no password is set in this default profile.
  password: null

  # Connection pool settings
  max_connections: 10
  socket_timeout: 5.0  # presumably seconds - confirm against the client

  # Key prefixes for namespacing
  key_prefix: "moltbook-analysis:"

# Crawler engine settings
crawler:
  # Concurrency settings
  max_concurrent: 5
  requests_per_second: 1.0

  # HTTP client settings
  timeout: 30
  max_retries: 3
  retry_delay: 1.0

  # User agent for requests
  user_agent: "Moltbook-analysis/1.0 (Research Crawler; +https://github.com/moltbook-analysis)"

  # robots.txt compliance
  respect_robots_txt: true
  robots_cache_ttl: 86400  # 24 hours, expressed in seconds

  # Content settings
  max_content_size: 10485760  # 10 MB (10 * 1024 * 1024 bytes)
  allowed_content_types:
    - "text/html"
    - "application/xhtml+xml"

  # Error handling
  max_errors_per_domain: 10
  error_backoff_factor: 2.0

# URL frontier settings
frontier:
  # Bloom filter for URL deduplication
  bloom_filter_capacity: 10000000  # 10 million URLs
  bloom_filter_error_rate: 0.001  # 0.1% false positive rate

  # Priority queue settings
  max_queue_size: 1000000
  default_priority: 5

  # Domain scheduling
  min_crawl_delay: 1.0
  max_crawl_delay: 60.0

  # URL normalization
  normalize_urls: true
  strip_fragments: true
  strip_tracking_params: true
  # Query parameter names listed for tracking-parameter stripping.
  tracking_params:
    - utm_source
    - utm_medium
    - utm_campaign
    - utm_content
    - utm_term
    - ref
    - source

# Storage settings
storage:
  base_path: ./data

  # Parquet writer settings
  parquet:
    batch_size: 1000
    compression: zstd
    compression_level: 3
    row_group_size: 100000

  # JSONL writer settings
  jsonl:
    buffer_size: 100
    compression: zstd
    max_file_size_mb: 100

  # Raw response storage
  raw:
    enabled: true
    compression: zstd
    max_file_size_mb: 500

  # File rotation
  # NOTE(review): placement under `storage` is inferred from context -
  # confirm against the configuration loader's schema.
  rotation:
    enabled: true
    max_files_per_partition: 100

# Content deduplication
dedup:
  # SimHash settings
  simhash_threshold: 3  # Hamming distance threshold
  simhash_bits: 64

  # Content fingerprinting
  min_content_length: 100

  # Cache settings
  fingerprint_cache_size: 100000

# Anonymization settings
anonymization:
  # Default redaction strategy
  default_strategy: hash

  # Hash settings
  hash_algorithm: sha256
  hash_prefix_length: 8

  # PII detection
  pii_types:
    - EMAIL
    - PHONE
    - IP_ADDRESS
    - API_KEY
    - MENTION
    - USERNAME

  # Presidio settings
  use_presidio: true
  presidio_score_threshold: 0.5

  # Custom patterns (regex); an explicit empty mapping means none are defined.
  custom_patterns: {}

# Checkpoint settings
checkpoint:
  enabled: true
  interval: 300  # 5 minutes, expressed in seconds
  max_checkpoints: 10
  path: ./data/checkpoints

# Logging settings
logging:
  level: INFO
  format: json

  # File logging (disabled in this default profile)
  file:
    enabled: false
    path: ./logs/moltbook-analysis.log
    max_size_mb: 100
    backup_count: 5

# Monitoring settings
monitoring:
  # Prometheus metrics (disabled in this default profile)
  prometheus:
    enabled: false
    port: 9090

  # Health checks
  # NOTE(review): placement under `monitoring` is inferred from context -
  # confirm against the configuration loader's schema.
  health_check:
    enabled: true
    interval: 30
    timeout: 5