#!/usr/bin/env python3
"""
Identifiability Evaluation Pipeline

Orchestrates all analysis scripts for the identifiability research:
- RQ1.1: Non-identifiability conditions
- RQ1.2: Autonomy-invariant observables
- RQ1.3: Consistent estimators

OPTIMIZED: Loads data once and passes to all analysis modules (no subprocess overhead).

Usage:
    python eval_identifiability/run_evaluation.py
    python eval_identifiability/run_evaluation.py --figures-only
"""

import argparse
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Project paths
|
|
PROJECT_ROOT = Path(__file__).parent.parent
|
|
SCRIPTS_DIR = Path(__file__).parent / "scripts"
|
|
RESULTS_DIR = Path(__file__).parent / "results"
|
|
FIGURES_DIR = Path(__file__).parent / "figures"
|
|
DATA_DIR = PROJECT_ROOT / "data"
|
|
SUBMOLTS_DIR = DATA_DIR / "submolts"
|
|
PROFILES_DIR = DATA_DIR / "profiles"
|
|
|
|
# Add scripts directory to path for imports
|
|
sys.path.insert(0, str(SCRIPTS_DIR))
|
|
|
|
|
|
def print_header(title: str):
|
|
"""Print formatted header."""
|
|
width = 70
|
|
print("\n" + "=" * width)
|
|
print(f" {title}")
|
|
print("=" * width)
|
|
|
|
|
|
def print_step(step: int, total: int, description: str):
|
|
"""Print step indicator."""
|
|
print(f"\n[{step}/{total}] {description}")
|
|
print("-" * 50)
|
|
|
|
|
|
def check_prerequisites() -> bool:
|
|
"""Check if required data directories exist."""
|
|
print_step(1, 11, "Checking prerequisites & loading data")
|
|
|
|
if not SUBMOLTS_DIR.exists():
|
|
print(f" ERROR: Submolts directory not found: {SUBMOLTS_DIR}")
|
|
print(" Run: python scripts/convert_opencraw_to_submolts.py")
|
|
return False
|
|
|
|
if not PROFILES_DIR.exists():
|
|
print(f" ERROR: Profiles directory not found: {PROFILES_DIR}")
|
|
print(" Run: python scripts/convert_profiles_to_json.py")
|
|
return False
|
|
|
|
# Create output directories
|
|
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
|
|
print(f" Results directory: {RESULTS_DIR}")
|
|
print(f" Figures directory: {FIGURES_DIR}")
|
|
|
|
return True
|
|
|
|
|
|
def load_shared_data():
|
|
"""Load data once for all analysis modules."""
|
|
from data_loader import load_entities_compat, extract_posts_compat
|
|
|
|
print(" Loading entities from data/submolts/ and data/profiles/...")
|
|
entities = load_entities_compat(PROJECT_ROOT, verbose=False)
|
|
print(f" Loaded {len(entities):,} entities")
|
|
|
|
posts = extract_posts_compat(entities)
|
|
print(f" Extracted {len(posts):,} posts")
|
|
|
|
return entities, posts
|
|
|
|
|
|
def run_analysis_module(module_name: str, run_func, posts: list, description: str) -> bool:
|
|
"""Run an analysis module's run() function."""
|
|
print(f" Running: {module_name}")
|
|
try:
|
|
run_func(posts, RESULTS_DIR)
|
|
print(f" SUCCESS: {description}")
|
|
return True
|
|
except Exception as e:
|
|
print(f" ERROR: {description} failed - {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
return False
|
|
|
|
|
|
def run_analysis_scripts(figures_only: bool = False) -> dict:
|
|
"""Run all analysis scripts using shared data."""
|
|
results = {
|
|
"01_generative_model": False,
|
|
"02_nonidentifiability": False,
|
|
"03_participation": False,
|
|
"04_entropy": False,
|
|
"05_burstiness": False,
|
|
"06_stylometry": False,
|
|
"07_cascades": False,
|
|
"08_estimators": False,
|
|
"09_synthesis": False,
|
|
"figures": False,
|
|
}
|
|
|
|
if figures_only:
|
|
for key in results:
|
|
if key != "figures":
|
|
results[key] = True
|
|
else:
|
|
# Load data ONCE
|
|
print_step(2, 11, "Loading shared data")
|
|
entities, posts = load_shared_data()
|
|
|
|
# Import and run each analysis module
|
|
analysis_modules = [
|
|
("01_generative_model", "Generative model", 3),
|
|
("02_nonidentifiability", "Non-identifiability analysis", 4),
|
|
("03_participation_invariants", "Participation invariants", 5),
|
|
("04_crosscommunity_entropy", "Cross-community entropy", 6),
|
|
("05_temporal_burstiness", "Temporal burstiness", 7),
|
|
("06_stylometric_stability", "Stylometric stability", 8),
|
|
("07_cascade_analysis", "Cascade analysis", 9),
|
|
("08_consistent_estimators", "Consistent estimators", 10),
|
|
]
|
|
|
|
# Map module names to result keys
|
|
key_map = {
|
|
"01_generative_model": "01_generative_model",
|
|
"02_nonidentifiability": "02_nonidentifiability",
|
|
"03_participation_invariants": "03_participation",
|
|
"04_crosscommunity_entropy": "04_entropy",
|
|
"05_temporal_burstiness": "05_burstiness",
|
|
"06_stylometric_stability": "06_stylometry",
|
|
"07_cascade_analysis": "07_cascades",
|
|
"08_consistent_estimators": "08_estimators",
|
|
}
|
|
|
|
for module_name, description, step_num in analysis_modules:
|
|
print_step(step_num, 11, description)
|
|
|
|
try:
|
|
# Dynamic import
|
|
module = __import__(module_name)
|
|
|
|
# Check if module has run() function
|
|
if hasattr(module, 'run'):
|
|
result_key = key_map.get(module_name, module_name)
|
|
results[result_key] = run_analysis_module(
|
|
module_name, module.run, posts, description
|
|
)
|
|
else:
|
|
# Fallback: module doesn't have run(), call main()
|
|
print(f" WARNING: {module_name} has no run() function, calling main()")
|
|
if hasattr(module, 'main'):
|
|
module.main()
|
|
result_key = key_map.get(module_name, module_name)
|
|
results[result_key] = True
|
|
else:
|
|
print(f" ERROR: {module_name} has no run() or main() function")
|
|
|
|
except ImportError as e:
|
|
print(f" ERROR: Could not import {module_name}: {e}")
|
|
except Exception as e:
|
|
print(f" ERROR: {description} failed - {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
|
|
# 09_synthesis reads from result files, doesn't need posts
|
|
print_step(10, 11, "RQ synthesis")
|
|
try:
|
|
import importlib
|
|
synthesis = importlib.import_module("09_synthesis")
|
|
if hasattr(synthesis, 'run'):
|
|
results["09_synthesis"] = synthesis.run(RESULTS_DIR)
|
|
elif hasattr(synthesis, 'main'):
|
|
synthesis.main()
|
|
results["09_synthesis"] = True
|
|
print(" SUCCESS: RQ synthesis")
|
|
except Exception as e:
|
|
print(f" ERROR: RQ synthesis failed - {e}")
|
|
|
|
# Generate figures (still uses subprocess since it's independent)
|
|
print_step(11, 11, "Generating figures")
|
|
figures_script = SCRIPTS_DIR / "generate_figures.py"
|
|
if figures_script.exists():
|
|
try:
|
|
result = subprocess.run(
|
|
[sys.executable, str(figures_script)],
|
|
capture_output=True,
|
|
text=True,
|
|
cwd=str(PROJECT_ROOT),
|
|
timeout=300
|
|
)
|
|
results["figures"] = result.returncode == 0
|
|
if results["figures"]:
|
|
print(" SUCCESS: Figure generation")
|
|
else:
|
|
print(f" ERROR: Figure generation failed")
|
|
if result.stderr:
|
|
print(f" {result.stderr[:200]}")
|
|
except Exception as e:
|
|
print(f" ERROR: Figure generation failed - {e}")
|
|
else:
|
|
print(f" SKIP: Figures script not found")
|
|
|
|
return results
|
|
|
|
|
|
def print_summary(results: dict, start_time: datetime):
|
|
"""Print final summary."""
|
|
print_header("PIPELINE COMPLETE")
|
|
print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
duration = datetime.now() - start_time
|
|
print(f"Duration: {duration.total_seconds():.1f} seconds")
|
|
|
|
print("\nStep Results:")
|
|
all_success = True
|
|
for step, success in results.items():
|
|
status = "PASS" if success else "FAIL"
|
|
symbol = "+" if success else "x"
|
|
print(f" {symbol} {step}: {status}")
|
|
if not success:
|
|
all_success = False
|
|
|
|
# List generated files
|
|
print("\nGenerated Files:")
|
|
print(f" Results: {RESULTS_DIR}")
|
|
for f in sorted(RESULTS_DIR.glob("*.json")):
|
|
print(f" - {f.name}")
|
|
|
|
print(f"\n Figures: {FIGURES_DIR}")
|
|
pdf_count = len(list(FIGURES_DIR.glob("*.pdf")))
|
|
png_count = len(list(FIGURES_DIR.glob("*.png")))
|
|
print(f" - {pdf_count} PDF files")
|
|
print(f" - {png_count} PNG files")
|
|
|
|
findings_path = Path(__file__).parent / "IDENTIFIABILITY_FINDINGS.md"
|
|
if findings_path.exists():
|
|
print(f"\n Findings: {findings_path}")
|
|
|
|
if all_success:
|
|
print("\n SUCCESS: All evaluation steps completed!")
|
|
return 0
|
|
else:
|
|
print("\n WARNING: Some steps failed (see above)")
|
|
return 1
|
|
|
|
|
|
def main():
|
|
"""Main evaluation pipeline."""
|
|
parser = argparse.ArgumentParser(
|
|
description="Run identifiability evaluation pipeline",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
python eval_identifiability/run_evaluation.py # Run full pipeline
|
|
python eval_identifiability/run_evaluation.py --figures-only # Only regenerate figures
|
|
"""
|
|
)
|
|
|
|
parser.add_argument("--figures-only", action="store_true",
|
|
help="Only regenerate figures from existing results")
|
|
|
|
args = parser.parse_args()
|
|
start_time = datetime.now()
|
|
|
|
print_header("IDENTIFIABILITY EVALUATION PIPELINE (OPTIMIZED)")
|
|
print(f"Started: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
|
|
print(f"Project: {PROJECT_ROOT}")
|
|
print("Mode: Single-process (data loaded once)")
|
|
|
|
# Check prerequisites
|
|
if not check_prerequisites():
|
|
print("\n PIPELINE FAILED: Prerequisites not met")
|
|
return 1
|
|
|
|
# Run analysis
|
|
results = run_analysis_scripts(figures_only=args.figures_only)
|
|
|
|
# Print summary
|
|
return print_summary(results, start_time)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|