#!/usr/bin/env python3
"""
Identifiability Evaluation Pipeline

Orchestrates all analysis scripts for the identifiability research:
- RQ1.1: Non-identifiability conditions
- RQ1.2: Autonomy-invariant observables
- RQ1.3: Consistent estimators

OPTIMIZED: Loads data once and passes to all analysis modules (no subprocess overhead).

Usage:
    python eval_identifiability/run_evaluation.py
    python eval_identifiability/run_evaluation.py --figures-only
"""

import argparse
import importlib
import subprocess
import sys
import traceback
from datetime import datetime
from pathlib import Path

# Project paths — all derived from this file's location so the pipeline
# behaves the same regardless of the caller's working directory.
PROJECT_ROOT = Path(__file__).parent.parent
SCRIPTS_DIR = Path(__file__).parent / "scripts"
RESULTS_DIR = Path(__file__).parent / "results"
FIGURES_DIR = Path(__file__).parent / "figures"
DATA_DIR = PROJECT_ROOT / "data"
SUBMOLTS_DIR = DATA_DIR / "submolts"
PROFILES_DIR = DATA_DIR / "profiles"

# Total number of numbered progress steps printed by the pipeline.
TOTAL_STEPS = 11

# Add scripts directory to path so analysis modules import by bare name.
sys.path.insert(0, str(SCRIPTS_DIR))

# (module name, human-readable description, progress-step number) for the
# posts-based analyses, in execution order.
_ANALYSIS_MODULES = [
    ("01_generative_model", "Generative model", 3),
    ("02_nonidentifiability", "Non-identifiability analysis", 4),
    ("03_participation_invariants", "Participation invariants", 5),
    ("04_crosscommunity_entropy", "Cross-community entropy", 6),
    ("05_temporal_burstiness", "Temporal burstiness", 7),
    ("06_stylometric_stability", "Stylometric stability", 8),
    ("07_cascade_analysis", "Cascade analysis", 9),
    ("08_consistent_estimators", "Consistent estimators", 10),
]

# Map module names to the (shorter) keys used in the results dict.
_KEY_MAP = {
    "01_generative_model": "01_generative_model",
    "02_nonidentifiability": "02_nonidentifiability",
    "03_participation_invariants": "03_participation",
    "04_crosscommunity_entropy": "04_entropy",
    "05_temporal_burstiness": "05_burstiness",
    "06_stylometric_stability": "06_stylometry",
    "07_cascade_analysis": "07_cascades",
    "08_consistent_estimators": "08_estimators",
}


def print_header(title: str) -> None:
    """Print formatted header."""
    width = 70
    print("\n" + "=" * width)
    print(f" {title}")
    print("=" * width)


def print_step(step: int, total: int, description: str) -> None:
    """Print step indicator."""
    print(f"\n[{step}/{total}] {description}")
    print("-" * 50)


def check_prerequisites() -> bool:
    """Check that required data directories exist; create output directories.

    Returns:
        True when both input directories exist (output dirs are then created),
        False otherwise, after printing a hint about which conversion script
        to run.
    """
    print_step(1, TOTAL_STEPS, "Checking prerequisites & loading data")
    if not SUBMOLTS_DIR.exists():
        print(f" ERROR: Submolts directory not found: {SUBMOLTS_DIR}")
        print(" Run: python scripts/convert_opencraw_to_submolts.py")
        return False
    if not PROFILES_DIR.exists():
        print(f" ERROR: Profiles directory not found: {PROFILES_DIR}")
        print(" Run: python scripts/convert_profiles_to_json.py")
        return False
    # Create output directories
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)
    print(f" Results directory: {RESULTS_DIR}")
    print(f" Figures directory: {FIGURES_DIR}")
    return True


def load_shared_data():
    """Load data once for all analysis modules.

    Returns:
        (entities, posts) tuple that is shared by every analysis step.
    """
    # Imported lazily: data_loader lives in SCRIPTS_DIR, which is put on
    # sys.path at module import time.
    from data_loader import load_entities_compat, extract_posts_compat
    print(" Loading entities from data/submolts/ and data/profiles/...")
    entities = load_entities_compat(PROJECT_ROOT, verbose=False)
    print(f" Loaded {len(entities):,} entities")
    posts = extract_posts_compat(entities)
    print(f" Extracted {len(posts):,} posts")
    return entities, posts


def run_analysis_module(module_name: str, run_func, posts: list, description: str) -> bool:
    """Run an analysis module's run() function.

    Args:
        module_name: Module name, used only for log output.
        run_func: The module's ``run(posts, results_dir)`` callable.
        posts: Shared post list produced by load_shared_data().
        description: Human-readable step description for log output.

    Returns:
        True on success; False when ``run_func`` raised (traceback printed).
    """
    print(f" Running: {module_name}")
    try:
        run_func(posts, RESULTS_DIR)
        print(f" SUCCESS: {description}")
        return True
    except Exception as e:
        print(f" ERROR: {description} failed - {e}")
        traceback.print_exc()
        return False


def _run_posts_analyses(posts: list, results: dict) -> None:
    """Import and run each posts-based analysis module, recording success."""
    for module_name, description, step_num in _ANALYSIS_MODULES:
        print_step(step_num, TOTAL_STEPS, description)
        try:
            # import_module (not raw __import__) is the documented API and
            # handles module names that start with digits, e.g. "01_...".
            module = importlib.import_module(module_name)
            if hasattr(module, 'run'):
                result_key = _KEY_MAP.get(module_name, module_name)
                results[result_key] = run_analysis_module(
                    module_name, module.run, posts, description
                )
            else:
                # Fallback: module doesn't have run(), call main()
                print(f" WARNING: {module_name} has no run() function, calling main()")
                if hasattr(module, 'main'):
                    module.main()
                    result_key = _KEY_MAP.get(module_name, module_name)
                    results[result_key] = True
                else:
                    print(f" ERROR: {module_name} has no run() or main() function")
        except ImportError as e:
            print(f" ERROR: Could not import {module_name}: {e}")
        except Exception as e:
            print(f" ERROR: {description} failed - {e}")
            traceback.print_exc()


def _run_synthesis(results: dict) -> None:
    """Run 09_synthesis (reads from result files, doesn't need posts)."""
    print_step(10, TOTAL_STEPS, "RQ synthesis")
    try:
        synthesis = importlib.import_module("09_synthesis")
        if hasattr(synthesis, 'run'):
            results["09_synthesis"] = synthesis.run(RESULTS_DIR)
        elif hasattr(synthesis, 'main'):
            synthesis.main()
            results["09_synthesis"] = True
        print(" SUCCESS: RQ synthesis")
    except Exception as e:
        print(f" ERROR: RQ synthesis failed - {e}")
        # Consistent with the other steps' error reporting.
        traceback.print_exc()


def _generate_figures(results: dict) -> None:
    """Generate figures via subprocess (independent of the shared data)."""
    print_step(11, TOTAL_STEPS, "Generating figures")
    figures_script = SCRIPTS_DIR / "generate_figures.py"
    if not figures_script.exists():
        print(" SKIP: Figures script not found")
        return
    try:
        result = subprocess.run(
            [sys.executable, str(figures_script)],
            capture_output=True,
            text=True,
            cwd=str(PROJECT_ROOT),
            timeout=300,
        )
        results["figures"] = result.returncode == 0
        if results["figures"]:
            print(" SUCCESS: Figure generation")
        else:
            print(" ERROR: Figure generation failed")
            if result.stderr:
                print(f" {result.stderr[:200]}")
    except Exception as e:
        # Covers subprocess.TimeoutExpired as well.
        print(f" ERROR: Figure generation failed - {e}")


def run_analysis_scripts(figures_only: bool = False) -> dict:
    """Run all analysis scripts using shared data.

    Args:
        figures_only: When True, skip analyses (mark them passed) and only
            regenerate figures.

    Returns:
        Mapping of step key -> success flag for the summary report.
    """
    results = {
        "01_generative_model": False,
        "02_nonidentifiability": False,
        "03_participation": False,
        "04_entropy": False,
        "05_burstiness": False,
        "06_stylometry": False,
        "07_cascades": False,
        "08_estimators": False,
        "09_synthesis": False,
        "figures": False,
    }

    if figures_only:
        # Mark the skipped analysis steps as passed so the summary stays green.
        for key in results:
            if key != "figures":
                results[key] = True
    else:
        # Load data ONCE and share it across every analysis module.
        print_step(2, TOTAL_STEPS, "Loading shared data")
        entities, posts = load_shared_data()
        _run_posts_analyses(posts, results)
        _run_synthesis(results)

    # Figures are regenerated in both modes.
    _generate_figures(results)
    return results


def print_summary(results: dict, start_time: datetime) -> int:
    """Print final summary.

    Args:
        results: Step key -> success flag mapping from run_analysis_scripts().
        start_time: Pipeline start timestamp, for the duration report.

    Returns:
        0 when every step succeeded, 1 otherwise (used as the exit code).
    """
    print_header("PIPELINE COMPLETE")
    print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    duration = datetime.now() - start_time
    print(f"Duration: {duration.total_seconds():.1f} seconds")

    print("\nStep Results:")
    all_success = True
    for step, success in results.items():
        status = "PASS" if success else "FAIL"
        symbol = "+" if success else "x"
        print(f" {symbol} {step}: {status}")
        if not success:
            all_success = False

    # List generated files
    print("\nGenerated Files:")
    print(f" Results: {RESULTS_DIR}")
    for f in sorted(RESULTS_DIR.glob("*.json")):
        print(f" - {f.name}")
    print(f"\n Figures: {FIGURES_DIR}")
    pdf_count = len(list(FIGURES_DIR.glob("*.pdf")))
    png_count = len(list(FIGURES_DIR.glob("*.png")))
    print(f" - {pdf_count} PDF files")
    print(f" - {png_count} PNG files")

    findings_path = Path(__file__).parent / "IDENTIFIABILITY_FINDINGS.md"
    if findings_path.exists():
        print(f"\n Findings: {findings_path}")

    if all_success:
        print("\n SUCCESS: All evaluation steps completed!")
        return 0
    print("\n WARNING: Some steps failed (see above)")
    return 1


def main() -> int:
    """Main evaluation pipeline."""
    parser = argparse.ArgumentParser(
        description="Run identifiability evaluation pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python eval_identifiability/run_evaluation.py                 # Run full pipeline
  python eval_identifiability/run_evaluation.py --figures-only  # Only regenerate figures
""",
    )
    parser.add_argument("--figures-only", action="store_true",
                        help="Only regenerate figures from existing results")
    args = parser.parse_args()

    start_time = datetime.now()
    print_header("IDENTIFIABILITY EVALUATION PIPELINE (OPTIMIZED)")
    print(f"Started: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Project: {PROJECT_ROOT}")
    print("Mode: Single-process (data loaded once)")

    # Check prerequisites
    if not check_prerequisites():
        print("\n PIPELINE FAILED: Prerequisites not met")
        return 1

    # Run analysis
    results = run_analysis_scripts(figures_only=args.figures_only)

    # Print summary (its return value is the process exit code)
    return print_summary(results, start_time)


if __name__ == "__main__":
    sys.exit(main())