#!/usr/bin/env python3
"""
run_evaluation.py - Microdata Evaluation Pipeline Orchestrator

Runs all analysis scripts for the minimal microdata evaluation:
- RQ3.1: Schema sufficiency for governance tasks
- RQ3.2: Field minimality/necessity
- RQ3.3: Privacy-preserving compression

Optimized to load data once and pass to all modules.

Usage:
    python eval_microdata/run_evaluation.py
"""
import sys
from datetime import datetime
from pathlib import Path

# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
EVAL_DIR = Path(__file__).parent
SCRIPTS_DIR = EVAL_DIR / "scripts"
RESULTS_DIR = EVAL_DIR / "results"
FIGURES_DIR = EVAL_DIR / "figures"
DATA_DIR = PROJECT_ROOT / "data"
SUBMOLTS_DIR = DATA_DIR / "submolts"
PROFILES_DIR = DATA_DIR / "profiles"

# Total numbered pipeline steps:
#   1 prerequisites + 1 data load + 11 analysis modules + 1 figures = 14.
# FIX: the original printed "[N/13]" for the early steps but "[14/14]" for
# the final step; every banner now uses this single constant.
TOTAL_STEPS = 14

# Add scripts directory to path for imports
sys.path.insert(0, str(SCRIPTS_DIR))


def load_shared_data():
    """Load the entity data once so every analysis module can share it.

    Returns:
        The entity list produced by ``data_loader.load_entities_compat``.
    """
    # Imported lazily: data_loader lives in SCRIPTS_DIR, which is only on
    # sys.path after the insert above.
    from data_loader import load_entities_compat
    entities = load_entities_compat(PROJECT_ROOT, verbose=False)
    return entities


def run_analysis_module(module_name: str, run_func, entities: list,
                        description: str, needs_data: bool = True) -> bool:
    """Run an analysis module with shared data.

    Args:
        module_name: Name of the module (for error reporting).
        run_func: The module's run() function.
        entities: Shared entity data.
        description: Human-readable description.
        needs_data: Whether the module needs entity data.

    Returns:
        True if successful, False otherwise.
    """
    try:
        if needs_data:
            run_func(entities, RESULTS_DIR)
        else:
            run_func(RESULTS_DIR)
        return True
    except Exception as e:
        # Broad catch is deliberate: one failing module must not abort the
        # rest of the pipeline. The failure is reported and recorded.
        print(f" ERROR: {description} failed with {e}")
        import traceback
        traceback.print_exc()
        return False


def run_figures_module() -> bool:
    """Run the figure generation module.

    Returns:
        True if successful, False otherwise.
    """
    try:
        import generate_figures
        generate_figures.main()
        return True
    except Exception as e:
        print(f" ERROR: Figure generation failed with {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run the complete evaluation pipeline.

    Exits with status 1 if prerequisites are missing or any step fails.
    """
    print("=" * 70)
    print(" MICRODATA EVALUATION PIPELINE")
    print("=" * 70)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Project: {PROJECT_ROOT}")
    print()

    # Create output directories
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)

    # Check prerequisites - use data/ directory
    print(f"[1/{TOTAL_STEPS}] Checking prerequisites")
    print("-" * 50)
    if not SUBMOLTS_DIR.exists():
        print(f" ERROR: Submolts directory not found: {SUBMOLTS_DIR}")
        print(" Run: python scripts/convert_opencraw_to_submolts.py")
        sys.exit(1)
    if not PROFILES_DIR.exists():
        print(f" ERROR: Profiles directory not found: {PROFILES_DIR}")
        print(" Run: python scripts/convert_profiles_to_json.py")
        sys.exit(1)

    # Count data files
    post_files = list(SUBMOLTS_DIR.rglob("*.json"))
    profile_files = list(PROFILES_DIR.glob("*.json"))
    print(f" Data source: {DATA_DIR}")
    print(f" Posts: {len(post_files):,} files")
    print(f" Profiles: {len(profile_files):,} files")
    print(f" Results directory: {RESULTS_DIR}")
    print(f" Figures directory: {FIGURES_DIR}")
    print()

    # Track results
    results = {}
    start_time = datetime.now()

    # Load shared data once
    print(f"[2/{TOTAL_STEPS}] Loading shared data")
    print("-" * 50)
    entities = load_shared_data()
    print(f" Loaded {len(entities):,} entities")
    print()

    # Analysis modules: (module_name, result_key, description, step_num, needs_data)
    analysis_modules = [
        ("01_schema_definition", "01_schema", "Schema definition", 3, True),
        ("02_task_definitions", "02_tasks", "Task definitions", 4, True),
        ("03_coordinated_activity", "03_coordination", "Coordinated activity detection", 5, True),
        ("04_cross_community_diffusion", "04_diffusion", "Cross-community diffusion", 6, True),
        ("05_engagement_dynamics", "05_engagement", "Engagement dynamics", 7, True),
        ("06_leakage_patterns", "06_leakage", "Leakage pattern detection", 8, True),
        ("07_necessity_proofs", "07_necessity", "Necessity proofs", 9, True),
        ("08_ablation_study", "08_ablation", "Ablation study", 10, True),
        ("09_compression_analysis", "09_compression", "Compression analysis", 11, True),
        ("10_synthesis", "10_synthesis", "RQ synthesis", 12, False),
        ("11_capability_diffusion", "11_capability", "Capability diffusion", 13, True),
    ]

    for module_name, result_key, description, step_num, needs_data in analysis_modules:
        print(f"[{step_num}/{TOTAL_STEPS}] {description}")
        print("-" * 50)
        print(f" Running: {module_name}")
        try:
            # __import__ is required because module names start with digits,
            # which makes a normal import statement a syntax error.
            module = __import__(module_name)
            run_func = getattr(module, 'run', None)
            if run_func is None:
                # Legacy modules expose main() instead of run(entities, dir).
                print(f" WARNING: {module_name} has no run() function, using main()")
                success = False
                try:
                    module.main()
                    success = True
                except Exception as e:
                    print(f" ERROR: {description} failed with {e}")
            else:
                success = run_analysis_module(module_name, run_func, entities,
                                              description, needs_data)
        except ImportError as e:
            print(f" ERROR: Could not import {module_name}: {e}")
            success = False
        results[result_key] = "PASS" if success else "FAIL"
        print(f" {'SUCCESS' if success else 'FAILED'}: {description}")
        print()

    # Generate figures
    print(f"[14/{TOTAL_STEPS}] Figure generation")
    print("-" * 50)
    success = run_figures_module()
    results["figures"] = "PASS" if success else "FAIL"
    print(f" {'SUCCESS' if success else 'FAILED'}: Figure generation")
    print()

    # Summary
    duration = (datetime.now() - start_time).total_seconds()
    print("=" * 70)
    print(" PIPELINE COMPLETE")
    print("=" * 70)
    print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Duration: {duration:.1f} seconds")
    print()
    print("Step Results:")
    step_names = [
        ("01_schema", "Schema definition"),
        ("02_tasks", "Task definitions"),
        ("03_coordination", "Coordinated activity"),
        ("04_diffusion", "Cross-community diffusion"),
        ("05_engagement", "Engagement dynamics"),
        ("06_leakage", "Leakage patterns"),
        ("07_necessity", "Necessity proofs"),
        ("08_ablation", "Ablation study"),
        ("09_compression", "Compression analysis"),
        ("10_synthesis", "RQ synthesis"),
        ("11_capability", "Capability diffusion"),
        ("figures", "Figure generation"),
    ]
    all_passed = True
    for key, name in step_names:
        status = results.get(key, "SKIP")
        symbol = "+" if status == "PASS" else "x"
        print(f" {symbol} {name}: {status}")
        if status != "PASS":
            all_passed = False
    print()

    # List generated files
    print("Generated Files:")
    print(f" Results: {RESULTS_DIR}")
    for f in sorted(RESULTS_DIR.glob("*.json")):
        print(f" - {f.name}")
    print(f"\n Figures: {FIGURES_DIR}")
    pdf_count = len(list(FIGURES_DIR.glob("*.pdf")))
    png_count = len(list(FIGURES_DIR.glob("*.png")))
    print(f" - {pdf_count} PDF files")
    print(f" - {png_count} PNG files")
    findings_path = EVAL_DIR / "MICRODATA_FINDINGS.md"
    if findings_path.exists():
        print(f"\n Findings: {findings_path}")
    print()
    if all_passed:
        print(" SUCCESS: All evaluation steps completed!")
    else:
        print(" WARNING: Some steps failed (see above)")
        sys.exit(1)


if __name__ == "__main__":
    main()