# montana/Русский/Разведка/Moltbook/github/moltbook-analysis/eval/identifiability/run_evaluation.py
#!/usr/bin/env python3
"""
Identifiability Evaluation Pipeline
Orchestrates all analysis scripts for the identifiability research:
- RQ1.1: Non-identifiability conditions
- RQ1.2: Autonomy-invariant observables
- RQ1.3: Consistent estimators
OPTIMIZED: Loads data once and passes to all analysis modules (no subprocess overhead).
Usage:
python eval_identifiability/run_evaluation.py
python eval_identifiability/run_evaluation.py --figures-only
"""
import argparse
import subprocess
import sys
from datetime import datetime
from pathlib import Path

# Project paths — all derived from this file's location so the pipeline
# works regardless of the current working directory.
PROJECT_ROOT = Path(__file__).parent.parent
SCRIPTS_DIR = Path(__file__).parent / "scripts"
RESULTS_DIR = Path(__file__).parent / "results"
FIGURES_DIR = Path(__file__).parent / "figures"
DATA_DIR = PROJECT_ROOT / "data"
SUBMOLTS_DIR = DATA_DIR / "submolts"
PROFILES_DIR = DATA_DIR / "profiles"

# Add scripts directory to path for imports — required because the analysis
# modules (e.g. "01_generative_model") are imported dynamically by name in
# run_analysis_scripts() and are not importable as a package.
sys.path.insert(0, str(SCRIPTS_DIR))
def print_header(title: str):
    """Print a section banner: the title framed by two 70-char '=' rules."""
    rule = "=" * 70
    print(f"\n{rule}\n {title}\n{rule}")
def print_step(step: int, total: int, description: str):
    """Announce pipeline step `step` of `total`, underlined by a 50-char rule."""
    banner = "\n[{}/{}] {}".format(step, total, description)
    print(banner)
    print("-" * 50)
def check_prerequisites() -> bool:
    """Verify the input data directories exist and create output directories.

    Checks data/submolts/ then data/profiles/; on the first missing one,
    prints an error plus the conversion script to run and returns False.
    On success, creates the results/ and figures/ output directories and
    returns True.
    """
    print_step(1, 11, "Checking prerequisites & loading data")
    # (directory, human label, conversion script that produces it)
    required = (
        (SUBMOLTS_DIR, "Submolts", "convert_opencraw_to_submolts.py"),
        (PROFILES_DIR, "Profiles", "convert_profiles_to_json.py"),
    )
    for directory, label, converter in required:
        if not directory.exists():
            print(f" ERROR: {label} directory not found: {directory}")
            print(f" Run: python scripts/{converter}")
            return False
    # Create output directories
    for out_dir in (RESULTS_DIR, FIGURES_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)
    print(f" Results directory: {RESULTS_DIR}")
    print(f" Figures directory: {FIGURES_DIR}")
    return True
def load_shared_data():
    """Load entities and posts exactly once, for reuse by every analysis module.

    Returns:
        (entities, posts) — entities from data/submolts/ and data/profiles/
        via the project's data_loader compat helpers, plus the posts
        extracted from them.
    """
    # Deferred project-local import: data_loader lives in SCRIPTS_DIR,
    # which is appended to sys.path at module import time.
    from data_loader import load_entities_compat, extract_posts_compat

    print(" Loading entities from data/submolts/ and data/profiles/...")
    loaded_entities = load_entities_compat(PROJECT_ROOT, verbose=False)
    print(f" Loaded {len(loaded_entities):,} entities")
    extracted_posts = extract_posts_compat(loaded_entities)
    print(f" Extracted {len(extracted_posts):,} posts")
    return loaded_entities, extracted_posts
def run_analysis_module(module_name: str, run_func, posts: list, description: str) -> bool:
    """Invoke one analysis module's run(posts, RESULTS_DIR) entry point.

    Returns:
        True on success; False if the call raised (the error and a full
        traceback are printed, never propagated).
    """
    print(f" Running: {module_name}")
    try:
        run_func(posts, RESULTS_DIR)
    except Exception as e:
        print(f" ERROR: {description} failed - {e}")
        import traceback
        traceback.print_exc()
        return False
    else:
        print(f" SUCCESS: {description}")
        return True
def run_analysis_scripts(figures_only: bool = False) -> dict:
    """Run all analysis scripts using shared data.

    Steps 2-10 import each analysis module dynamically (by its numeric
    module name, importable thanks to the sys.path insert at module load)
    and call its run() entry point with the shared posts list.  Step 11
    regenerates figures via a subprocess.  When figures_only is True,
    steps 2-10 are skipped and marked as passed.

    Returns:
        dict mapping each step key to a success flag (bool; the
        "09_synthesis" entry may hold run()'s return value directly —
        NOTE(review): if synthesis.run() returns None, the step is
        reported as FAIL even though it raised nothing — confirm its
        contract).
    """
    # All steps start pessimistic; each is flipped to True only on success.
    results = {
        "01_generative_model": False,
        "02_nonidentifiability": False,
        "03_participation": False,
        "04_entropy": False,
        "05_burstiness": False,
        "06_stylometry": False,
        "07_cascades": False,
        "08_estimators": False,
        "09_synthesis": False,
        "figures": False,
    }
    if figures_only:
        # Pretend the analysis steps passed so the summary only reflects
        # the figure-regeneration outcome.
        for key in results:
            if key != "figures":
                results[key] = True
    else:
        # Load data ONCE — this is the optimization over the old
        # one-subprocess-per-script pipeline.
        print_step(2, 11, "Loading shared data")
        entities, posts = load_shared_data()
        # `entities` is loaded alongside posts but not used below; only
        # `posts` is handed to the analysis modules.

        # (module name, human description, step number for progress output)
        analysis_modules = [
            ("01_generative_model", "Generative model", 3),
            ("02_nonidentifiability", "Non-identifiability analysis", 4),
            ("03_participation_invariants", "Participation invariants", 5),
            ("04_crosscommunity_entropy", "Cross-community entropy", 6),
            ("05_temporal_burstiness", "Temporal burstiness", 7),
            ("06_stylometric_stability", "Stylometric stability", 8),
            ("07_cascade_analysis", "Cascade analysis", 9),
            ("08_consistent_estimators", "Consistent estimators", 10),
        ]
        # Map module names to result keys (some result keys are shortened).
        key_map = {
            "01_generative_model": "01_generative_model",
            "02_nonidentifiability": "02_nonidentifiability",
            "03_participation_invariants": "03_participation",
            "04_crosscommunity_entropy": "04_entropy",
            "05_temporal_burstiness": "05_burstiness",
            "06_stylometric_stability": "06_stylometry",
            "07_cascade_analysis": "07_cascades",
            "08_consistent_estimators": "08_estimators",
        }
        for module_name, description, step_num in analysis_modules:
            print_step(step_num, 11, description)
            try:
                # Dynamic import — module names start with digits, so a
                # normal `import` statement cannot be used.
                module = __import__(module_name)
                # Preferred entry point: run(posts, results_dir)
                if hasattr(module, 'run'):
                    result_key = key_map.get(module_name, module_name)
                    results[result_key] = run_analysis_module(
                        module_name, module.run, posts, description
                    )
                else:
                    # Fallback: module doesn't have run(), call main()
                    # (main() loads its own data, so this path loses the
                    # shared-data optimization).
                    print(f" WARNING: {module_name} has no run() function, calling main()")
                    if hasattr(module, 'main'):
                        module.main()
                        result_key = key_map.get(module_name, module_name)
                        results[result_key] = True
                    else:
                        # Neither entry point: leave the step marked False.
                        print(f" ERROR: {module_name} has no run() or main() function")
            except ImportError as e:
                print(f" ERROR: Could not import {module_name}: {e}")
            except Exception as e:
                print(f" ERROR: {description} failed - {e}")
                import traceback
                traceback.print_exc()
        # 09_synthesis reads from result files, doesn't need posts
        print_step(10, 11, "RQ synthesis")
        try:
            import importlib
            synthesis = importlib.import_module("09_synthesis")
            if hasattr(synthesis, 'run'):
                # run()'s return value is used directly as the success flag.
                results["09_synthesis"] = synthesis.run(RESULTS_DIR)
            elif hasattr(synthesis, 'main'):
                synthesis.main()
                results["09_synthesis"] = True
                print(" SUCCESS: RQ synthesis")
        except Exception as e:
            print(f" ERROR: RQ synthesis failed - {e}")
    # Generate figures (still uses subprocess since it's independent)
    print_step(11, 11, "Generating figures")
    figures_script = SCRIPTS_DIR / "generate_figures.py"
    if figures_script.exists():
        try:
            result = subprocess.run(
                [sys.executable, str(figures_script)],
                capture_output=True,
                text=True,
                cwd=str(PROJECT_ROOT),
                timeout=300  # seconds; a hung matplotlib run shouldn't stall the pipeline
            )
            results["figures"] = result.returncode == 0
            if results["figures"]:
                print(" SUCCESS: Figure generation")
            else:
                print(f" ERROR: Figure generation failed")
                if result.stderr:
                    # Truncate stderr so a long traceback doesn't flood the log.
                    print(f" {result.stderr[:200]}")
        except Exception as e:
            # Covers subprocess.TimeoutExpired and launch failures.
            print(f" ERROR: Figure generation failed - {e}")
    else:
        print(f" SKIP: Figures script not found")
    return results
def print_summary(results: dict, start_time: datetime):
    """Report per-step pass/fail status and list the generated artifacts.

    Args:
        results: mapping of step key -> success flag from run_analysis_scripts().
        start_time: pipeline start, used to report the total duration.

    Returns:
        0 when every step succeeded, 1 otherwise (used as the exit code).
    """
    print_header("PIPELINE COMPLETE")
    print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    duration = datetime.now() - start_time
    print(f"Duration: {duration.total_seconds():.1f} seconds")

    print("\nStep Results:")
    all_success = all(results.values())
    for step, success in results.items():
        symbol, status = ("+", "PASS") if success else ("x", "FAIL")
        print(f" {symbol} {step}: {status}")

    # List generated files
    print("\nGenerated Files:")
    print(f" Results: {RESULTS_DIR}")
    for result_file in sorted(RESULTS_DIR.glob("*.json")):
        print(f" - {result_file.name}")
    print(f"\n Figures: {FIGURES_DIR}")
    pdf_count = sum(1 for _ in FIGURES_DIR.glob("*.pdf"))
    png_count = sum(1 for _ in FIGURES_DIR.glob("*.png"))
    print(f" - {pdf_count} PDF files")
    print(f" - {png_count} PNG files")

    findings_path = Path(__file__).parent / "IDENTIFIABILITY_FINDINGS.md"
    if findings_path.exists():
        print(f"\n Findings: {findings_path}")

    if not all_success:
        print("\n WARNING: Some steps failed (see above)")
        return 1
    print("\n SUCCESS: All evaluation steps completed!")
    return 0
def main():
    """Parse CLI options, run the evaluation pipeline, and return an exit code.

    Returns:
        0 when every pipeline step succeeded; 1 on missing prerequisites
        or any failed step (propagated from print_summary()).
    """
    arg_parser = argparse.ArgumentParser(
        description="Run identifiability evaluation pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python eval_identifiability/run_evaluation.py # Run full pipeline
python eval_identifiability/run_evaluation.py --figures-only # Only regenerate figures
"""
    )
    arg_parser.add_argument(
        "--figures-only",
        action="store_true",
        help="Only regenerate figures from existing results",
    )
    cli_args = arg_parser.parse_args()

    launched_at = datetime.now()
    print_header("IDENTIFIABILITY EVALUATION PIPELINE (OPTIMIZED)")
    print(f"Started: {launched_at.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Project: {PROJECT_ROOT}")
    print("Mode: Single-process (data loaded once)")

    # Abort early when input data directories are missing.
    if not check_prerequisites():
        print("\n PIPELINE FAILED: Prerequisites not met")
        return 1

    step_results = run_analysis_scripts(figures_only=cli_args.figures_only)
    return print_summary(step_results, launched_at)
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())