#!/usr/bin/env python3
"""
Moltbook Comment Collector (Resumable, Priority-Ordered)
=========================================================
- Skips posts with comment_count == 0 (saves 38% of API calls)
- Fetches highest comment_count posts FIRST (get the gold early)
- Alternates between both API keys every request
- On 429: switch key immediately, if both rate-limited wait 10s
- Resumes from last checkpoint if interrupted
- Saves every 100 posts
MSc Cybersecurity Research - NCI - David Keane IR240474
"""
import urllib.request
import urllib.error
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
# ── Config ────────────────────────────────────────────────────────────────────
# Set your API keys as environment variables:
# export MOLTBOOK_API_KEY_1="moltbook_sk_your_key_here"
# export MOLTBOOK_API_KEY_2="moltbook_sk_your_second_key_here" # optional
BASE_URL = "https://moltbook.com"
_key1 = os.environ.get("MOLTBOOK_API_KEY_1", "")
_key2 = os.environ.get("MOLTBOOK_API_KEY_2", "")
if not _key1:
    raise SystemExit("Error: MOLTBOOK_API_KEY_1 environment variable not set.")
API_KEYS = {"Account1": _key1}
if _key2:
API_KEYS["Account2"] = _key2
KEY_NAMES = list(API_KEYS.keys())
RATE_WAIT = 10.0 # seconds to wait when BOTH keys are rate-limited (short retry)
COOLDOWN_WAIT = 300.0 # seconds to wait after 3 consecutive double-failures (5 min reset)
COOLDOWN_AFTER = 3 # number of both-keys-fail cycles before triggering cooldown
REQUEST_DELAY = 1.0 # seconds between successful requests (sustainable overnight)
SAVE_EVERY = 100 # checkpoint every N posts fetched
OUT_DIR = Path(__file__).parent
POSTS_FILE = OUT_DIR / "all_posts_with_comments.json"
PROGRESS_FILE = OUT_DIR / ".comments_progress.json"
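
# Expected input shape: all_posts_with_comments.json may be either a bare JSON
# list of post objects or a {"posts": [...]} wrapper (main() accepts both).
# Each post should carry at least "id" and "comment_count"; the values below
# are illustrative, not real data:
#
#   {"posts": [{"id": "abc123", "comment_count": 57, "title": "...", ...}]}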
# ── API call with dual-key switching + 5-min cooldown ────────────────────────
_key_index = 0 # global alternating key tracker
_consecutive_fails = 0 # counts how many both-keys-fail cycles in a row
def api_get(path: str) -> dict | list | None:
"""GET with alternating keys.
- On 429: switch key immediately
- Both keys fail → wait RATE_WAIT (10s), increment fail counter
- 3 consecutive both-fail cycles → COOLDOWN (5 min), then retry same post
"""
global _key_index, _consecutive_fails
url = BASE_URL + path
both_fail_count = 0 # both-keys-failed cycles for THIS request
while True:
for attempt in range(len(KEY_NAMES) * 2): # try each key twice
key_name = KEY_NAMES[_key_index % len(KEY_NAMES)]
key = API_KEYS[key_name]
req = urllib.request.Request(
url,
headers={
"Authorization": f"Bearer {key}",
"Accept": "application/json",
"User-Agent": "MoltbookResearchCollector/1.0 (NCI MSc Cybersecurity)",
}
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
_key_index += 1
_consecutive_fails = 0 # reset on success
return json.loads(resp.read().decode("utf-8"))
except urllib.error.HTTPError as e:
if e.code == 429:
other = KEY_NAMES[(_key_index + 1) % len(KEY_NAMES)]
print(f" [429 {key_name}] → trying {other}...", flush=True)
_key_index += 1
else:
body = e.read().decode("utf-8", errors="replace")[:150]
print(f" [HTTP {e.code}] {path}{body}", flush=True)
return None
except Exception as ex:
print(f" [ERROR] {path}{ex}", flush=True)
return None
# All key attempts exhausted for this round
both_fail_count += 1
_consecutive_fails += 1
if _consecutive_fails >= COOLDOWN_AFTER:
# Deep rate-limit — full 5-minute cooldown
resume = datetime.now(timezone.utc).strftime('%H:%M:%S')
wake = time.strftime('%H:%M:%S', time.localtime(time.time() + COOLDOWN_WAIT))
print(f"\n ╔══════════════════════════════════════════╗", flush=True)
print(f" ║ DEEP RATE LIMIT — cooling down 5 min ║", flush=True)
print(f" ║ Now: {resume} | Resume: {wake}", flush=True)
print(f" ╚══════════════════════════════════════════╝\n", flush=True)
time.sleep(COOLDOWN_WAIT)
_consecutive_fails = 0 # reset after cooldown
both_fail_count = 0
else:
# Short wait before next round
print(f" [both keys rate-limited] waiting {RATE_WAIT:.0f}s "
f"(cycle {both_fail_count}, cooldown in {COOLDOWN_AFTER - _consecutive_fails} more)...",
flush=True)
time.sleep(RATE_WAIT)
return None
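
# Worked example of the backoff ladder above, assuming both keys are set:
# a 429 on Account1 switches to Account2 immediately; once each key has been
# tried twice in a round, the function sleeps RATE_WAIT (10 s). Three all-fail
# rounds in a row trigger the COOLDOWN_WAIT (5 min) banner, after which the
# same request is retried with fresh counters.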
# ── Progress tracking ─────────────────────────────────────────────────────────
def load_progress() -> set:
"""Return set of post IDs already fetched."""
if PROGRESS_FILE.exists():
try:
p = json.loads(PROGRESS_FILE.read_text())
done = set(p.get("done_ids", []))
print(f" Resuming — {len(done)} posts already fetched.")
return done
except Exception:
pass
return set()
def save_progress(done_ids: set):
    PROGRESS_FILE.write_text(json.dumps({
        "done_ids": list(done_ids),
        "count": len(done_ids),
        "updated": datetime.now(timezone.utc).isoformat(),
    }))
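
# The checkpoint written above is a small JSON object (IDs illustrative):
#   {"done_ids": ["abc123", "def456"], "count": 2,
#    "updated": "2026-02-07T03:15:00+00:00"}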
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
print("=" * 60)
print("Moltbook Comment Collector (Priority + Resumable)")
print(f"Started: {datetime.now(timezone.utc).isoformat()}")
print("=" * 60)
# Load posts
print(f"\nLoading {POSTS_FILE.name}...")
with open(POSTS_FILE, encoding="utf-8") as f:
data = json.load(f)
posts_raw = data.get("posts", []) if isinstance(data, dict) else data
print(f" Loaded {len(posts_raw):,} posts total.")
# Filter: only posts with replies, sorted highest first
posts_with_replies = [
p for p in posts_raw
if (p.get("comment_count") or 0) > 0
]
posts_with_replies.sort(key=lambda p: p.get("comment_count", 0), reverse=True)
skipped = len(posts_raw) - len(posts_with_replies)
print(f" Skipping {skipped:,} posts with 0 replies.")
print(f" Fetching replies for {len(posts_with_replies):,} posts.")
print(f" Top 5: {[p.get('comment_count') for p in posts_with_replies[:5]]}")
# Resume state
done_ids = load_progress()
todo = [p for p in posts_with_replies if p.get("id") not in done_ids]
total_todo = len(todo)
print(f"\n Remaining: {total_todo:,} posts to fetch\n")
# Build a lookup so we can update posts in-place
post_by_id = {p.get("id"): p for p in posts_raw}
total_comments = sum(
len(p.get("comments", [])) for p in posts_raw if p.get("comments")
)
    fetched_this_run = 0
    for i, post in enumerate(todo):
        pid = post.get("id")
        expected = post.get("comment_count", "?")
        title_preview = (post.get("title") or post.get("content") or "")[:40]

        resp = api_get(f"/api/v1/posts/{pid}/comments?limit=100")
        time.sleep(REQUEST_DELAY)

        if resp is None:
            # Hard failure (non-429 error): record no comments but still mark
            # the post done so one bad post can't stall the whole run.
            comments = []
        elif isinstance(resp, list):
            comments = resp
        else:
            # Tolerate common envelope keys around the comment array
            comments = (
                resp.get("comments") or
                resp.get("data") or
                resp.get("results") or []
            )

        # Update the post in our data
        post_by_id[pid]["comments"] = comments
        total_comments += len(comments)
        done_ids.add(pid)
        fetched_this_run += 1

        # Progress line
        pct = (i + 1) / total_todo * 100
        print(
            f" [{i+1:5d}/{total_todo}] {pct:5.1f}% | "
            f"got {len(comments):3d}/{expected} replies | "
            f"total: {total_comments:,} | {title_preview}",
            flush=True
        )
        # Checkpoint
        if fetched_this_run % SAVE_EVERY == 0:
            save_progress(done_ids)
            data["posts"] = list(post_by_id.values())
            data["total_comments"] = total_comments
            data["last_updated"] = datetime.now(timezone.utc).isoformat()
            with open(POSTS_FILE, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            kb = POSTS_FILE.stat().st_size // 1024
            print(f" ── checkpoint saved ({kb:,} KB) ──", flush=True)
    # Final save
    save_progress(done_ids)
    data["posts"] = list(post_by_id.values())
    data["total_comments"] = total_comments
    data["last_updated"] = datetime.now(timezone.utc).isoformat()
    with open(POSTS_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("DONE")
    print(f" Posts with replies fetched: {len(done_ids):,}")
    print(f" Total comments collected: {total_comments:,}")
    print(f" File: {POSTS_FILE} ({POSTS_FILE.stat().st_size // 1024:,} KB)")
    print("=" * 60)

    # Run completed cleanly, so the resume checkpoint is no longer needed
    if PROGRESS_FILE.exists():
        PROGRESS_FILE.unlink()
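
# For reference, each progress line printed by the loop above looks like
# (values illustrative):
#   [  123/4567]   2.7% | got  42/57 replies | total: 9,876 | Some post title...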
if __name__ == "__main__":
    main()