#!/usr/bin/env python3
"""
Moltbook Post Collector (Resumable)
=====================================
MSc Cybersecurity Research - NCI (National College of Ireland)
David Keane (IR240474) - Thesis: AI-to-AI Indirect Prompt Injection
Collects all posts with full metadata. Resumes from last cursor if stopped.
Run multiple times — new posts are merged in, duplicates skipped.
Comments are handled separately by collect_comments.py.
"""
import urllib.request
import urllib.error
import urllib.parse
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
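# Standard library only; no third-party dependencies are required.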
# ── Config ────────────────────────────────────────────────────────────────────
# Set your API keys as environment variables:
# export MOLTBOOK_API_KEY_1="moltbook_sk_your_key_here"
# export MOLTBOOK_API_KEY_2="moltbook_sk_your_second_key_here" # optional
BASE_URL = "https://moltbook.com"
_key1 = os.environ.get("MOLTBOOK_API_KEY_1", "")
_key2 = os.environ.get("MOLTBOOK_API_KEY_2", "")
if not _key1:
    raise SystemExit("Error: MOLTBOOK_API_KEY_1 environment variable not set.")
API_KEYS = {"Account1": _key1}
if _key2:
    API_KEYS["Account2"] = _key2
KEY_NAMES = list(API_KEYS.keys())
PAGE_LIMIT = 100 # posts per page
RATE_WAIT = 10.0 # seconds to wait when both keys rate-limited
PAGE_DELAY = 0.5 # seconds between successful page fetches
OUT_DIR = Path(__file__).parent
POSTS_FILE = OUT_DIR / "all_posts_with_comments.json"
CURSOR_FILE = OUT_DIR / ".posts_cursor.json" # saves last cursor + known IDs
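# The cursor file written by save_cursor() below holds, roughly:
#   {"last_cursor": <opaque cursor or null>, "known_ids": [<post ids>],
#    "count": <len(known_ids)>, "saved_at": <ISO-8601 UTC timestamp>}
# Field names come from save_cursor(); the values shown are placeholders.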
# ── API with dual-key switching ───────────────────────────────────────────────
_key_index = 0
def api_get(path: str, params: dict | None = None) -> dict | list | None:
    """GET an API path as JSON, rotating keys on 429; returns None on failure."""
    global _key_index
    url = BASE_URL + path
    if params:
        url += "?" + urllib.parse.urlencode(params)
    for attempt in range(10):
        # Rotate through the keys on every request to spread load across accounts.
        key_name = KEY_NAMES[_key_index % len(KEY_NAMES)]
        key = API_KEYS[key_name]
        req = urllib.request.Request(
            url,
            headers={
                "Authorization": f"Bearer {key}",
                "Accept": "application/json",
                "User-Agent": "MoltbookResearchCollector/1.0 (NCI MSc Cybersecurity Thesis)",
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                _key_index += 1
                return json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as e:
            if e.code == 429:
                other = KEY_NAMES[(_key_index + 1) % len(KEY_NAMES)]
                print(f" [429 {key_name}] → switching to {other}...", flush=True)
                _key_index += 1
                # Every second 429 means both keys have been tried; back off.
                if attempt % 2 == 1:
                    print(f" [both rate-limited] waiting {RATE_WAIT}s...", flush=True)
                    time.sleep(RATE_WAIT)
            else:
                body = e.read().decode("utf-8", errors="replace")[:200]
                print(f" [HTTP {e.code}] {url}: {body}", flush=True)
                return None
        except Exception as ex:
            print(f" [ERROR] {url}: {ex}", flush=True)
            return None
    print(f" [FAIL] gave up: {url}", flush=True)
    return None
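# Example call, mirroring the one in main(). Depending on the endpoint,
# api_get() returns a dict, a bare list, or None on failure:
#   resp = api_get("/api/v1/posts", {"limit": PAGE_LIMIT})
#   batch = resp.get("posts") if isinstance(resp, dict) else (resp or [])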
# ── Cursor progress ───────────────────────────────────────────────────────────
def load_cursor() -> tuple[str | None, set]:
    """Returns (last_cursor, set_of_known_post_ids)."""
    if CURSOR_FILE.exists():
        try:
            c = json.loads(CURSOR_FILE.read_text())
            cursor = c.get("last_cursor")
            known_ids = set(c.get("known_ids", []))
            saved_at = c.get("saved_at", "unknown")
            print(f" Resuming from cursor saved at {saved_at}")
            print(f" Known posts: {len(known_ids):,} | cursor: {str(cursor)[:40]}...")
            return cursor, known_ids
        except Exception:
            pass
    return None, set()

def save_cursor(cursor: str | None, known_ids: set):
    CURSOR_FILE.write_text(json.dumps({
        "last_cursor": cursor,
        "known_ids": list(known_ids),
        "count": len(known_ids),
        "saved_at": datetime.now(timezone.utc).isoformat(),
    }))
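# save_cursor() is called after every page (see main()), so an interrupted
# run loses at most one page of progress. The full known-ID list is rewritten
# on each save, so the cursor file grows along with the collection.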
# ── Load existing posts ───────────────────────────────────────────────────────
def load_existing() -> tuple[dict, list]:
    """Load existing JSON file. Returns (data_dict, posts_list)."""
    if POSTS_FILE.exists():
        try:
            with open(POSTS_FILE, encoding="utf-8") as f:
                data = json.load(f)
            posts = data.get("posts", []) if isinstance(data, dict) else data
            print(f" Loaded existing file: {len(posts):,} posts ({POSTS_FILE.stat().st_size // 1024:,} KB)")
            return (data if isinstance(data, dict) else {}), posts
        except Exception as ex:
            print(f" [WARN] Could not load existing file: {ex}")
    return {}, []
# ── Save ──────────────────────────────────────────────────────────────────────
def save_posts(data: dict, posts: list):
    data["posts"] = posts
    data["total_posts"] = len(posts)
    data["last_updated"] = datetime.now(timezone.utc).isoformat()
    data.setdefault("research", "MSc Cybersecurity NCI - AI-to-AI Indirect Prompt Injection")
    data.setdefault("researcher", "David Keane IR240474")
    with open(POSTS_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f" Saved: {POSTS_FILE} ({POSTS_FILE.stat().st_size // 1024:,} KB)", flush=True)
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    print("=" * 60)
    print("Moltbook Post Collector (Resumable)")
    print("NCI MSc Cybersecurity - Indirect Prompt Injection Research")
    print(f"Started: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)
    # Load what we already have
    data, existing_posts = load_existing()
    cursor, known_ids = load_cursor()
    # If no cursor file but we have posts, rebuild known_ids from existing posts
    if not known_ids and existing_posts:
        known_ids = {p.get("id") for p in existing_posts if p.get("id")}
        print(f" Rebuilt {len(known_ids):,} known IDs from existing posts.")
    posts_by_id = {p.get("id"): p for p in existing_posts}
    page = 0
    new_count = 0
    print(f"\n=== Collecting posts (starting from {'cursor' if cursor else 'beginning'}) ===")
    while True:
        page += 1
        params = {"limit": PAGE_LIMIT}
        if cursor:
            params["cursor"] = cursor
        print(f" Page {page} | cursor={str(cursor)[:30] if cursor else 'None'} | total so far: {len(posts_by_id):,}", flush=True)
        resp = api_get("/api/v1/posts", params=params)
        time.sleep(PAGE_DELAY)
        if resp is None:
            print(" [WARN] Null response — stopping. Run again to resume.", flush=True)
            break
        # Parse response shape
        if isinstance(resp, list):
            batch = resp
            has_more = False
            cursor = None
        else:
            batch = resp.get("posts") or resp.get("data") or resp.get("results") or []
            has_more = resp.get("has_more", False)
            cursor = resp.get("next_cursor") or resp.get("cursor") or None
        # Merge — skip posts we already have
        added = 0
        for post in batch:
            pid = post.get("id")
            if pid and pid not in known_ids:
                posts_by_id[pid] = post
                known_ids.add(pid)
                added += 1
        new_count += added
        print(f" Got {len(batch)} | {added} new | {len(posts_by_id):,} total", flush=True)
        # Save cursor after every page so we can resume
        save_cursor(cursor, known_ids)
        # Save posts every 10 pages
        if page % 10 == 0:
            save_posts(data, list(posts_by_id.values()))
        if not batch or not has_more or not cursor:
            print(" End of pages — all posts collected!", flush=True)
            break
    # Final save
    all_posts = list(posts_by_id.values())
    save_posts(data, all_posts)
    # Clear cursor only if we reached the end naturally
    if not cursor:
        if CURSOR_FILE.exists():
            CURSOR_FILE.unlink()
        print(" Cursor cleared — full collection complete.")
    print("\n" + "=" * 60)
    print("DONE")
    print(f" Total posts: {len(all_posts):,}")
    print(f" New this run: {new_count:,}")
    print(f" File: {POSTS_FILE} ({POSTS_FILE.stat().st_size // 1024:,} KB)")
    print("=" * 60)
    print("\nNext step: python3 collect_comments.py")

if __name__ == "__main__":
    main()