#!/usr/bin/env python3
"""
Moltbook Comment Collector (Resumable, Priority-Ordered)
=========================================================

- Skips posts with comment_count == 0 (saves 38% of API calls)
- Fetches highest comment_count posts FIRST (get the gold early)
- Alternates between both API keys every request
- On 429: switch key immediately; if both are rate-limited, wait 10s
- Resumes from the last checkpoint if interrupted
- Saves every 100 posts

MSc Cybersecurity Research - NCI - David Keane IR240474
"""

import urllib.request
import urllib.error
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path

# ── Config ────────────────────────────────────────────────────────────────────
# Set your API keys as environment variables:
#   export MOLTBOOK_API_KEY_1="moltbook_sk_your_key_here"
#   export MOLTBOOK_API_KEY_2="moltbook_sk_your_second_key_here"   # optional
BASE_URL = "https://moltbook.com"

_key1 = os.environ.get("MOLTBOOK_API_KEY_1", "")
_key2 = os.environ.get("MOLTBOOK_API_KEY_2", "")
if not _key1:
    raise SystemExit("Error: MOLTBOOK_API_KEY_1 environment variable not set.")

API_KEYS = {"Account1": _key1}
if _key2:
    API_KEYS["Account2"] = _key2
KEY_NAMES = list(API_KEYS.keys())

RATE_WAIT = 10.0        # seconds to wait when BOTH keys are rate-limited (short retry)
COOLDOWN_WAIT = 300.0   # seconds to wait after 3 consecutive double-failures (5 min reset)
COOLDOWN_AFTER = 3      # number of both-keys-fail cycles before triggering cooldown
REQUEST_DELAY = 1.0     # seconds between successful requests (sustainable overnight)
SAVE_EVERY = 100        # checkpoint every N posts fetched

OUT_DIR = Path(__file__).parent
POSTS_FILE = OUT_DIR / "all_posts_with_comments.json"
PROGRESS_FILE = OUT_DIR / ".comments_progress.json"

# ── API call with dual-key switching + 5-min cooldown ────────────────────────
_key_index = 0          # global alternating-key tracker
_consecutive_fails = 0  # counts how many both-keys-fail cycles in a row
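# Illustrative sketch only (never called by the collector): the wait policy
# that api_get() below implements, reduced to a pure function so the
# thresholds are easy to see at a glance. The function name and 1-based
# `fail_cycle` argument are hypothetical, not part of the collector itself.
def _wait_for_fail_cycle(fail_cycle: int) -> float:
    """Seconds to sleep after `fail_cycle` consecutive both-keys failures."""
    # Cycles 1..COOLDOWN_AFTER-1 get the short wait; the COOLDOWN_AFTER-th
    # cycle triggers the long cooldown. api_get() resets its counter to 0
    # after a cooldown, so in practice fail_cycle stays in 1..COOLDOWN_AFTER.
    return COOLDOWN_WAIT if fail_cycle >= COOLDOWN_AFTER else RATE_WAIT

# e.g. _wait_for_fail_cycle(1) == 10.0, _wait_for_fail_cycle(3) == 300.0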
def api_get(path: str) -> dict | list | None:
    """GET with alternating keys.

    - On 429: switch key immediately
    - Both keys fail → wait RATE_WAIT (10s), increment fail counter
    - 3 consecutive both-fail cycles → COOLDOWN (5 min), then retry same post
    """
    global _key_index, _consecutive_fails
    url = BASE_URL + path
    both_fail_count = 0  # both-keys-failed cycles for THIS request

    while True:
        for _attempt in range(len(KEY_NAMES) * 2):  # try each key twice
            key_name = KEY_NAMES[_key_index % len(KEY_NAMES)]
            key = API_KEYS[key_name]
            req = urllib.request.Request(
                url,
                headers={
                    "Authorization": f"Bearer {key}",
                    "Accept": "application/json",
                    "User-Agent": "MoltbookResearchCollector/1.0 (NCI MSc Cybersecurity)",
                },
            )
            try:
                with urllib.request.urlopen(req, timeout=30) as resp:
                    _key_index += 1
                    _consecutive_fails = 0  # reset on success
                    return json.loads(resp.read().decode("utf-8"))
            except urllib.error.HTTPError as e:
                if e.code == 429:
                    other = KEY_NAMES[(_key_index + 1) % len(KEY_NAMES)]
                    print(f"    [429 {key_name}] → trying {other}...", flush=True)
                    _key_index += 1
                else:
                    # Non-rate-limit HTTP error: give up on this path
                    body = e.read().decode("utf-8", errors="replace")[:150]
                    print(f"    [HTTP {e.code}] {path} → {body}", flush=True)
                    return None
            except Exception as ex:
                # Network/timeout/JSON error: give up on this path
                print(f"    [ERROR] {path} → {ex}", flush=True)
                return None

        # All key attempts exhausted for this round
        both_fail_count += 1
        _consecutive_fails += 1
        if _consecutive_fails >= COOLDOWN_AFTER:
            # Deep rate limit: full 5-minute cooldown, then retry the same post
            now_s = time.strftime("%H:%M:%S")
            wake_s = time.strftime("%H:%M:%S", time.localtime(time.time() + COOLDOWN_WAIT))
            print("\n  ╔══════════════════════════════════════════╗", flush=True)
            print("  ║  DEEP RATE LIMIT - cooling down 5 min    ║", flush=True)
            print(f"  ║  Now: {now_s}  |  Resume: {wake_s}      ║", flush=True)
            print("  ╚══════════════════════════════════════════╝\n", flush=True)
            time.sleep(COOLDOWN_WAIT)
            _consecutive_fails = 0  # reset after cooldown
            both_fail_count = 0
        else:
            # Short wait before the next round of key attempts
            print(f"    [both keys rate-limited] waiting {RATE_WAIT:.0f}s "
                  f"(cycle {both_fail_count}, cooldown in "
                  f"{COOLDOWN_AFTER - _consecutive_fails} more)...", flush=True)
            time.sleep(RATE_WAIT)
        # Fall through: loop again and retry the same request


# ── Progress tracking ─────────────────────────────────────────────────────────
def load_progress() -> set:
    """Return the set of post IDs already fetched."""
    if PROGRESS_FILE.exists():
        try:
            p = json.loads(PROGRESS_FILE.read_text())
            done = set(p.get("done_ids", []))
            print(f"  Resuming: {len(done)} posts already fetched.")
            return done
        except Exception:
            pass  # corrupt or partial checkpoint: start fresh
    return set()


def save_progress(done_ids: set):
    PROGRESS_FILE.write_text(json.dumps({
        "done_ids": list(done_ids),
        "count": len(done_ids),
        "updated": datetime.now(timezone.utc).isoformat(),
    }))
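# Checkpoint round trip, for reference (the IDs below are illustrative; real
# IDs come from the posts file):
#
#   save_progress({"abc123", "def456"})
#   done = load_progress()   # -> {"abc123", "def456"}
#
# The file itself is plain JSON, e.g.:
#
#   {"done_ids": ["abc123", "def456"], "count": 2,
#    "updated": "2025-01-01T00:00:00+00:00"}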
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    print("=" * 60)
    print("Moltbook Comment Collector (Priority + Resumable)")
    print(f"Started: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    # Load posts
    print(f"\nLoading {POSTS_FILE.name}...")
    with open(POSTS_FILE, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        data = {"posts": data}  # normalize a bare list so we can attach metadata later
    posts_raw = data.get("posts", [])
    print(f"  Loaded {len(posts_raw):,} posts total.")

    # Filter: only posts with replies, sorted highest first
    posts_with_replies = [p for p in posts_raw if (p.get("comment_count") or 0) > 0]
    posts_with_replies.sort(key=lambda p: p.get("comment_count", 0), reverse=True)
    skipped = len(posts_raw) - len(posts_with_replies)
    print(f"  Skipping {skipped:,} posts with 0 replies.")
    print(f"  Fetching replies for {len(posts_with_replies):,} posts.")
    print(f"  Top 5 comment counts: {[p.get('comment_count') for p in posts_with_replies[:5]]}")

    # Resume state
    done_ids = load_progress()
    todo = [p for p in posts_with_replies if p.get("id") not in done_ids]
    total_todo = len(todo)
    print(f"\n  Remaining: {total_todo:,} posts to fetch\n")

    # Build a lookup so we can update posts in-place
    post_by_id = {p.get("id"): p for p in posts_raw}
    total_comments = sum(
        len(p.get("comments", [])) for p in posts_raw if p.get("comments")
    )

    fetched_this_run = 0
    for i, post in enumerate(todo):
        pid = post.get("id")
        expected = post.get("comment_count", "?")
        title_preview = (post.get("title") or post.get("content") or "")[:40]

        resp = api_get(f"/api/v1/posts/{pid}/comments?limit=100")
        time.sleep(REQUEST_DELAY)

        if resp is None:
            comments = []  # hard error: record as empty so the post isn't retried forever
        elif isinstance(resp, list):
            comments = resp
        else:
            # Envelope shape varies; check the likely keys in order
            comments = (
                resp.get("comments")
                or resp.get("data")
                or resp.get("results")
                or []
            )

        # Update the post in our data
        post_by_id[pid]["comments"] = comments
        total_comments += len(comments)
        done_ids.add(pid)
        fetched_this_run += 1

        # Progress line
        pct = (i + 1) / total_todo * 100
        print(
            f"  [{i+1:5d}/{total_todo}] {pct:5.1f}% | "
            f"got {len(comments):3d}/{expected} replies | "
            f"total: {total_comments:,} | {title_preview}",
            flush=True,
        )

        # Checkpoint
        if fetched_this_run % SAVE_EVERY == 0:
            save_progress(done_ids)
            data["posts"] = list(post_by_id.values())
            data["total_comments"] = total_comments
            data["last_updated"] = datetime.now(timezone.utc).isoformat()
            with open(POSTS_FILE, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            kb = POSTS_FILE.stat().st_size // 1024
            print(f"  ── checkpoint saved ({kb:,} KB) ──", flush=True)

    # Final save
    save_progress(done_ids)
    data["posts"] = list(post_by_id.values())
    data["total_comments"] = total_comments
    data["last_updated"] = datetime.now(timezone.utc).isoformat()
    with open(POSTS_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("DONE")
    print(f"  Posts with replies fetched: {len(done_ids):,}")
    print(f"  Total comments collected:   {total_comments:,}")
    print(f"  File: {POSTS_FILE} ({POSTS_FILE.stat().st_size // 1024:,} KB)")
    print("=" * 60)

    if PROGRESS_FILE.exists():
        PROGRESS_FILE.unlink()  # run complete: the checkpoint is no longer needed


if __name__ == "__main__":
    main()
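# Typical overnight invocation (shell sketch; the filename is assumed, use
# whatever this script is saved as):
#
#   export MOLTBOOK_API_KEY_1="moltbook_sk_..."
#   nohup python3 collect_comments.py > comments.log 2>&1 &
#   tail -f comments.log
#
# Interrupting and re-running resumes from .comments_progress.json; the
# checkpoint file is deleted automatically once a run completes.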