#!/usr/bin/env python3
"""
Moltbook Comment Collector (Resumable, Priority-Ordered)
=========================================================
- Skips posts with comment_count == 0 (saves 38% of API calls)
- Fetches highest comment_count posts FIRST (get the gold early)
- Alternates between both API keys every request
- On 429: switch key immediately, if both rate-limited wait 10s
- Resumes from last checkpoint if interrupted
- Saves every 100 posts
MSc Cybersecurity Research - NCI - David Keane IR240474
"""
import urllib.request
import urllib.error
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
# ── Config ────────────────────────────────────────────────────────────────────
# Set your API keys as environment variables:
# export MOLTBOOK_API_KEY_1="moltbook_sk_your_key_here"
# export MOLTBOOK_API_KEY_2="moltbook_sk_your_second_key_here" # optional
BASE_URL = "https://moltbook.com"
_key1 = os.environ.get("MOLTBOOK_API_KEY_1", "")
_key2 = os.environ.get("MOLTBOOK_API_KEY_2", "")
if not _key1:
    raise SystemExit("Error: MOLTBOOK_API_KEY_1 environment variable not set.")
API_KEYS = {"Account1": _key1}
if _key2:
API_KEYS["Account2"] = _key2
KEY_NAMES = list(API_KEYS.keys())
RATE_WAIT = 10.0 # seconds to wait when BOTH keys are rate-limited (short retry)
COOLDOWN_WAIT = 300.0 # seconds to wait after 3 consecutive double-failures (5 min reset)
COOLDOWN_AFTER = 3 # number of both-keys-fail cycles before triggering cooldown
REQUEST_DELAY = 1.0 # seconds between successful requests (sustainable overnight)
SAVE_EVERY = 100 # checkpoint every N posts fetched
OUT_DIR = Path(__file__).parent
POSTS_FILE = OUT_DIR / "all_posts_with_comments.json"
PROGRESS_FILE = OUT_DIR / ".comments_progress.json"
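
# Expected input shape: all_posts_with_comments.json may be either a bare JSON
# list of post objects or a {"posts": [...]} wrapper (main() accepts both).
# Each post should carry at least "id" and "comment_count"; the values below
# are illustrative, not real data:
#
#   {"posts": [{"id": "abc123", "comment_count": 57, "title": "...", ...}]}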
# ── API call with dual-key switching + 5-min cooldown ────────────────────────
_key_index = 0 # global alternating key tracker
_consecutive_fails = 0 # counts how many both-keys-fail cycles in a row
def api_get(path: str) -> dict | list | None:
"""GET with alternating keys.
- On 429: switch key immediately
- Both keys fail → wait RATE_WAIT (10s), increment fail counter
- 3 consecutive both-fail cycles → COOLDOWN (5 min), then retry same post
"""
global _key_index, _consecutive_fails
url = BASE_URL + path
both_fail_count = 0 # both-keys-failed cycles for THIS request
while True:
for attempt in range(len(KEY_NAMES) * 2): # try each key twice
key_name = KEY_NAMES[_key_index % len(KEY_NAMES)]
key = API_KEYS[key_name]
req = urllib.request.Request(
url,
headers={
"Authorization": f"Bearer {key}",
"Accept": "application/json",
"User-Agent": "MoltbookResearchCollector/1.0 (NCI MSc Cybersecurity)",
}
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
_key_index += 1
_consecutive_fails = 0 # reset on success
return json.loads(resp.read().decode("utf-8"))
except urllib.error.HTTPError as e:
if e.code == 429:
other = KEY_NAMES[(_key_index + 1) % len(KEY_NAMES)]
print(f" [429 {key_name}] → trying {other}...", flush=True)
_key_index += 1
else:
body = e.read().decode("utf-8", errors="replace")[:150]
print(f" [HTTP {e.code}] {path}{body}", flush=True)
return None
except Exception as ex:
print(f" [ERROR] {path}{ex}", flush=True)
return None
# All key attempts exhausted for this round
both_fail_count += 1
_consecutive_fails += 1
if _consecutive_fails >= COOLDOWN_AFTER:
# Deep rate-limit — full 5-minute cooldown
resume = datetime.now(timezone.utc).strftime('%H:%M:%S')
wake = time.strftime('%H:%M:%S', time.localtime(time.time() + COOLDOWN_WAIT))
print(f"\n ╔══════════════════════════════════════════╗", flush=True)
print(f" ║ DEEP RATE LIMIT — cooling down 5 min ║", flush=True)
print(f" ║ Now: {resume} | Resume: {wake}", flush=True)
print(f" ╚══════════════════════════════════════════╝\n", flush=True)
time.sleep(COOLDOWN_WAIT)
_consecutive_fails = 0 # reset after cooldown
both_fail_count = 0
else:
# Short wait before next round
print(f" [both keys rate-limited] waiting {RATE_WAIT:.0f}s "
f"(cycle {both_fail_count}, cooldown in {COOLDOWN_AFTER - _consecutive_fails} more)...",
flush=True)
time.sleep(RATE_WAIT)
return None
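
# Worked example of the backoff ladder above, assuming both keys are set:
# a 429 on Account1 switches to Account2 immediately; once each key has been
# tried twice in a round, the function sleeps RATE_WAIT (10 s). Three all-fail
# rounds in a row trigger the COOLDOWN_WAIT (5 min) banner, after which the
# same request is retried with fresh counters.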
# ── Progress tracking ─────────────────────────────────────────────────────────
def load_progress() -> set:
"""Return set of post IDs already fetched."""
if PROGRESS_FILE.exists():
try:
p = json.loads(PROGRESS_FILE.read_text())
done = set(p.get("done_ids", []))
print(f" Resuming — {len(done)} posts already fetched.")
return done
except Exception:
pass
return set()
def save_progress(done_ids: set):
    PROGRESS_FILE.write_text(json.dumps({
        "done_ids": list(done_ids),
        "count": len(done_ids),
        "updated": datetime.now(timezone.utc).isoformat(),
    }))
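
# The checkpoint written above is a small JSON object (IDs illustrative):
#   {"done_ids": ["abc123", "def456"], "count": 2,
#    "updated": "2026-02-07T03:15:00+00:00"}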
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
print("=" * 60)
print("Moltbook Comment Collector (Priority + Resumable)")
print(f"Started: {datetime.now(timezone.utc).isoformat()}")
print("=" * 60)
# Load posts
print(f"\nLoading {POSTS_FILE.name}...")
with open(POSTS_FILE, encoding="utf-8") as f:
data = json.load(f)
posts_raw = data.get("posts", []) if isinstance(data, dict) else data
print(f" Loaded {len(posts_raw):,} posts total.")
# Filter: only posts with replies, sorted highest first
posts_with_replies = [
p for p in posts_raw
if (p.get("comment_count") or 0) > 0
]
posts_with_replies.sort(key=lambda p: p.get("comment_count", 0), reverse=True)
skipped = len(posts_raw) - len(posts_with_replies)
print(f" Skipping {skipped:,} posts with 0 replies.")
print(f" Fetching replies for {len(posts_with_replies):,} posts.")
print(f" Top 5: {[p.get('comment_count') for p in posts_with_replies[:5]]}")
# Resume state
done_ids = load_progress()
todo = [p for p in posts_with_replies if p.get("id") not in done_ids]
total_todo = len(todo)
print(f"\n Remaining: {total_todo:,} posts to fetch\n")
# Build a lookup so we can update posts in-place
post_by_id = {p.get("id"): p for p in posts_raw}
total_comments = sum(
len(p.get("comments", [])) for p in posts_raw if p.get("comments")
)
    fetched_this_run = 0
    for i, post in enumerate(todo):
        pid = post.get("id")
        expected = post.get("comment_count", "?")
        title_preview = (post.get("title") or post.get("content") or "")[:40]

        resp = api_get(f"/api/v1/posts/{pid}/comments?limit=100")
        time.sleep(REQUEST_DELAY)

        if resp is None:
            # Hard failure (non-429 error): record no comments but still mark
            # the post done so one bad post can't stall the whole run.
            comments = []
        elif isinstance(resp, list):
            comments = resp
        else:
            # Tolerate common envelope keys around the comment array
            comments = (
                resp.get("comments") or
                resp.get("data") or
                resp.get("results") or []
            )

        # Update the post in our data
        post_by_id[pid]["comments"] = comments
        total_comments += len(comments)
        done_ids.add(pid)
        fetched_this_run += 1

        # Progress line
        pct = (i + 1) / total_todo * 100
        print(
            f" [{i+1:5d}/{total_todo}] {pct:5.1f}% | "
            f"got {len(comments):3d}/{expected} replies | "
            f"total: {total_comments:,} | {title_preview}",
            flush=True
        )
        # Checkpoint
        if fetched_this_run % SAVE_EVERY == 0:
            save_progress(done_ids)
            data["posts"] = list(post_by_id.values())
            data["total_comments"] = total_comments
            data["last_updated"] = datetime.now(timezone.utc).isoformat()
            with open(POSTS_FILE, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            kb = POSTS_FILE.stat().st_size // 1024
            print(f" ── checkpoint saved ({kb:,} KB) ──", flush=True)
    # Final save
    save_progress(done_ids)
    data["posts"] = list(post_by_id.values())
    data["total_comments"] = total_comments
    data["last_updated"] = datetime.now(timezone.utc).isoformat()
    with open(POSTS_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("DONE")
    print(f" Posts with replies fetched: {len(done_ids):,}")
    print(f" Total comments collected: {total_comments:,}")
    print(f" File: {POSTS_FILE} ({POSTS_FILE.stat().st_size // 1024:,} KB)")
    print("=" * 60)

    # Run completed cleanly, so the resume checkpoint is no longer needed
    if PROGRESS_FILE.exists():
        PROGRESS_FILE.unlink()
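
# For reference, each progress line printed by the loop above looks like
# (values illustrative):
#   [  123/4567]   2.7% | got  42/57 replies | total: 9,876 | Some post title...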
if __name__ == "__main__":
    main()