#!/usr/bin/env python3
"""
Moltbook Comment Collector (Resumable, Priority-Ordered)
=========================================================
- Skips posts with comment_count == 0 (saves 38% of API calls)
- Fetches highest comment_count posts FIRST (get the gold early)
- Alternates between both API keys every request
- On 429: switch key immediately; if both rate-limited, wait 10s
- Resumes from last checkpoint if interrupted
- Saves every 100 posts

MSc Cybersecurity Research - NCI - David Keane IR240474
"""

import urllib.request
import urllib.error
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path

# ── Config ────────────────────────────────────────────────────────────────────
# Set your API keys as environment variables:
#   export MOLTBOOK_API_KEY_1="moltbook_sk_your_key_here"
#   export MOLTBOOK_API_KEY_2="moltbook_sk_your_second_key_here"   # optional

BASE_URL = "https://moltbook.com"
_key1 = os.environ.get("MOLTBOOK_API_KEY_1", "")
_key2 = os.environ.get("MOLTBOOK_API_KEY_2", "")
if not _key1:
    raise SystemExit("Error: MOLTBOOK_API_KEY_1 environment variable not set.")
API_KEYS = {"Account1": _key1}
if _key2:
    API_KEYS["Account2"] = _key2
KEY_NAMES = list(API_KEYS.keys())

RATE_WAIT = 10.0        # seconds to wait when BOTH keys are rate-limited (short retry)
COOLDOWN_WAIT = 300.0   # seconds to wait after 3 consecutive double-failures (5 min reset)
COOLDOWN_AFTER = 3      # number of both-keys-fail cycles before triggering cooldown
REQUEST_DELAY = 1.0     # seconds between successful requests (sustainable overnight)
SAVE_EVERY = 100        # checkpoint every N posts fetched

OUT_DIR = Path(__file__).parent
POSTS_FILE = OUT_DIR / "all_posts_with_comments.json"
PROGRESS_FILE = OUT_DIR / ".comments_progress.json"
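
# Back-of-envelope pacing: at REQUEST_DELAY = 1.0 s the collector issues at most
# ~3,600 requests per hour (ignoring response latency and any rate-limit waits),
# which is the "sustainable overnight" budget the delay above is sized for.
# Illustrative arithmetic, not a measured rate.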

# ── API call with dual-key switching + 5-min cooldown ────────────────────────
_key_index = 0          # global alternating key tracker
_consecutive_fails = 0  # counts how many both-keys-fail cycles in a row

def api_get(path: str) -> dict | list | None:
    """GET with alternating keys.
    - On 429: switch key immediately
    - Both keys fail → wait RATE_WAIT (10s), increment fail counter
    - 3 consecutive both-fail cycles → COOLDOWN (5 min), then retry same post
    """
    global _key_index, _consecutive_fails
    url = BASE_URL + path

    both_fail_count = 0  # both-keys-failed cycles for THIS request

    while True:
        for _attempt in range(len(KEY_NAMES) * 2):  # try each key twice per round
            key_name = KEY_NAMES[_key_index % len(KEY_NAMES)]
            key = API_KEYS[key_name]
            req = urllib.request.Request(
                url,
                headers={
                    "Authorization": f"Bearer {key}",
                    "Accept": "application/json",
                    "User-Agent": "MoltbookResearchCollector/1.0 (NCI MSc Cybersecurity)",
                },
            )
            try:
                with urllib.request.urlopen(req, timeout=30) as resp:
                    _key_index += 1
                    _consecutive_fails = 0  # reset on success
                    return json.loads(resp.read().decode("utf-8"))

            except urllib.error.HTTPError as e:
                if e.code == 429:
                    other = KEY_NAMES[(_key_index + 1) % len(KEY_NAMES)]
                    print(f"  [429 {key_name}] → trying {other}...", flush=True)
                    _key_index += 1
                else:
                    body = e.read().decode("utf-8", errors="replace")[:150]
                    print(f"  [HTTP {e.code}] {path} → {body}", flush=True)
                    return None

            except Exception as ex:
                print(f"  [ERROR] {path} → {ex}", flush=True)
                return None

        # All key attempts exhausted for this round
        both_fail_count += 1
        _consecutive_fails += 1

        if _consecutive_fails >= COOLDOWN_AFTER:
            # Deep rate-limit — full 5-minute cooldown (both timestamps local time)
            now = time.strftime("%H:%M:%S")
            wake = time.strftime("%H:%M:%S", time.localtime(time.time() + COOLDOWN_WAIT))
            print("\n  ╔══════════════════════════════════════════╗", flush=True)
            print("  ║ DEEP RATE LIMIT — cooling down 5 min     ║", flush=True)
            print(f"  ║ Now: {now} | Resume: {wake}         ║", flush=True)
            print("  ╚══════════════════════════════════════════╝\n", flush=True)
            time.sleep(COOLDOWN_WAIT)
            _consecutive_fails = 0  # reset after cooldown
            both_fail_count = 0
        else:
            # Short wait, then loop back and retry the same request
            print(f"  [both keys rate-limited] waiting {RATE_WAIT:.0f}s "
                  f"(cycle {both_fail_count}, cooldown in {COOLDOWN_AFTER - _consecutive_fails} more)...",
                  flush=True)
            time.sleep(RATE_WAIT)
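
# Example call (hypothetical post id; the path matches what main() requests):
#   comments = api_get("/api/v1/posts/12345/comments?limit=100")
# Returns parsed JSON (list or dict) on success, None on a non-429 HTTP error.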

# ── Progress tracking ─────────────────────────────────────────────────────────
def load_progress() -> set:
    """Return set of post IDs already fetched."""
    if PROGRESS_FILE.exists():
        try:
            p = json.loads(PROGRESS_FILE.read_text())
            done = set(p.get("done_ids", []))
            print(f"  Resuming — {len(done)} posts already fetched.")
            return done
        except Exception:
            pass
    return set()

def save_progress(done_ids: set):
    """Write the checkpoint file listing every post ID fetched so far."""
    PROGRESS_FILE.write_text(json.dumps({
        "done_ids": list(done_ids),
        "count": len(done_ids),
        "updated": datetime.now(timezone.utc).isoformat(),
    }))
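
# Shape of the checkpoint file written by save_progress() (values illustrative):
#   {"done_ids": ["abc123", "def456"], "count": 2,
#    "updated": "2026-01-31T12:00:00+00:00"}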

# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    print("=" * 60)
    print("Moltbook Comment Collector (Priority + Resumable)")
    print(f"Started: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    # Load posts
    print(f"\nLoading {POSTS_FILE.name}...")
    with open(POSTS_FILE, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        # A bare top-level list would break the checkpoint writes below
        # (data["posts"] = ...), so wrap it in the expected dict shape.
        data = {"posts": data}
    posts_raw = data.get("posts", [])
    print(f"  Loaded {len(posts_raw):,} posts total.")

    # Filter: only posts with replies, sorted highest first
    posts_with_replies = [
        p for p in posts_raw
        if (p.get("comment_count") or 0) > 0
    ]
    posts_with_replies.sort(key=lambda p: p.get("comment_count", 0), reverse=True)

    skipped = len(posts_raw) - len(posts_with_replies)
    print(f"  Skipping {skipped:,} posts with 0 replies.")
    print(f"  Fetching replies for {len(posts_with_replies):,} posts.")
    print(f"  Top 5: {[p.get('comment_count') for p in posts_with_replies[:5]]}")

    # Resume state
    done_ids = load_progress()
    todo = [p for p in posts_with_replies if p.get("id") not in done_ids]
    total_todo = len(todo)
    print(f"\n  Remaining: {total_todo:,} posts to fetch\n")

    # Build a lookup so we can update posts in-place
    post_by_id = {p.get("id"): p for p in posts_raw}

    total_comments = sum(
        len(p.get("comments", [])) for p in posts_raw if p.get("comments")
    )
    fetched_this_run = 0
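
    # Note: post_by_id holds references to the same dicts as posts_raw (and
    # hence data["posts"]), so writing comments onto post_by_id[pid] in the
    # loop below updates the loaded data in place.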

    for i, post in enumerate(todo):
        pid = post.get("id")
        expected = post.get("comment_count", "?")
        title_preview = (post.get("title") or post.get("content") or "")[:40]

        resp = api_get(f"/api/v1/posts/{pid}/comments?limit=100")
        time.sleep(REQUEST_DELAY)

        # Normalize the response: a failed request (None) is recorded as an
        # empty list so the run keeps moving; otherwise accept either a bare
        # list or a wrapped envelope.
        if resp is None:
            comments = []
        elif isinstance(resp, list):
            comments = resp
        else:
            comments = (
                resp.get("comments") or
                resp.get("data") or
                resp.get("results") or []
            )
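
        # Envelope shapes the normalization above accepts (illustrative):
        #   [ {...}, {...} ]         bare list of comments
        #   {"comments": [...]}      wrapped
        #   {"data": [...]}          alternative wrapper
        #   {"results": [...]}       alternative wrapper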

        # Update the post in our data
        post_by_id[pid]["comments"] = comments
        total_comments += len(comments)
        done_ids.add(pid)
        fetched_this_run += 1

        # Progress line
        pct = (i + 1) / total_todo * 100
        print(
            f"  [{i+1:5d}/{total_todo}] {pct:5.1f}% | "
            f"got {len(comments):3d}/{expected} replies | "
            f"total: {total_comments:,} | {title_preview}",
            flush=True
        )

        # Checkpoint
        if fetched_this_run % SAVE_EVERY == 0:
            save_progress(done_ids)
            data["posts"] = list(post_by_id.values())
            data["total_comments"] = total_comments
            data["last_updated"] = datetime.now(timezone.utc).isoformat()
            with open(POSTS_FILE, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            kb = POSTS_FILE.stat().st_size // 1024
            print(f"  ── checkpoint saved ({kb:,} KB) ──", flush=True)
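
        # Note: each checkpoint rewrites the entire JSON file. At SAVE_EVERY =
        # 100 that is one full rewrite per ~100 requests, which is assumed to
        # be cheap next to the 1 s/request pacing.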

    # Final save
    save_progress(done_ids)
    data["posts"] = list(post_by_id.values())
    data["total_comments"] = total_comments
    data["last_updated"] = datetime.now(timezone.utc).isoformat()
    with open(POSTS_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
print("\n" + "=" * 60)
|
|
print(f"DONE")
|
|
print(f" Posts with replies fetched: {len(done_ids):,}")
|
|
print(f" Total comments collected: {total_comments:,}")
|
|
print(f" File: {POSTS_FILE} ({POSTS_FILE.stat().st_size // 1024:,} KB)")
|
|
print("=" * 60)
|
|
|
|
if PROGRESS_FILE.exists():
|
|
PROGRESS_FILE.unlink()
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
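
# Typical run (the script filename is illustrative; use whatever this file is
# saved as). An interrupted run resumes automatically via .comments_progress.json:
#   export MOLTBOOK_API_KEY_1="moltbook_sk_..."
#   export MOLTBOOK_API_KEY_2="moltbook_sk_..."   # optional second key
#   python3 collect_comments.py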