#!/usr/bin/env python3
"""
MT Enrichment — Fetch MMSI, IMO, Flag, LOA, Beam, Draught for all vessels.

Uses FREE MarineTraffic detail API endpoints (no Pro subscription needed):
  /en/vessels/{shipid}/general  → MMSI, IMO, Flag, LOA, Beam, Type, Year Built
  /en/vessels/{shipid}/position → Draught, Speed, Course

Requires Playwright login (headless=False for Cloudflare bypass). Runs LOCALLY on PC.
Speed: ~0.16s/vessel (10x faster than Reports API).

Usage:
  python mt_enrichment.py              # All vessels needing MMSI/IMO
  python mt_enrichment.py --probe      # Test 10 vessels
  python mt_enrichment.py --limit 500  # Process 500 vessels
  python mt_enrichment.py --loop       # Repeat every 12h
  python mt_enrichment.py --reset      # Clear checkpoint, start fresh
"""
import asyncio
import argparse
import base64
import hashlib
import hmac
import json
import os
import struct
import sys
import time

import psycopg2

# Run relative to the script's own directory so the checkpoint file is stable.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Force UTF-8, line-buffered console output (Windows consoles default to cp1252).
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)
if hasattr(sys.stderr, 'reconfigure'):
    sys.stderr.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)

# ---- CONFIG ----
# NOTE(security): credentials belong in the environment / a secrets manager,
# not in source control. Env vars take precedence; the literals are kept only
# as a backward-compatible fallback and should be rotated.
EMAIL = os.environ.get('MT_EMAIL', "operation@mrlogisticcorp.com")
PASSWORD = os.environ.get('MT_PASSWORD', "NKh9i8Z!7fU9jfi")
TOTP_SECRET = os.environ.get('MT_TOTP_SECRET', "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH")

IS_SERVER = sys.platform == 'linux'
DB_URL = os.environ.get('DATABASE_URL') or (
    'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:5432/seafare_db' if IS_SERVER
    else 'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db'
)

DELAY = 0.15            # seconds between API calls (~0.16s/vessel observed)
BATCH = 500             # commit every N vessels (fast enough for larger batches)
CHECKPOINT = 'mt_enrichment_checkpoint.json'
LOOP_INTERVAL_HOURS = 12
MAX_CONSECUTIVE_ERRORS = 20

# Fresh per-cycle stats template (also used to heal old-format checkpoints).
_EMPTY_STATS = {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0,
                'draught': 0, 'no_data': 0, 'errors': 0}


# ---- TOTP ----
def totp(secret):
    """Return the current 6-digit TOTP code (RFC 6238, SHA-1, 30s step)."""
    s = secret.upper().replace(' ', '')
    pad = (-len(s)) % 8  # base32 input must be padded to a multiple of 8
    key = base64.b32decode(s + '=' * pad)
    counter = int(time.time()) // 30
    msg = struct.pack('>Q', counter)
    digest = hmac.new(key, msg, hashlib.sha1).digest()
    offset = digest[-1] & 0x0f  # dynamic truncation (RFC 4226 §5.3)
    code = struct.unpack('>I', digest[offset:offset + 4])[0] & 0x7fffffff
    return str(code % 1000000).zfill(6)


# ---- DB HELPERS ----
def db_connect():
    """Open a psycopg2 connection with TCP keepalives (survives idle tunnels)."""
    return psycopg2.connect(
        DB_URL, connect_timeout=15,
        keepalives=1, keepalives_idle=30, keepalives_interval=10, keepalives_count=5
    )


def _restart_ssh_tunnel():
    """Kill any local ssh.exe and re-open the 15432→5432 tunnel (PC only)."""
    if IS_SERVER:
        return
    import subprocess
    try:
        # Windows-specific: tear down every ssh client so the port frees up.
        subprocess.run(['taskkill', '/F', '/IM', 'ssh.exe'],
                       capture_output=True, timeout=5)
    except Exception:
        pass  # best-effort: ssh.exe may simply not be running
    time.sleep(2)
    try:
        subprocess.Popen(
            ['ssh', '-o', 'ServerAliveInterval=5', '-o', 'ServerAliveCountMax=120',
             '-o', 'TCPKeepAlive=yes', '-o', 'StrictHostKeyChecking=no',
             '-L', '15432:127.0.0.1:5432', '-N', 'root@89.19.208.158'],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        print(" [SSH] Tunnel restarted, waiting 5s...", flush=True)
        time.sleep(5)
    except Exception as e:
        print(f" [SSH] Failed: {e}", flush=True)


def db_reconnect(conn):
    """Close *conn* and retry connecting; restart the SSH tunnel if needed.

    Returns a fresh ``(connection, cursor)`` pair.
    Raises RuntimeError when all attempts are exhausted.
    """
    try:
        conn.close()
    except Exception:
        pass  # connection may already be dead
    # Phase 1: plain reconnect attempts.
    for attempt in range(5):
        try:
            time.sleep(3)
            c = db_connect()
            print(f" [DB] Reconnected (attempt {attempt+1})", flush=True)
            return c, c.cursor()
        except Exception as e:
            print(f" [DB] Attempt {attempt+1} failed: {e}", flush=True)
    # Phase 2: on the PC the usual culprit is a dead SSH tunnel — bounce it.
    if not IS_SERVER:
        print(" [DB] Restarting SSH tunnel...", flush=True)
        _restart_ssh_tunnel()
    for attempt in range(10):
        try:
            time.sleep(5)
            c = db_connect()
            print(f" [DB] Reconnected after tunnel restart (attempt {attempt+1})",
                  flush=True)
            return c, c.cursor()
        except Exception as e:
            print(f" [DB] Post-restart attempt {attempt+1} failed: {e}", flush=True)
            if not IS_SERVER and attempt == 4:
                _restart_ssh_tunnel()  # one more tunnel bounce mid-way
    raise RuntimeError("DB reconnect failed")


def db_safe_execute(conn, cur, query, params=None):
    """Execute *query*, transparently reconnecting on a lost connection.

    Returns the (possibly replaced) ``(conn, cur)`` pair. Other psycopg2
    errors are rolled back and re-raised for the caller to count.
    """
    try:
        cur.execute(query, params)
        return conn, cur
    except (psycopg2.InterfaceError, psycopg2.OperationalError) as e:
        print(f" [DB] Lost on execute ({e}), reconnecting...", flush=True)
        conn, cur = db_reconnect(conn)
        cur.execute(query, params)
        return conn, cur
    except psycopg2.Error:
        try:
            conn.rollback()  # clear the aborted-transaction state
        except Exception:
            pass
        raise


def db_safe_commit(conn):
    """Commit, reconnecting first if the connection dropped. Returns the conn."""
    try:
        conn.commit()
        return conn
    except (psycopg2.InterfaceError, psycopg2.OperationalError):
        print(" [DB] Lost on commit, reconnecting...", flush=True)
        conn, _ = db_reconnect(conn)
        return conn


# ---- CHECKPOINT ----
def load_checkpoint():
    """Load the JSON checkpoint, or return a fresh one if absent/corrupt."""
    if os.path.exists(CHECKPOINT):
        try:
            with open(CHECKPOINT, encoding='utf-8') as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            print(" [WARN] Corrupt checkpoint, starting fresh", flush=True)
    return {
        'processed_ids': [],
        'stats': dict(_EMPTY_STATS),
        'last_run': None,
    }


def save_checkpoint(cp):
    """Stamp *cp* with the current time and persist it to disk."""
    cp['last_run'] = time.strftime('%Y-%m-%dT%H:%M:%S')
    with open(CHECKPOINT, 'w', encoding='utf-8') as f:
        json.dump(cp, f)


# ---- PLAYWRIGHT LOGIN ----
async def do_login(page, max_retries=3):
    """Log into MarineTraffic (email → password → TOTP MFA).

    Returns True on success, False after *max_retries* failed attempts.
    """
    for attempt in range(max_retries):
        print(f"LOGIN (attempt {attempt+1}/{max_retries})...", flush=True)
        try:
            await page.goto('https://www.marinetraffic.com/en/users/login',
                            wait_until='domcontentloaded', timeout=30000)
        except Exception as e:
            print(f" Nav error: {e}", flush=True)
            continue
        await asyncio.sleep(3)
        try:
            # Cookie-consent banner, present only on first visit.
            await page.click('button:has-text("AGREE")', timeout=3000)
        except Exception:
            pass
        try:
            await page.fill('input[name="username"]', EMAIL)
            await page.click('button[type="submit"]')
        except Exception as e:
            print(f" Email error: {e}", flush=True)
            continue
        await asyncio.sleep(3)
        try:
            await page.fill('input[type="password"]', PASSWORD)
            await page.click('button[type="submit"]')
        except Exception as e:
            print(f" Password error: {e}", flush=True)
            continue
        await asyncio.sleep(4)

        if 'mfa' in page.url.lower() or 'auth.kpler' in page.url:
            try:
                await page.click('button:has-text("Google Authenticator")', timeout=3000)
                await asyncio.sleep(2)
            except Exception:
                pass  # selector screen may be skipped when only one MFA method exists
            # Wait for fresh TOTP window: a code about to expire may be rejected
            # by the time the form round-trips.
            elapsed = int(time.time()) % 30
            if 30 - elapsed < 8:
                wait = 30 - elapsed + 2
                print(f" TOTP: waiting {wait}s for fresh window...", flush=True)
                await asyncio.sleep(wait)
            otp = totp(TOTP_SECRET)
            print(f" TOTP: {otp}", flush=True)
            filled = False
            for selector in ['input[name="code"]', 'input[type="tel"]',
                             'input[inputmode="numeric"]']:
                try:
                    await page.fill(selector, otp, timeout=3000)
                    filled = True
                    break
                except Exception:
                    continue
            if not filled:
                # Fallback: probe every visible input for a plausible code field.
                try:
                    inputs = page.locator('input:visible')
                    for i in range(await inputs.count()):
                        inp = inputs.nth(i)
                        inp_type = await inp.get_attribute('type') or 'text'
                        if inp_type in ('text', 'tel', 'number'):
                            await inp.fill(otp)
                            filled = True
                            break
                except Exception:
                    pass
            if filled:
                await page.click('button[type="submit"]')
                await asyncio.sleep(8)

        # Success = landed back on marinetraffic.com, off the auth domain.
        ok = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
        if ok:
            print(" Login OK", flush=True)
            return True
        print(f" Login failed: {page.url[:80]}", flush=True)
    return False


# ---- FREE API FETCH ----
async def _fetch_detail(page, ship_id, endpoint):
    """Fetch /en/vessels/{ship_id}/{endpoint} as JSON from inside the page.

    The fetch runs in the browser context so the logged-in session cookies
    and Cloudflare clearance apply. Returns the parsed JSON on HTTP 200,
    otherwise ``{'error': ...}``.
    """
    js = """async () => {
        try {
            const r = await fetch('https://www.marinetraffic.com/en/vessels/%s/%s', {
                credentials: 'include', cache: 'no-store',
                headers: {'Accept':'application/json','X-Requested-With':'XMLHttpRequest'}
            });
            if (r.status !== 200) return {error: 'HTTP '+r.status};
            return await r.json();
        } catch(e) { return {error: e.message}; }
    }""" % (ship_id, endpoint)
    try:
        return await page.evaluate(js)
    except Exception:
        return {'error': 'evaluate_failed'}


async def fetch_general(page, ship_id):
    """Fetch /en/vessels/{shipid}/general — MMSI, IMO, Flag, LOA, Beam, Type."""
    return await _fetch_detail(page, ship_id, 'general')


async def fetch_position(page, ship_id):
    """Fetch /en/vessels/{shipid}/position — Draught, Speed, Course."""
    return await _fetch_detail(page, ship_id, 'position')


# ---- MAIN ----
async def main():
    """Entry point: connect to DB, log into MT, enrich vessels, loop if asked."""
    parser = argparse.ArgumentParser(description='MT Vessel Enrichment (free endpoints)')
    parser.add_argument('--probe', action='store_true', help='Test on 10 vessels')
    parser.add_argument('--limit', type=int, default=0, help='Max vessels to process')
    parser.add_argument('--delay', type=float, default=DELAY,
                        help='Delay between API calls (s)')
    parser.add_argument('--loop', action='store_true', help='Repeat every 12h')
    parser.add_argument('--interval', type=float, default=LOOP_INTERVAL_HOURS,
                        help='Hours between loops')
    parser.add_argument('--reset', action='store_true',
                        help='Clear checkpoint, start fresh')
    parser.add_argument('--with-draught', action='store_true',
                        help='Also fetch /position for draught')
    args = parser.parse_args()

    # DB connect + quick coverage report.
    try:
        conn = db_connect()
        cur = conn.cursor()
        cur.execute('SELECT count(*) FROM mt_bulk_staging')
        total_db = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NULL')
        no_mmsi = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE imo IS NULL')
        no_imo = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE flag IS NULL')
        no_flag = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE loa IS NULL')
        no_loa = cur.fetchone()[0]
        print(f"DB: {total_db} total | {no_mmsi} no MMSI | {no_imo} no IMO | "
              f"{no_flag} no Flag | {no_loa} no LOA", flush=True)
    except Exception as e:
        print(f"DB ERROR: {e}", flush=True)
        if not IS_SERVER:
            print("Start SSH tunnel: ssh -L 15432:127.0.0.1:5432 -N root@89.19.208.158")
        return

    if args.reset and os.path.exists(CHECKPOINT):
        os.remove(CHECKPOINT)
        print("Checkpoint cleared.", flush=True)

    while True:  # loop mode
        cp = load_checkpoint()
        processed_set = set(cp['processed_ids'])
        # Ensure stats has all needed keys (checkpoint may be from old format).
        for k, v in _EMPTY_STATS.items():
            if k not in cp['stats']:
                cp['stats'][k] = v
        stats = cp['stats']

        # Vessels needing enrichment: no MMSI OR no IMO OR no LOA.
        # Missing-MMSI first, then bulk carriers (gt_shiptype '6') first.
        query = """
            SELECT ship_id, name, gt_shiptype, mmsi, imo, flag, loa, draught
            FROM mt_bulk_staging
            WHERE mmsi IS NULL OR imo IS NULL OR loa IS NULL
            ORDER BY CASE WHEN mmsi IS NULL THEN 0 ELSE 1 END,
                     CASE WHEN gt_shiptype = '6' THEN 0 ELSE 1 END
        """
        if args.limit:
            query += f" LIMIT {args.limit}"  # args.limit is argparse-validated int
        elif args.probe:
            query += " LIMIT 10"
        cur.execute(query)
        all_vessels = cur.fetchall()
        vessels = [row for row in all_vessels if row[0] not in processed_set]
        total = len(vessels)
        skipped = len(all_vessels) - total
        print(f"\nVessels to process: {total} (skipped {skipped} from checkpoint)",
              flush=True)

        if total == 0:
            print("Nothing to do!", flush=True)
            if not args.loop:
                break
            print(f"Sleeping {args.interval}h until next cycle...", flush=True)
            # await (not time.sleep): do not block the event loop for hours.
            await asyncio.sleep(args.interval * 3600)
            cp['processed_ids'] = []
            cp['stats'] = dict(_EMPTY_STATS)
            save_checkpoint(cp)
            continue

        calls_per_vessel = 2 if args.with_draught else 1
        eta_sec = total * (args.delay + 0.16 * calls_per_vessel)
        print(f"Delay: {args.delay}s | ~{0.16*calls_per_vessel:.2f}s/vessel | "
              f"ETA: ~{eta_sec/60:.0f} min", flush=True)

        # Launch Playwright (imported lazily: not needed for --reset/empty runs).
        from playwright.async_api import async_playwright
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=False,  # headed mode required for Cloudflare bypass
                args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
            )
            ctx = await browser.new_context(viewport={'width': 1440, 'height': 900})
            page = await ctx.new_page()
            if not await do_login(page):
                print("LOGIN FAILED", flush=True)
                await browser.close()
                conn.close()
                return

            # Park on a lightweight same-origin page for the fetch() calls.
            await page.goto('https://www.marinetraffic.com/robots.txt',
                            wait_until='load', timeout=15000)
            await asyncio.sleep(2)

            consecutive_errors = 0
            batch_count = 0
            t0 = time.time()

            for i, (ship_id, name, gt, mmsi, imo, flag, loa, draught) in enumerate(vessels):
                # Fetch /general
                gen = await fetch_general(page, ship_id)
                # API sometimes returns a list instead of dict — unwrap or flag.
                if isinstance(gen, list):
                    gen = (gen[0] if len(gen) == 1 and isinstance(gen[0], dict)
                           else {'error': 'unexpected_list'})
                if gen.get('error'):
                    consecutive_errors += 1
                    stats['errors'] += 1
                    if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                        print(f"\n [FATAL] {consecutive_errors} consecutive errors, "
                              f"aborting.", flush=True)
                        break
                    if i < 20 or (i + 1) % 100 == 0:
                        print(f" [{i+1}/{total}] {name} -> ERROR {gen['error']}",
                              flush=True)
                    # Still mark processed so the checkpoint skips it next run.
                    cp['processed_ids'].append(ship_id)
                    stats['no_data'] += 1
                    batch_count += 1
                    await asyncio.sleep(args.delay)
                    continue
                consecutive_errors = 0

                # Build UPDATE from API payload. IMO '0' means "unknown".
                api_mmsi = str(gen.get('mmsi', '')) if gen.get('mmsi') else None
                api_imo = (str(gen.get('imo', ''))
                           if gen.get('imo') and str(gen.get('imo')) != '0' else None)
                api_flag = gen.get('countryCode') or None
                api_loa = gen.get('length') or None
                api_beam = gen.get('width') or None
                api_year = gen.get('yearBuilt') or None
                api_subtype = gen.get('subtype') or None

                updates = []
                params = []
                # MMSI/IMO: only fill when the row has none (identity columns).
                if api_mmsi and not mmsi:
                    updates.append("mmsi = %s")
                    params.append(api_mmsi)
                    stats['mmsi'] += 1
                if api_imo and not imo:
                    updates.append("imo = %s")
                    params.append(api_imo)
                    stats['imo'] += 1
                # Remaining fields: COALESCE keeps any existing DB value.
                if api_flag:
                    updates.append("flag = COALESCE(%s, flag)")
                    params.append(api_flag)
                    if not flag:
                        stats['flag'] += 1
                if api_loa:
                    updates.append("loa = COALESCE(%s, loa)")
                    params.append(api_loa)
                    if not loa:
                        stats['loa'] += 1
                if api_beam:
                    updates.append("beam = COALESCE(%s, beam)")
                    params.append(api_beam)
                if api_year:
                    updates.append("year_built = COALESCE(%s, year_built)")
                    params.append(api_year)
                if api_subtype:
                    updates.append("shiptype = COALESCE(%s, shiptype)")
                    params.append(api_subtype)

                # Fetch /position for draught (optional second call per vessel).
                if args.with_draught and not draught:
                    pos = await fetch_position(page, ship_id)
                    if not pos.get('error'):
                        api_draught = pos.get('draught')
                        if api_draught:
                            updates.append("draught = COALESCE(%s, draught)")
                            params.append(api_draught)
                            stats['draught'] += 1

                if updates:
                    updates.append("scraped_at = NOW()")
                    params.append(ship_id)
                    try:
                        conn, cur = db_safe_execute(
                            conn, cur,
                            f"UPDATE mt_bulk_staging SET {', '.join(updates)} "
                            f"WHERE ship_id = %s",
                            tuple(params))
                    except Exception as e:
                        stats['errors'] += 1
                        print(f" [DB] Update error: {e}", flush=True)

                cp['processed_ids'].append(ship_id)
                batch_count += 1

                # Progress: first 20 vessels, then every 100th.
                if i < 20 or (i + 1) % 100 == 0:
                    print(f" [{i+1}/{total}] {name} -> "
                          f"MMSI={api_mmsi or mmsi or '?'} IMO={api_imo or imo or '?'} "
                          f"Flag={api_flag or flag or '?'} LOA={api_loa or loa or '?'}",
                          flush=True)

                # Batch commit + checkpoint
                if batch_count >= BATCH:
                    conn = db_safe_commit(conn)
                    cur = conn.cursor()
                    save_checkpoint(cp)
                    elapsed = time.time() - t0
                    rate = (i + 1) / elapsed if elapsed > 0 else 0
                    remaining = (total - i - 1) / rate if rate > 0 else 0
                    print(f"\n=== CHECKPOINT [{i+1}/{total}] {elapsed:.0f}s | "
                          f"mmsi={stats['mmsi']} imo={stats['imo']} "
                          f"flag={stats['flag']} loa={stats['loa']} "
                          f"err={stats['errors']} | "
                          f"ETA: {remaining/60:.0f}m ===\n", flush=True)
                    batch_count = 0

                await asyncio.sleep(args.delay)

            # Final commit
            conn = db_safe_commit(conn)
            cur = conn.cursor()
            save_checkpoint(cp)

            # Summary
            cur.execute("SELECT count(*), count(mmsi), count(imo), count(flag), count(loa) "
                        "FROM mt_bulk_staging")
            t, m, im, fl, lo = cur.fetchone()
            cur.execute("SELECT count(*), count(mmsi), count(imo) "
                        "FROM mt_bulk_staging WHERE gt_shiptype='6'")
            tb, mb, ib = cur.fetchone()
            elapsed = time.time() - t0
            print(f"\n{'='*60}", flush=True)
            print(f"DONE in {elapsed/60:.1f} minutes!", flush=True)
            print(f" Processed: {len(cp['processed_ids'])}", flush=True)
            print(f" New MMSI: {stats['mmsi']}", flush=True)
            print(f" New IMO: {stats['imo']}", flush=True)
            print(f" New Flag: {stats['flag']}", flush=True)
            print(f" New LOA: {stats['loa']}", flush=True)
            print(f" Errors: {stats['errors']}", flush=True)
            print(f"\nGLOBAL: total={t} mmsi={m} imo={im} flag={fl} loa={lo}", flush=True)
            print(f"BULK: total={tb} mmsi={mb} imo={ib}", flush=True)
            print(f"{'='*60}", flush=True)
            await browser.close()

        if not args.loop:
            break
        # Loop mode: reset the checkpoint for the next full cycle.
        cp['processed_ids'] = []
        cp['stats'] = dict(_EMPTY_STATS)
        save_checkpoint(cp)
        print(f"\nSleeping {args.interval}h until next cycle...", flush=True)
        # await (not time.sleep): do not block the event loop for hours.
        await asyncio.sleep(args.interval * 3600)

    conn.close()


if __name__ == '__main__':
    asyncio.run(main())