#!/usr/bin/env python3 """ MT MMSI Lookup — Get MMSI for vessels in mt_bulk_staging by SHIP_ID Two approaches: 1. quicksearch_shipid: filter reports by SHIP_ID to get vessel with MMSI 2. /search/searchAsset?what=vessel&term=NAME: search by name to get MMSI Then for ownership: navigate to /en/ais/details/ships/shipid:ID/mmsi:MMSI/imo:IMO/vessel:NAME#ownership and capture API response with owner data. Usage: python mt_mmsi_lookup.py --probe # test endpoint approaches on a few vessels python mt_mmsi_lookup.py --limit N # process N vessels (start with bulk GT=6) python mt_mmsi_lookup.py # process all vessels without MMSI """ import asyncio, json, sys, os, time, re, struct, hmac, hashlib, base64, argparse import psycopg2 os.chdir(os.path.dirname(os.path.abspath(__file__))) if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') EMAIL = "operation@mrlogisticcorp.com" PASSWORD = "NKh9i8Z!7fU9jfi" TOTP_SECRET = "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH" DB_URL = 'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db' DELAY = 0.8 # seconds between lookups CKPT_FILE = 'mt_mmsi_checkpoint.json' BATCH = 200 # commit every N vessels # Columns for reports endpoint COLUMNS = 'flag,shipname,imo,ship_type,lat_of_latest_position,lon_of_latest_position,speed' def totp(secret): s = secret.upper().replace(' ', '') pad = (-len(s)) % 8 key = base64.b32decode(s + '=' * pad) counter = int(time.time()) // 30 msg = struct.pack('>Q', counter) h = hmac.new(key, msg, hashlib.sha1).digest() offset = h[-1] & 0x0f code = struct.unpack('>I', h[offset:offset + 4])[0] & 0x7fffffff return str(code % 1000000).zfill(6) async def do_login(page): print("Login to MT Pro...") await page.goto('https://www.marinetraffic.com/en/users/login', wait_until='domcontentloaded', timeout=30000) await asyncio.sleep(3) await page.fill('input[name="username"]', EMAIL) await page.click('button[type="submit"]') await asyncio.sleep(3) await page.fill('input[type="password"]', PASSWORD) await page.click('button[type="submit"]') await asyncio.sleep(4) if 'mfa-login-options' in page.url or 'mfa' in page.url.lower(): print(" 2FA: Google Authenticator...") try: await page.click('button:has-text("Google Authenticator")', timeout=5000) except Exception: pass await asyncio.sleep(2) otp = totp(TOTP_SECRET) print(f" TOTP: {otp}") await page.fill('input[name="code"]', otp) await page.click('button[type="submit"]') await asyncio.sleep(5) logged_in = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url print(f" Logged in: {logged_in} URL: {page.url}") return logged_in async def lookup_mmsi_by_shipid(page, ship_id): """ Filter reports endpoint by SHIP_ID using quicksearch_shipid param. Returns {mmsi, imo, name, ...} or None. """ url = (f'https://www.marinetraffic.com/en/reports/?asset_type=vessels' f'&columns={COLUMNS}&quicksearch_shipid={ship_id}') js = f""" async () => {{ try {{ const resp = await fetch({json.dumps(url)}, {{ credentials: 'include', headers: {{ 'X-Requested-With': 'XMLHttpRequest', 'Accept': 'application/json', 'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels', }} }}); const text = await resp.text(); const parsed = JSON.parse(text); return {{ status: resp.status, rows: (parsed.data || []).length, data: (parsed.data || []).slice(0, 5), }}; }} catch(e) {{ return {{status: 0, error: e.message}}; }} }} """ try: result = await page.evaluate(js) return result except Exception as e: return {'error': str(e)} async def lookup_mmsi_by_name(page, name): """ Use MT search autocomplete endpoint to find vessel by name. Returns list of matching vessels with MMSI. """ encoded_name = name[:30].replace('"', '').replace("'", '') url = f'https://www.marinetraffic.com/search/searchAsset?what=vessel&term={encoded_name}' js = f""" async () => {{ try {{ const resp = await fetch({json.dumps(url)}, {{ credentials: 'include', headers: {{ 'X-Requested-With': 'XMLHttpRequest', 'Accept': 'application/json', 'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels', }} }}); const text = await resp.text(); const parsed = JSON.parse(text); return {{status: resp.status, data: parsed}}; }} catch(e) {{ return {{status: 0, error: e.message}}; }} }} """ try: result = await page.evaluate(js) return result except Exception as e: return {'error': str(e)} async def get_ownership_page(page, ship_id, mmsi, imo, name): """ Navigate to vessel #ownership page and capture API response. Returns ownership dict or None. """ # Build URL with all identifiers name_slug = re.sub(r'[^A-Z0-9]', '_', name.upper())[:30] imo_part = f'/imo:{imo}' if imo else '' url = (f'https://www.marinetraffic.com/en/ais/details/ships' f'/shipid:{ship_id}/mmsi:{mmsi}{imo_part}/vessel:{name_slug}') ownership_data = {} captured = [] async def capture_ownership(response): r_url = response.url # Look for ownership API call if 'ownership' in r_url.lower() or 'company' in r_url.lower(): try: body = await response.body() text = body.decode('utf-8', errors='replace') if text.startswith('{') or text.startswith('['): captured.append({'url': r_url, 'body': text}) except Exception: pass # Also capture any vessel info API calls elif 'getData' in r_url and not any(x in r_url for x in ['.js', '.css']): try: body = await response.body() text = body.decode('utf-8', errors='replace') if 'MMSI' in text or 'owner' in text.lower(): captured.append({'url': r_url, 'body': text[:3000]}) except Exception: pass page.on('response', capture_ownership) try: await page.goto(url, wait_until='load', timeout=25000) await asyncio.sleep(5) except Exception as e: print(f" goto error: {e}") page.remove_listener('response', capture_ownership) # Also try via page.evaluate on ownership endpoint if not captured: js_ownership = f""" async () => {{ const urls = [ '/getData/get_vessel_info/shipid:{ship_id}/mmsi:{mmsi}', '/en/getData/get_vessel_ownership/mmsi:{mmsi}', '/getData/get_vessel_ownership/mmsi:{mmsi}', ]; const results = []; for (const u of urls) {{ try {{ const r = await fetch(u, {{ credentials: 'include', headers: {{ 'X-Requested-With': 'XMLHttpRequest', 'Accept': 'application/json', }} }}); const text = await r.text(); results.push({{url: u, status: r.status, body: text.substring(0, 2000)}}); }} catch(e) {{ results.push({{url: u, error: e.message}}); }} }} return results; }} """ try: ownership_results = await page.evaluate(js_ownership) for r in ownership_results: if r.get('status') == 200 and r.get('body', '').startswith('{'): captured.append(r) except Exception: pass # Parse captured responses for cap in captured: text = cap.get('body', '') try: data = json.loads(text) except Exception: continue # Extract ownership fields for k in ['BENEFICIAL_OWNER', 'beneficial_owner', 'manager']: if data.get(k): ownership_data['owner'] = str(data[k]) break for k in ['OPERATOR', 'operator', 'charterer']: if data.get(k): ownership_data['operator'] = str(data[k]) break # Try nested rows = data.get('data', {}).get('rows', []) or data.get('rows', []) if rows and isinstance(rows, list): row = rows[0] for k in ['MANAGER', 'BENEFICIAL_OWNER', 'owner']: if row.get(k): ownership_data['owner'] = str(row[k]) break for k in ['OPERATOR', 'operator']: if row.get(k): ownership_data['operator'] = str(row[k]) break # Also check current page URL - if MT loaded vessel page, URL contains mmsi current_url = page.url if 'mmsi:' in current_url and mmsi not in current_url: # Extract MMSI from URL (for cross-check) m = re.search(r'mmsi:(\d+)', current_url) if m: ownership_data['mmsi_from_url'] = m.group(1) return ownership_data if ownership_data else None async def main(): parser = argparse.ArgumentParser() parser.add_argument('--probe', action='store_true', help='Test approaches on sample vessels') parser.add_argument('--limit', type=int, default=0, help='Max vessels to process') parser.add_argument('--gt6only', action='store_true', help='Only bulk carriers (GT=6)') parser.add_argument('--ownership', action='store_true', help='Also scrape ownership pages') args = parser.parse_args() conn = psycopg2.connect(DB_URL) cur = conn.cursor() from playwright.async_api import async_playwright async with async_playwright() as p: browser = await p.chromium.launch( headless=False, args=['--no-sandbox', '--disable-blink-features=AutomationControlled'] ) context = await browser.new_context( viewport={'width': 1440, 'height': 900}, user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', ) page = await context.new_page() # Login logged_in = await do_login(page) if not logged_in: print("ERROR: Login failed!") await browser.close() conn.close() return await asyncio.sleep(5) # Warm up session on data page await page.goto('https://www.marinetraffic.com/en/data/?asset_type=vessels', wait_until='load', timeout=40000) await asyncio.sleep(5) print(f"Session warmed up: {page.url}") # ---- PROBE MODE ---- if args.probe: print("\n=== PROBE: Testing MMSI lookup approaches ===") # Get a few test vessels cur.execute(""" SELECT ship_id, name, gt_shiptype, dwt, flag FROM mt_bulk_staging WHERE mmsi IS NULL AND lat IS NOT NULL ORDER BY gt_shiptype, dwt DESC NULLS LAST LIMIT 10 """) test_vessels = cur.fetchall() print(f"Test vessels: {len(test_vessels)}") for ship_id, name, gt, dwt, flag in test_vessels: print(f"\n [{gt}] {name} (SHIP_ID={ship_id}, DWT={dwt}, {flag})") # Approach 1: quicksearch_shipid r1 = await lookup_mmsi_by_shipid(page, ship_id) rows = r1.get('data', []) if rows: row = rows[0] print(f" quicksearch_shipid: MMSI={row.get('MMSI')} IMO={row.get('IMO')} name={row.get('SHIPNAME')}") print(f" Full row: {json.dumps(row)[:300]}") else: print(f" quicksearch_shipid: {r1.get('rows',0)} rows, error={r1.get('error','')}") await asyncio.sleep(0.5) # Approach 2: name search if name: r2 = await lookup_mmsi_by_name(page, name) search_data = r2.get('data', []) if isinstance(search_data, list) and search_data: print(f" name search: {len(search_data)} results") print(f" First result: {json.dumps(search_data[0])[:300]}") elif isinstance(search_data, dict): print(f" name search dict: {json.dumps(search_data)[:300]}") else: print(f" name search: no results, error={r2.get('error','')}") await asyncio.sleep(0.5) await browser.close() conn.close() return # ---- MAIN SCRAPE ---- # Load checkpoint processed_ids = set() if os.path.exists(CKPT_FILE): with open(CKPT_FILE) as f: cp = json.load(f) processed_ids = set(cp.get('processed', [])) print(f"Checkpoint: {len(processed_ids)} already processed") # Load vessels to process query = """ SELECT ship_id, name, gt_shiptype, dwt, flag, lat, lon, mmsi FROM mt_bulk_staging WHERE mmsi IS NULL """ if args.gt6only: query += " AND gt_shiptype = '6'" query += " ORDER BY gt_shiptype, dwt DESC NULLS LAST" if args.limit: query += f" LIMIT {args.limit}" cur.execute(query) vessels = [(r[0], r[1], r[2], r[3], r[4], r[5], r[6]) for r in cur.fetchall() if r[0] not in processed_ids] print(f"Vessels to process: {len(vessels)}") found_mmsi = 0 not_found_mmsi = 0 found_owner = 0 batch_count = 0 total = len(vessels) for i, (ship_id, name, gt, dwt, flag, lat, lon) in enumerate(vessels): print(f"\n[{i+1}/{total}] [{gt}] {name} ({flag}, DWT={dwt})") mmsi = None imo = None # ---- Try quicksearch_shipid ---- r = await lookup_mmsi_by_shipid(page, ship_id) rows = r.get('data', []) if rows: row = rows[0] # Verify it's the same vessel (name match or position match) api_name = (row.get('SHIPNAME') or '').upper() our_name = (name or '').upper() match_score = 0 if api_name and our_name: if api_name == our_name: match_score = 3 elif our_name[:10] in api_name or api_name[:10] in our_name: match_score = 2 else: match_score = 0 if match_score >= 2 or (len(rows) == 1): mmsi = str(row.get('MMSI', '')) or None imo = str(row.get('IMO', '')) or None print(f" quicksearch: MMSI={mmsi} IMO={imo} (match={match_score})") else: print(f" quicksearch: {len(rows)} rows, name mismatch: {api_name} vs {our_name}") # ---- Try name search if no MMSI yet ---- if not mmsi and name: r2 = await lookup_mmsi_by_name(page, name) search_data = r2.get('data', []) if isinstance(search_data, list): # Find best match by name for item in search_data[:5]: item_name = str(item.get('name', item.get('NAME', item.get('title', '')))).upper() if item_name == name.upper() or name.upper()[:15] in item_name: # Extract MMSI for k in ['mmsi', 'MMSI', 'id']: if item.get(k) and str(item[k]).isdigit(): mmsi = str(item[k]) break if mmsi: print(f" name search: MMSI={mmsi} ({item_name})") break if not mmsi: print(f" name search: no match") # ---- Update DB ---- if mmsi: found_mmsi += 1 cur.execute(""" UPDATE mt_bulk_staging SET mmsi=%s, imo=COALESCE(%s, imo), scraped_at=NOW() WHERE ship_id=%s """, (mmsi, imo, ship_id)) # ---- Ownership (if --ownership flag) ---- if args.ownership and imo: print(f" Getting ownership...") own = await get_ownership_page(page, ship_id, mmsi, imo, name or 'VESSEL') if own: found_owner += 1 cur.execute(""" UPDATE mt_bulk_staging SET owner=COALESCE(%s, owner), operator=COALESCE(%s, operator), scraped_at=NOW() WHERE ship_id=%s """, (own.get('owner'), own.get('operator'), ship_id)) print(f" owner={own.get('owner')} op={own.get('operator')}") else: print(f" No ownership data found") await asyncio.sleep(1.0) else: not_found_mmsi += 1 print(f" No MMSI found") processed_ids.add(ship_id) batch_count += 1 # Commit and checkpoint if batch_count >= BATCH: conn.commit() with open(CKPT_FILE, 'w') as f: json.dump({'processed': list(processed_ids)}, f) print(f"\n=== CHECKPOINT {i+1}/{total}: " f"found={found_mmsi}, not_found={not_found_mmsi}, owners={found_owner} ===") batch_count = 0 await asyncio.sleep(DELAY) # Final commit conn.commit() with open(CKPT_FILE, 'w') as f: json.dump({'processed': list(processed_ids)}, f) # Summary cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NOT NULL') total_mmsi = cur.fetchone()[0] cur.execute("SELECT count(*) FROM mt_bulk_staging WHERE gt_shiptype='6' AND mmsi IS NOT NULL") bulk_mmsi = cur.fetchone()[0] print(f"\n=== DONE ===") print(f" Processed: {len(vessels)}") print(f" MMSI found this run: {found_mmsi}") print(f" Not found: {not_found_mmsi}") print(f" Owners found: {found_owner}") print(f" Total with MMSI in DB: {total_mmsi}") print(f" Bulk carriers with MMSI: {bulk_mmsi}") conn.close() await browser.close() print("\nLookup complete!") asyncio.run(main())