384 lines
14 KiB
Python
384 lines
14 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
MT Final Sweep — Get MMSI + Ownership in ONE query per vessel.
|
||
|
|
|
||
|
|
For all vessels in mt_bulk_staging that still need MMSI or ownership data.
|
||
|
|
Uses quicksearch_shipid with columns: shipname,imo,mmsi,flag,beneficial_owner,operator,registered_owner
|
||
|
|
|
||
|
|
This combines the MMSI lookup + ownership fetch into a single API call per vessel.
|
||
|
|
~0.5s per vessel = ~2,340 vessels in ~20 minutes.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python mt_final_sweep.py # Process all needing data
|
||
|
|
python mt_final_sweep.py --probe # Test on 10 vessels
|
||
|
|
python mt_final_sweep.py --limit 100 # Process 100 vessels
|
||
|
|
"""
|
||
|
|
import asyncio, json, sys, os, time, struct, hmac, hashlib, base64, argparse
|
||
|
|
import psycopg2
|
||
|
|
|
||
|
|
# Run from the script's own directory so relative paths resolve predictably.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# Force UTF-8, line-buffered console streams so vessel names with non-ASCII
# characters never crash a print() (bad characters are replaced, not raised).
# hasattr guard: reconfigure() exists on real text streams (Python 3.7+),
# but not necessarily on redirected/wrapped streams.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)
if hasattr(sys.stderr, 'reconfigure'):
    sys.stderr.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)
|
||
|
|
|
||
|
|
# --- Credentials & connection settings --------------------------------------
# SECURITY NOTE(review): live account credentials, a TOTP seed, and a DB
# password are hard-coded in source. They should be moved to environment
# variables or a secrets store and rotated.
EMAIL = "operation@mrlogisticcorp.com"
PASSWORD = "NKh9i8Z!7fU9jfi"
TOTP_SECRET = "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH"
# Postgres is reached through a local SSH tunnel on port 15432
# (see _restart_ssh_tunnel below).
DB_URL = 'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db'
DELAY = 0.5   # default pause between vessels, seconds (politeness / rate limit)
BATCH = 200   # vessels per DB commit checkpoint

# Combined columns: get MMSI + all ownership fields in one query
COMBINED_COLS = 'shipname,imo,mmsi,flag,beneficial_owner,operator,registered_owner,commercial_manager'
|
||
|
|
|
||
|
|
|
||
|
|
def totp(secret, for_time=None):
    """Return the 6-digit TOTP code (RFC 6238: HMAC-SHA1, 30 s time step).

    Args:
        secret: Base32-encoded shared secret; spaces and lowercase accepted.
        for_time: Optional Unix timestamp to compute the code for. Defaults
            to the current time, which preserves the original behavior;
            the parameter exists so the algorithm is unit-testable against
            the RFC 6238 test vectors.

    Returns:
        The zero-padded 6-character OTP string.
    """
    normalized = secret.upper().replace(' ', '')
    # base64.b32decode requires input padded to a multiple of 8 chars.
    pad = (-len(normalized)) % 8
    key = base64.b32decode(normalized + '=' * pad)
    if for_time is None:
        for_time = time.time()
    counter = int(for_time) // 30
    msg = struct.pack('>Q', counter)
    digest = hmac.new(key, msg, hashlib.sha1).digest()
    # Dynamic truncation per RFC 4226 section 5.3: low nibble of the last
    # byte selects a 4-byte window; mask the sign bit.
    offset = digest[-1] & 0x0F
    code = struct.unpack('>I', digest[offset:offset + 4])[0] & 0x7FFFFFFF
    return str(code % 1000000).zfill(6)
|
||
|
|
|
||
|
|
|
||
|
|
def db_connect():
    """Open a fresh PostgreSQL connection with TCP keepalives enabled.

    Keepalives make the client notice quickly when the SSH tunnel carrying
    the connection dies, instead of hanging on a dead socket.
    """
    keepalive_opts = {
        'keepalives': 1,
        'keepalives_idle': 30,
        'keepalives_interval': 10,
        'keepalives_count': 5,
    }
    return psycopg2.connect(DB_URL, connect_timeout=15, **keepalive_opts)
|
||
|
|
|
||
|
|
|
||
|
|
def _restart_ssh_tunnel():
    """Kill any running ssh.exe and relaunch the local port-forward tunnel.

    Windows-specific (uses taskkill). The tunnel forwards local port 15432
    to Postgres on the remote host; failures are printed, never raised.
    """
    import subprocess
    # Best effort: tear down whatever ssh process may be holding the port.
    try:
        subprocess.run(['taskkill', '/F', '/IM', 'ssh.exe'],
                       capture_output=True, timeout=5)
    except Exception:
        pass
    time.sleep(2)
    tunnel_cmd = [
        'ssh', '-o', 'ServerAliveInterval=5', '-o', 'ServerAliveCountMax=120',
        '-o', 'TCPKeepAlive=yes', '-o', 'StrictHostKeyChecking=no',
        '-L', '15432:127.0.0.1:5432', '-N', 'root@89.19.208.158',
    ]
    try:
        subprocess.Popen(tunnel_cmd,
                         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                         )
    except Exception as e:
        print(f" [SSH] Failed to restart: {e}")
        return
    print(f" [SSH] Tunnel restarted, waiting 5s...")
    time.sleep(5)
|
||
|
|
|
||
|
|
|
||
|
|
def db_reconnect(conn):
    """Close a dead connection and obtain a fresh (connection, cursor) pair.

    Strategy: a few plain reconnect attempts first; if they all fail,
    restart the SSH tunnel and retry for longer. Raises Exception when
    every attempt fails.
    """
    try:
        conn.close()
    except Exception:
        pass

    # Phase 1: plain reconnects, 5 tries with a short pause.
    for attempt in range(5):
        time.sleep(3)
        try:
            fresh = db_connect()
        except Exception as e:
            print(f" [DB] Reconnect attempt {attempt+1} failed: {e}")
            continue
        print(f" [DB] Reconnected (attempt {attempt+1})")
        return fresh, fresh.cursor()

    # Phase 2: assume the tunnel is down; bounce it and keep trying.
    print(f" [DB] Restarting SSH tunnel...")
    _restart_ssh_tunnel()
    for attempt in range(10):
        time.sleep(5)
        try:
            fresh = db_connect()
        except Exception as e:
            print(f" [DB] Post-restart attempt {attempt+1} failed: {e}")
            # Halfway through, kick the tunnel one more time.
            if attempt == 4:
                _restart_ssh_tunnel()
            continue
        print(f" [DB] Reconnected after tunnel restart (attempt {attempt+1})")
        return fresh, fresh.cursor()

    raise Exception("DB reconnect failed")
|
||
|
|
|
||
|
|
|
||
|
|
def db_safe_execute(conn, cur, query, params=None):
    """Execute *query*, transparently reconnecting once if the link is dead.

    Returns the (possibly replaced) (conn, cur) pair; callers must rebind
    both names. A second connection failure propagates to the caller.
    """
    try:
        cur.execute(query, params)
    except (psycopg2.InterfaceError, psycopg2.OperationalError) as e:
        print(f" [DB] Connection lost on execute ({e}), reconnecting...")
        conn, cur = db_reconnect(conn)
        # Retry exactly once on the fresh connection.
        cur.execute(query, params)
    return conn, cur
|
||
|
|
|
||
|
|
|
||
|
|
def db_safe_commit(conn):
    """Commit the current transaction, reconnecting if the link is dead.

    On reconnect the uncommitted batch is lost; callers get back a usable
    connection either way and must rebind it (and open a new cursor).
    """
    try:
        conn.commit()
    except (psycopg2.InterfaceError, psycopg2.OperationalError):
        print(f" [DB] Connection lost on commit, reconnecting...")
        conn, _ = db_reconnect(conn)
    return conn
|
||
|
|
|
||
|
|
|
||
|
|
async def do_login(page, max_retries=2):
    """Log in to MarineTraffic through the web form, handling TOTP 2FA.

    Args:
        page: Playwright page object to drive.
        max_retries: Full login attempts before giving up.

    Returns:
        True when the post-login URL looks like marinetraffic.com and is no
        longer on the auth.kpler SSO host; False after all attempts fail.

    NOTE(review): navigation is paced with fixed sleeps rather than explicit
    waits — presumably tuned empirically; confirm before shortening them.
    """
    for attempt in range(max_retries):
        print(f"LOGIN (attempt {attempt+1}/{max_retries})...")
        await page.goto('https://www.marinetraffic.com/en/users/login',
                        wait_until='domcontentloaded', timeout=30000)
        await asyncio.sleep(3)
        # Two-step form: username screen first, password on the next screen.
        await page.fill('input[name="username"]', EMAIL)
        await page.click('button[type="submit"]')
        await asyncio.sleep(3)
        await page.fill('input[type="password"]', PASSWORD)
        await page.click('button[type="submit"]')
        await asyncio.sleep(4)
        # 2FA challenge is served from the auth.kpler SSO host.
        if 'mfa' in page.url.lower() or 'auth.kpler' in page.url:
            try:
                # If a method chooser appears, pick the authenticator app.
                await page.click('button:has-text("Google Authenticator")', timeout=3000)
                await asyncio.sleep(2)
            except Exception:
                # No chooser shown — already on the code-entry screen.
                pass
            await asyncio.sleep(1)
            otp = totp(TOTP_SECRET)
            print(f" TOTP: {otp}")
            try:
                await page.fill('input[name="code"]', otp)
                await page.click('button[type="submit"]')
            except Exception:
                # Fallback selector when the code field has no name attr.
                try:
                    await page.fill('input[type="text"]', otp)
                    await page.click('button[type="submit"]')
                except Exception as e:
                    print(f" 2FA error: {e}")
            await asyncio.sleep(8)
        # Success heuristic: back on marinetraffic.com, off the SSO host.
        ok = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
        if ok:
            print(f" Login: OK")
            return True
        print(f" Login attempt {attempt+1} failed | {page.url[:80]}")
        if attempt < max_retries - 1:
            await asyncio.sleep(5)
    return False
|
||
|
|
|
||
|
|
|
||
|
|
async def fetch_combined(page, ship_id):
    """Get MMSI + ownership in one API call using quicksearch_shipid.

    Runs a fetch() inside the logged-in browser page so the session cookies
    are sent automatically. Returns the first result row (a dict keyed by
    upper-case column names, e.g. 'MMSI') or None on any failure.
    """
    url = (f'https://www.marinetraffic.com/en/reports/?asset_type=vessels'
           f'&columns={COMBINED_COLS}&quicksearch_shipid={ship_id}')
    # NOTE(review): 'Referer' is a forbidden fetch() header name and is
    # likely ignored by the browser — confirm whether the endpoint needs it.
    js = f"""
    async () => {{
        try {{
            const r = await fetch({json.dumps(url)}, {{
                credentials: 'include', cache: 'no-store',
                headers: {{
                    'X-Requested-With': 'XMLHttpRequest',
                    'Accept': 'application/json',
                    'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels',
                }}
            }});
            if (r.status !== 200) return {{error: 'HTTP ' + r.status}};
            const text = await r.text();
            const d = JSON.parse(text);
            const rows = d.data || [];
            if (rows.length === 0) return {{error: 'no rows'}};
            return rows[0];
        }} catch(e) {{ return {{error: e.message}}; }}
    }}
    """
    try:
        result = await page.evaluate(js)
        # The JS returns either a row object or {'error': ...}; collapse
        # every error case to None so the caller has a single "no data" path.
        if result and not result.get('error'):
            return result
        return None
    except Exception:
        # evaluate() itself failed (navigation, closed page, etc.).
        return None
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
    """Sweep mt_bulk_staging, filling MMSI/ownership one API call per vessel.

    Flow: parse CLI args -> connect to DB and report remaining work ->
    launch a visible Chromium, log in, warm up the data page -> iterate
    vessels needing data, fetch each row, UPDATE the staging table,
    committing every BATCH rows -> print a final summary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--probe', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--delay', type=float, default=DELAY)
    args = parser.parse_args()

    # Connect and report how much work remains before paying browser startup.
    try:
        conn = db_connect()
        cur = conn.cursor()
        cur.execute('SELECT count(*) FROM mt_bulk_staging')
        total_db = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NULL')
        no_mmsi = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NOT NULL AND owner IS NULL')
        no_owner = cur.fetchone()[0]
        print(f"DB: {total_db} total | {no_mmsi} need MMSI | {no_owner} need owner")
    except Exception as e:
        print(f"DB ERROR: {e}")
        return

    # Imported lazily so the DB pre-check above can fail fast without
    # requiring playwright to be installed/working.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        # headless=False: presumably required to pass bot checks — confirm.
        browser = await p.chromium.launch(
            headless=False,
            args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
        )
        context = await browser.new_context(
            viewport={'width': 1440, 'height': 900},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        )
        page = await context.new_page()

        if not await do_login(page):
            await browser.close()
            conn.close()
            return

        await asyncio.sleep(3)
        # Warm up the data page so the reports endpoint sees a normal session.
        await page.goto('https://www.marinetraffic.com/en/data/?asset_type=vessels',
                        wait_until='load', timeout=40000)
        await asyncio.sleep(5)

        # Get vessels: prioritize those without MMSI/owner/commercial_manager.
        # Order: missing MMSI first, then gt_shiptype '6', then largest DWT.
        query = """
            SELECT ship_id, name, gt_shiptype, dwt, flag, mmsi, imo, owner
            FROM mt_bulk_staging
            WHERE mmsi IS NULL OR owner IS NULL OR commercial_manager IS NULL
            ORDER BY
                CASE WHEN mmsi IS NULL THEN 0 ELSE 1 END,
                CASE WHEN gt_shiptype = '6' THEN 0 ELSE 1 END,
                dwt DESC NULLS LAST
        """
        if args.limit:
            query += f" LIMIT {args.limit}"
        elif args.probe:
            query += " LIMIT 20"

        cur.execute(query)
        vessels = cur.fetchall()
        total = len(vessels)

        print(f"\nVessels to process: {total}")
        if total == 0:
            print("Nothing to do!")
            await browser.close()
            conn.close()
            return

        # Rough ETA: per-vessel delay plus ~0.3 s of request overhead.
        eta_sec = total * (args.delay + 0.3)
        print(f"Delay: {args.delay}s | ETA: ~{eta_sec/60:.0f} min")

        got_mmsi = 0      # rows where a new MMSI was written
        got_owner = 0     # rows where a new owner was written
        no_data = 0       # vessels the API returned nothing for
        errors = 0        # fetch_combined raised
        batch_count = 0   # rows since last commit
        t0 = time.time()

        for i, (ship_id, name, gt, dwt, flag, mmsi, imo, existing_owner) in enumerate(vessels):
            try:
                row = await fetch_combined(page, ship_id)
            except Exception as e:
                errors += 1
                row = None

            if row:
                # API row keys are upper-case column names; normalize empties
                # and the '0' IMO placeholder to None.
                api_mmsi = str(row.get('MMSI', '')) if row.get('MMSI') else None
                api_imo = str(row.get('IMO', '')) if row.get('IMO') and str(row.get('IMO')) != '0' else None
                api_owner = row.get('BENEFICIAL_OWNER') or row.get('REGISTERED_OWNER') or None
                api_reg_owner = row.get('REGISTERED_OWNER') or None
                api_operator = row.get('OPERATOR') or None
                api_commercial_manager = row.get('COMMERCIAL_MANAGER') or None

                # Build the UPDATE dynamically: only fill columns the API
                # provided; mmsi/owner never overwrite existing values,
                # the rest use COALESCE so NULL params can't clobber data.
                updates = []
                params = []

                if api_mmsi and not mmsi:
                    updates.append("mmsi = %s")
                    params.append(api_mmsi)
                    got_mmsi += 1

                if api_imo:
                    updates.append("imo = COALESCE(%s, imo)")
                    params.append(api_imo)

                if api_owner and not existing_owner:
                    updates.append("owner = %s")
                    params.append(api_owner)
                    got_owner += 1

                if api_reg_owner:
                    updates.append("registered_owner = COALESCE(%s, registered_owner)")
                    params.append(api_reg_owner)

                if api_operator:
                    updates.append("operator = COALESCE(%s, operator)")
                    params.append(api_operator)

                if api_commercial_manager:
                    updates.append("commercial_manager = COALESCE(%s, commercial_manager)")
                    params.append(api_commercial_manager)

                if updates:
                    updates.append("scraped_at = NOW()")
                    params.append(ship_id)
                    conn, cur = db_safe_execute(conn, cur,
                        f"UPDATE mt_bulk_staging SET {', '.join(updates)} WHERE ship_id = %s",
                        tuple(params))
            else:
                no_data += 1

            # Progress: verbose for the first 20 rows, then every 50th.
            if i < 20 or (i + 1) % 50 == 0:
                if row:
                    m = api_mmsi or mmsi or '?'
                    o = (api_owner or existing_owner or '?')[:35]
                    print(f" [{i+1}/{total}] {name} -> MMSI={m} | {o}")
                else:
                    print(f" [{i+1}/{total}] {name} -> no data")

            batch_count += 1
            if batch_count >= BATCH:
                # Checkpoint: commit the batch and print rate/ETA stats.
                conn = db_safe_commit(conn)
                cur = conn.cursor()
                elapsed = time.time() - t0
                rate = (i + 1) / elapsed if elapsed > 0 else 0
                remaining = (total - i - 1) / rate if rate > 0 else 0
                print(f"\n=== CHECKPOINT [{i+1}/{total}] {elapsed:.0f}s | "
                      f"mmsi={got_mmsi} owner={got_owner} "
                      f"none={no_data} err={errors} | "
                      f"ETA: {remaining/60:.0f}m ===\n")
                batch_count = 0

            await asyncio.sleep(args.delay)

        # Final commit
        conn = db_safe_commit(conn)
        cur = conn.cursor()

        # Summary
        cur.execute("SELECT count(*), count(mmsi), count(owner) FROM mt_bulk_staging")
        t, m, o = cur.fetchone()
        cur.execute("SELECT count(*), count(mmsi), count(owner) FROM mt_bulk_staging WHERE gt_shiptype='6'")
        tb, mb, ob = cur.fetchone()

        elapsed = time.time() - t0
        print(f"\n{'='*60}")
        print(f"DONE in {elapsed/60:.1f} minutes!")
        print(f" Processed: {total}")
        print(f" New MMSI: {got_mmsi}")
        print(f" New owners: {got_owner}")
        print(f" No data: {no_data}")
        print(f" Errors: {errors}")
        print(f"\nGLOBAL: total={t} mmsi={m} owner={o}")
        print(f"BULK: total={tb} mmsi={mb} owner={ob}")
        print(f"{'='*60}")

        conn.close()
        await browser.close()
|
||
|
|
|
||
|
|
|
||
|
|
# Guard the entry point so importing this module doesn't launch the sweep.
if __name__ == '__main__':
    asyncio.run(main())
|