576 lines
22 KiB
Python
576 lines
22 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
MT Enrichment — Fetch MMSI, IMO, Flag, LOA, Beam, Draught for all vessels.
|
||
|
|
|
||
|
|
Uses FREE MarineTraffic detail API endpoints (no Pro subscription needed):
|
||
|
|
/en/vessels/{shipid}/general → MMSI, IMO, Flag, LOA, Beam, Type, Year Built
|
||
|
|
/en/vessels/{shipid}/position → Draught, Speed, Course
|
||
|
|
|
||
|
|
Requires Playwright login (headless=False for Cloudflare bypass).
|
||
|
|
Runs LOCALLY on PC. Speed: ~0.16s/vessel (10x faster than Reports API).
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python mt_enrichment.py # All vessels needing MMSI/IMO
|
||
|
|
python mt_enrichment.py --probe # Test 10 vessels
|
||
|
|
python mt_enrichment.py --limit 500 # Process 500 vessels
|
||
|
|
python mt_enrichment.py --loop # Repeat every 12h
|
||
|
|
python mt_enrichment.py --reset # Clear checkpoint, start fresh
|
||
|
|
"""
|
||
|
|
import asyncio, json, sys, os, time, struct, hmac, hashlib, base64, argparse
|
||
|
|
import psycopg2
|
||
|
|
|
||
|
|
# Run relative to the script's own directory (the checkpoint file lives here).
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# Force UTF-8, line-buffered console output. reconfigure() exists on Python
# 3.7+ text streams; the hasattr guard keeps redirected/wrapped streams safe.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)
if hasattr(sys.stderr, 'reconfigure'):
    sys.stderr.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)
|
||
|
|
|
||
|
|
# ---- CONFIG ----

# MarineTraffic credentials.
# SECURITY NOTE(review): secrets are hardcoded in source; they should live in
# a secret store. The env-var lookups below at least allow overriding without
# editing the file — the defaults preserve the original values, so behavior
# is unchanged when the variables are unset.
EMAIL = os.environ.get('MT_EMAIL', "operation@mrlogisticcorp.com")
PASSWORD = os.environ.get('MT_PASSWORD', "NKh9i8Z!7fU9jfi")
TOTP_SECRET = os.environ.get('MT_TOTP_SECRET', "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH")

# The server runs Linux and reaches Postgres directly on 5432; the PC goes
# through an SSH tunnel on local port 15432 (see _restart_ssh_tunnel).
IS_SERVER = sys.platform == 'linux'
DB_URL = os.environ.get('DATABASE_URL') or (
    'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:5432/seafare_db' if IS_SERVER
    else 'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db'
)

DELAY = 0.15   # seconds between API calls (~0.16s/vessel observed)
BATCH = 500    # commit every N vessels (fast enough for larger batches)
CHECKPOINT = 'mt_enrichment_checkpoint.json'  # progress file, next to the script
LOOP_INTERVAL_HOURS = 12       # default --loop cycle period
MAX_CONSECUTIVE_ERRORS = 20    # abort a run after this many back-to-back API errors
|
||
|
|
|
||
|
|
|
||
|
|
# ---- TOTP ----
|
||
|
|
|
||
|
|
def totp(secret, for_time=None):
    """Compute an RFC 6238 TOTP: a 6-digit code from a base32 secret.

    Args:
        secret: Base32-encoded shared secret. Lowercase letters and embedded
            spaces are tolerated; missing '=' padding is added automatically.
        for_time: Unix timestamp to compute the code for. Defaults to the
            current time. (Added for testability; default keeps the old
            behavior exactly.)

    Returns:
        A 6-character, zero-padded code string (30-second time step,
        HMAC-SHA1, dynamic truncation per RFC 4226).
    """
    s = secret.upper().replace(' ', '')
    pad = (-len(s)) % 8  # base32 input length must be a multiple of 8
    key = base64.b32decode(s + '=' * pad)
    t = time.time() if for_time is None else for_time
    counter = int(t) // 30  # 30-second TOTP time step
    msg = struct.pack('>Q', counter)
    h = hmac.new(key, msg, hashlib.sha1).digest()
    offset = h[-1] & 0x0f  # dynamic truncation offset (RFC 4226 §5.3)
    code = struct.unpack('>I', h[offset:offset+4])[0] & 0x7fffffff
    return str(code % 1000000).zfill(6)
|
||
|
|
|
||
|
|
|
||
|
|
# ---- DB HELPERS ----
|
||
|
|
|
||
|
|
def db_connect():
    """Open a fresh Postgres connection to DB_URL with TCP keepalives.

    Keepalives let the driver notice a dead SSH tunnel quickly instead of
    hanging on a silently broken socket.
    """
    keepalive_opts = {
        'connect_timeout': 15,
        'keepalives': 1,
        'keepalives_idle': 30,
        'keepalives_interval': 10,
        'keepalives_count': 5,
    }
    return psycopg2.connect(DB_URL, **keepalive_opts)
|
||
|
|
|
||
|
|
|
||
|
|
def _restart_ssh_tunnel():
    """Kill any running ssh.exe and respawn the Postgres port-forward tunnel.

    PC-side only; on the server (IS_SERVER) this is a no-op because the DB is
    reached directly. Best-effort: failures are logged, never raised.
    """
    if IS_SERVER:
        return
    import subprocess

    # Tear down whatever ssh process is (possibly half-dead) holding the port.
    try:
        subprocess.run(['taskkill', '/F', '/IM', 'ssh.exe'],
                       capture_output=True, timeout=5)
    except Exception:
        pass
    time.sleep(2)

    tunnel_cmd = [
        'ssh',
        '-o', 'ServerAliveInterval=5', '-o', 'ServerAliveCountMax=120',
        '-o', 'TCPKeepAlive=yes', '-o', 'StrictHostKeyChecking=no',
        '-L', '15432:127.0.0.1:5432', '-N', 'root@89.19.208.158',
    ]
    try:
        subprocess.Popen(
            tunnel_cmd,
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        print(f" [SSH] Tunnel restarted, waiting 5s...", flush=True)
        time.sleep(5)
    except Exception as e:
        print(f" [SSH] Failed: {e}", flush=True)
|
||
|
|
|
||
|
|
|
||
|
|
def db_reconnect(conn):
    """Close a (presumed dead) connection and re-establish it.

    Returns a fresh (connection, cursor) pair. Strategy: 5 quick reconnect
    attempts; then (PC only) restart the SSH tunnel; then 10 slower attempts,
    kicking the tunnel once more half-way through. Raises if all fail.
    """
    try:
        conn.close()
    except Exception:
        pass  # the old connection may already be gone

    # Phase 1: plain reconnect attempts (3s apart).
    for attempt in range(5):
        try:
            time.sleep(3)
            c = db_connect()
            print(f" [DB] Reconnected (attempt {attempt+1})", flush=True)
            return c, c.cursor()
        except Exception as e:
            print(f" [DB] Attempt {attempt+1} failed: {e}", flush=True)

    # Phase 2: on the PC the DB is reached via an SSH tunnel — bounce it.
    if not IS_SERVER:
        print(f" [DB] Restarting SSH tunnel...", flush=True)
        _restart_ssh_tunnel()
    # Slower retries (5s apart); on the PC, kick the tunnel once more after
    # the 5th failed post-restart attempt.
    for attempt in range(10):
        try:
            time.sleep(5)
            c = db_connect()
            print(f" [DB] Reconnected after tunnel restart (attempt {attempt+1})", flush=True)
            return c, c.cursor()
        except Exception as e:
            print(f" [DB] Post-restart attempt {attempt+1} failed: {e}", flush=True)
            if not IS_SERVER and attempt == 4:
                _restart_ssh_tunnel()

    raise Exception("DB reconnect failed")
|
||
|
|
|
||
|
|
|
||
|
|
def db_safe_execute(conn, cur, query, params=None):
    """Execute *query*, surviving a dropped connection.

    On a connection-level failure (InterfaceError/OperationalError) the
    connection is rebuilt via db_reconnect and the query retried once. Any
    other psycopg2 error rolls the transaction back and re-raises.

    Returns the (possibly new) (connection, cursor) pair.
    """
    try:
        cur.execute(query, params)
    except (psycopg2.InterfaceError, psycopg2.OperationalError) as e:
        print(f" [DB] Lost on execute ({e}), reconnecting...", flush=True)
        conn, cur = db_reconnect(conn)
        cur.execute(query, params)  # retry once on the fresh connection
    except psycopg2.Error:
        # Non-connection DB error: leave the transaction clean, then re-raise.
        try:
            conn.rollback()
        except Exception:
            pass
        raise
    return conn, cur
|
||
|
|
|
||
|
|
|
||
|
|
def db_safe_commit(conn):
    """Commit, reconnecting if the connection has dropped.

    Returns the (possibly new) connection. Note: work pending on a dropped
    connection is lost — callers rely on checkpointing to re-do it.
    """
    try:
        conn.commit()
    except (psycopg2.InterfaceError, psycopg2.OperationalError):
        print(f" [DB] Lost on commit, reconnecting...", flush=True)
        conn, _ = db_reconnect(conn)
    return conn
|
||
|
|
|
||
|
|
|
||
|
|
# ---- CHECKPOINT ----
|
||
|
|
|
||
|
|
def load_checkpoint(path=None):
    """Load enrichment progress from a JSON checkpoint file.

    Args:
        path: Checkpoint file path; defaults to the module-level CHECKPOINT.
            (Parameter added for testability; default keeps old behavior.)

    Returns:
        The checkpoint dict. If the file is missing or unreadable/corrupt,
        returns a fresh default structure instead of raising.
    """
    if path is None:
        path = CHECKPOINT
    if os.path.exists(path):
        try:
            with open(path, encoding='utf-8') as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            print(f" [WARN] Corrupt checkpoint, starting fresh", flush=True)
    # Missing or corrupt file → fresh progress record.
    return {
        'processed_ids': [],
        'stats': {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                  'no_data': 0, 'errors': 0},
        'last_run': None,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def save_checkpoint(cp, path=None):
    """Stamp *cp* with the current time and persist it as JSON.

    Args:
        cp: Checkpoint dict; mutated in place ('last_run' is set).
        path: Destination file; defaults to the module-level CHECKPOINT.

    Writes to a temp file and atomically renames it, so a crash mid-write
    can never leave a truncated checkpoint behind.
    """
    if path is None:
        path = CHECKPOINT
    cp['last_run'] = time.strftime('%Y-%m-%dT%H:%M:%S')
    tmp = path + '.tmp'
    with open(tmp, 'w', encoding='utf-8') as f:
        json.dump(cp, f)
    os.replace(tmp, path)  # atomic on both POSIX and Windows
|
||
|
|
|
||
|
|
|
||
|
|
# ---- PLAYWRIGHT LOGIN ----
|
||
|
|
|
||
|
|
async def do_login(page, max_retries=3):
    """Log into MarineTraffic through the Playwright *page*, handling TOTP MFA.

    Returns True on success, False after max_retries failed attempts.
    Success is judged by the final URL (back on marinetraffic.com, off the
    Kpler auth host).
    """
    for attempt in range(max_retries):
        print(f"LOGIN (attempt {attempt+1}/{max_retries})...", flush=True)
        try:
            await page.goto('https://www.marinetraffic.com/en/users/login',
                            wait_until='domcontentloaded', timeout=30000)
        except Exception as e:
            print(f" Nav error: {e}", flush=True)
            continue

        await asyncio.sleep(3)
        # Dismiss the cookie-consent banner if it appears (best-effort).
        try:
            await page.click('button:has-text("AGREE")', timeout=3000)
        except:
            pass

        # Step 1: submit the username/e-mail form.
        try:
            await page.fill('input[name="username"]', EMAIL)
            await page.click('button[type="submit"]')
        except Exception as e:
            print(f" Email error: {e}", flush=True)
            continue
        await asyncio.sleep(3)

        # Step 2: submit the password form.
        try:
            await page.fill('input[type="password"]', PASSWORD)
            await page.click('button[type="submit"]')
        except Exception as e:
            print(f" Password error: {e}", flush=True)
            continue
        await asyncio.sleep(4)

        # Step 3: MFA page (URL contains 'mfa' or the Kpler auth host).
        if 'mfa' in page.url.lower() or 'auth.kpler' in page.url:
            # Pick the authenticator-app option if a chooser is shown.
            try:
                await page.click('button:has-text("Google Authenticator")', timeout=3000)
                await asyncio.sleep(2)
            except:
                pass

            # Wait for fresh TOTP window
            # (if <8s remain in the 30s step, the code could expire mid-submit)
            elapsed = int(time.time()) % 30
            if 30 - elapsed < 8:
                wait = 30 - elapsed + 2
                print(f" TOTP: waiting {wait}s for fresh window...", flush=True)
                await asyncio.sleep(wait)

            otp = totp(TOTP_SECRET)
            print(f" TOTP: {otp}", flush=True)

            # Try the known code-input selectors first...
            filled = False
            for selector in ['input[name="code"]', 'input[type="tel"]', 'input[inputmode="numeric"]']:
                try:
                    await page.fill(selector, otp, timeout=3000)
                    filled = True
                    break
                except:
                    continue
            # ...then fall back to the first visible text-like input.
            if not filled:
                try:
                    inputs = page.locator('input:visible')
                    for i in range(await inputs.count()):
                        inp = inputs.nth(i)
                        inp_type = await inp.get_attribute('type') or 'text'
                        if inp_type in ('text', 'tel', 'number'):
                            await inp.fill(otp)
                            filled = True
                            break
                except:
                    pass
            if filled:
                await page.click('button[type="submit"]')
                await asyncio.sleep(8)

        # Success check: landed back on marinetraffic.com, off the auth host.
        ok = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
        if ok:
            print(f" Login OK", flush=True)
            return True
        print(f" Login failed: {page.url[:80]}", flush=True)

    return False
|
||
|
|
|
||
|
|
|
||
|
|
# ---- FREE API FETCH ----
|
||
|
|
|
||
|
|
async def fetch_general(page, ship_id):
    """Fetch /en/vessels/{shipid}/general — MMSI, IMO, Flag, LOA, Beam, Type.

    Runs a same-origin fetch() inside the logged-in page so the session
    cookies apply. Returns the parsed JSON, or {'error': ...} on any failure.
    """
    js = f"""async () => {{
        try {{
            const r = await fetch('https://www.marinetraffic.com/en/vessels/{ship_id}/general', {{
                credentials: 'include', cache: 'no-store',
                headers: {{'Accept':'application/json','X-Requested-With':'XMLHttpRequest'}}
            }});
            if (r.status !== 200) return {{error: 'HTTP '+r.status}};
            return await r.json();
        }} catch(e) {{ return {{error: e.message}}; }}
    }}"""
    try:
        return await page.evaluate(js)
    except Exception:
        return {'error': 'evaluate_failed'}
|
||
|
|
|
||
|
|
|
||
|
|
async def fetch_position(page, ship_id):
    """Fetch /en/vessels/{shipid}/position — Draught, Speed, Course.

    Same mechanism as fetch_general: a same-origin fetch() evaluated in the
    logged-in page. Returns the parsed JSON, or {'error': ...} on failure.
    """
    js = f"""async () => {{
        try {{
            const r = await fetch('https://www.marinetraffic.com/en/vessels/{ship_id}/position', {{
                credentials: 'include', cache: 'no-store',
                headers: {{'Accept':'application/json','X-Requested-With':'XMLHttpRequest'}}
            }});
            if (r.status !== 200) return {{error: 'HTTP '+r.status}};
            return await r.json();
        }} catch(e) {{ return {{error: e.message}}; }}
    }}"""
    try:
        return await page.evaluate(js)
    except Exception:
        return {'error': 'evaluate_failed'}
|
||
|
|
|
||
|
|
|
||
|
|
# ---- MAIN ----
|
||
|
|
|
||
|
|
async def main():
    """CLI entry point: enrich mt_bulk_staging rows via MT's free endpoints.

    Flow: parse args → connect to Postgres and print coverage counts →
    (optionally loop) select vessels missing MMSI/IMO/LOA → log in with
    Playwright → per vessel call /general (and optionally /position) →
    UPDATE rows, committing and checkpointing every BATCH vessels.
    """
    parser = argparse.ArgumentParser(description='MT Vessel Enrichment (free endpoints)')
    parser.add_argument('--probe', action='store_true', help='Test on 10 vessels')
    parser.add_argument('--limit', type=int, default=0, help='Max vessels to process')
    parser.add_argument('--delay', type=float, default=DELAY, help='Delay between API calls (s)')
    parser.add_argument('--loop', action='store_true', help='Repeat every 12h')
    parser.add_argument('--interval', type=float, default=LOOP_INTERVAL_HOURS, help='Hours between loops')
    parser.add_argument('--reset', action='store_true', help='Clear checkpoint, start fresh')
    parser.add_argument('--with-draught', action='store_true', help='Also fetch /position for draught')
    args = parser.parse_args()

    # DB connect — print coverage counts up front as a sanity check.
    try:
        conn = db_connect()
        cur = conn.cursor()
        cur.execute('SELECT count(*) FROM mt_bulk_staging')
        total_db = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NULL')
        no_mmsi = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE imo IS NULL')
        no_imo = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE flag IS NULL')
        no_flag = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE loa IS NULL')
        no_loa = cur.fetchone()[0]
        print(f"DB: {total_db} total | {no_mmsi} no MMSI | {no_imo} no IMO | "
              f"{no_flag} no Flag | {no_loa} no LOA", flush=True)
    except Exception as e:
        print(f"DB ERROR: {e}", flush=True)
        if not IS_SERVER:
            print("Start SSH tunnel: ssh -L 15432:127.0.0.1:5432 -N root@89.19.208.158")
        return

    if args.reset and os.path.exists(CHECKPOINT):
        os.remove(CHECKPOINT)
        print("Checkpoint cleared.", flush=True)

    while True:  # loop mode
        cp = load_checkpoint()
        processed_set = set(cp['processed_ids'])
        # Ensure stats has all needed keys (checkpoint may be from old format)
        default_stats = {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                         'no_data': 0, 'errors': 0}
        for k, v in default_stats.items():
            if k not in cp['stats']:
                cp['stats'][k] = v
        stats = cp['stats']

        # Vessels needing enrichment: no MMSI OR no IMO OR no LOA.
        # Ordering: missing-MMSI rows first, gt_shiptype '6' first within each.
        query = """
            SELECT ship_id, name, gt_shiptype, mmsi, imo, flag, loa, draught
            FROM mt_bulk_staging
            WHERE mmsi IS NULL OR imo IS NULL OR loa IS NULL
            ORDER BY
                CASE WHEN mmsi IS NULL THEN 0 ELSE 1 END,
                CASE WHEN gt_shiptype = '6' THEN 0 ELSE 1 END
        """
        if args.limit:
            query += f" LIMIT {args.limit}"
        elif args.probe:
            query += " LIMIT 10"

        cur.execute(query)
        all_vessels = cur.fetchall()

        # Skip vessels already handled this cycle (checkpoint survival).
        vessels = [(sid, n, gt, mm, im, fl, lo, dr)
                   for sid, n, gt, mm, im, fl, lo, dr in all_vessels
                   if sid not in processed_set]

        total = len(vessels)
        skipped = len(all_vessels) - total

        print(f"\nVessels to process: {total} (skipped {skipped} from checkpoint)", flush=True)
        if total == 0:
            print("Nothing to do!", flush=True)
            if not args.loop:
                break
            print(f"Sleeping {args.interval}h until next cycle...", flush=True)
            time.sleep(args.interval * 3600)
            # Fresh cycle: clear the checkpoint so everything is re-examined.
            cp['processed_ids'] = []
            cp['stats'] = {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                           'no_data': 0, 'errors': 0}
            save_checkpoint(cp)
            continue

        calls_per_vessel = 2 if args.with_draught else 1
        eta_sec = total * (args.delay + 0.16 * calls_per_vessel)
        print(f"Delay: {args.delay}s | ~{0.16*calls_per_vessel:.2f}s/vessel | "
              f"ETA: ~{eta_sec/60:.0f} min", flush=True)

        # Launch Playwright (headful — headless is blocked by Cloudflare,
        # per the module docstring).
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=False,
                args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
            )
            ctx = await browser.new_context(viewport={'width': 1440, 'height': 900})
            page = await ctx.new_page()

            if not await do_login(page):
                print("LOGIN FAILED", flush=True)
                await browser.close()
                conn.close()
                return

            # Go to lightweight page for fetching (same-origin for the API calls)
            await page.goto('https://www.marinetraffic.com/robots.txt',
                            wait_until='load', timeout=15000)
            await asyncio.sleep(2)

            consecutive_errors = 0
            batch_count = 0
            t0 = time.time()

            for i, (ship_id, name, gt, mmsi, imo, flag, loa, draught) in enumerate(vessels):
                # Fetch /general
                gen = await fetch_general(page, ship_id)

                # API sometimes returns a list instead of dict — treat as error
                if isinstance(gen, list):
                    gen = gen[0] if len(gen) == 1 and isinstance(gen[0], dict) else {'error': 'unexpected_list'}

                if gen.get('error'):
                    consecutive_errors += 1
                    stats['errors'] += 1
                    if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                        print(f"\n [FATAL] {consecutive_errors} consecutive errors, aborting.",
                              flush=True)
                        break
                    if i < 20 or (i + 1) % 100 == 0:
                        print(f" [{i+1}/{total}] {name} -> ERROR {gen['error']}", flush=True)
                    # Mark processed anyway so this vessel isn't retried this cycle.
                    cp['processed_ids'].append(ship_id)
                    stats['no_data'] += 1
                    batch_count += 1
                    await asyncio.sleep(args.delay)
                    continue

                consecutive_errors = 0

                # Build UPDATE — normalize API fields ('' / 0 / missing → None)
                api_mmsi = str(gen.get('mmsi', '')) if gen.get('mmsi') else None
                api_imo = str(gen.get('imo', '')) if gen.get('imo') and str(gen.get('imo')) != '0' else None
                api_flag = gen.get('countryCode') or None
                api_loa = gen.get('length') or None
                api_beam = gen.get('width') or None
                api_year = gen.get('yearBuilt') or None
                api_subtype = gen.get('subtype') or None
                ct = gen.get('commercial_type') or {}
                # NOTE(review): api_market is computed but never used below.
                api_market = ct.get('size_class_name') or None

                updates = []
                params = []

                # MMSI/IMO are only written when the row has none (no overwrite).
                if api_mmsi and not mmsi:
                    updates.append("mmsi = %s")
                    params.append(api_mmsi)
                    stats['mmsi'] += 1

                if api_imo and not imo:
                    updates.append("imo = %s")
                    params.append(api_imo)
                    stats['imo'] += 1

                # Remaining fields use COALESCE so a NULL API value keeps the
                # existing column value.
                if api_flag:
                    updates.append("flag = COALESCE(%s, flag)")
                    params.append(api_flag)
                    if not flag:
                        stats['flag'] += 1

                if api_loa:
                    updates.append("loa = COALESCE(%s, loa)")
                    params.append(api_loa)
                    if not loa:
                        stats['loa'] += 1

                if api_beam:
                    updates.append("beam = COALESCE(%s, beam)")
                    params.append(api_beam)

                if api_year:
                    updates.append("year_built = COALESCE(%s, year_built)")
                    params.append(api_year)

                if api_subtype:
                    updates.append("shiptype = COALESCE(%s, shiptype)")
                    params.append(api_subtype)

                # Fetch /position for draught (optional — doubles the API calls)
                if args.with_draught and not draught:
                    pos = await fetch_position(page, ship_id)
                    if not pos.get('error'):
                        api_draught = pos.get('draught')
                        if api_draught:
                            updates.append("draught = COALESCE(%s, draught)")
                            params.append(api_draught)
                            stats['draught'] += 1

                if updates:
                    updates.append("scraped_at = NOW()")
                    params.append(ship_id)
                    try:
                        conn, cur = db_safe_execute(conn, cur,
                            f"UPDATE mt_bulk_staging SET {', '.join(updates)} WHERE ship_id = %s",
                            tuple(params))
                    except Exception as e:
                        stats['errors'] += 1
                        print(f" [DB] Update error: {e}", flush=True)

                cp['processed_ids'].append(ship_id)
                batch_count += 1

                # Progress (first 20 vessels verbosely, then every 100th)
                if i < 20 or (i + 1) % 100 == 0:
                    print(f" [{i+1}/{total}] {name} -> "
                          f"MMSI={api_mmsi or mmsi or '?'} IMO={api_imo or imo or '?'} "
                          f"Flag={api_flag or flag or '?'} LOA={api_loa or loa or '?'}",
                          flush=True)

                # Batch commit + checkpoint
                if batch_count >= BATCH:
                    conn = db_safe_commit(conn)
                    cur = conn.cursor()
                    save_checkpoint(cp)

                    elapsed = time.time() - t0
                    rate = (i + 1) / elapsed if elapsed > 0 else 0
                    remaining = (total - i - 1) / rate if rate > 0 else 0
                    print(f"\n=== CHECKPOINT [{i+1}/{total}] {elapsed:.0f}s | "
                          f"mmsi={stats['mmsi']} imo={stats['imo']} "
                          f"flag={stats['flag']} loa={stats['loa']} "
                          f"err={stats['errors']} | "
                          f"ETA: {remaining/60:.0f}m ===\n", flush=True)
                    batch_count = 0

                await asyncio.sleep(args.delay)

            # Final commit
            conn = db_safe_commit(conn)
            cur = conn.cursor()
            save_checkpoint(cp)

            # Summary
            cur.execute("SELECT count(*), count(mmsi), count(imo), count(flag), count(loa) "
                        "FROM mt_bulk_staging")
            t, m, im, fl, lo = cur.fetchone()
            cur.execute("SELECT count(*), count(mmsi), count(imo) "
                        "FROM mt_bulk_staging WHERE gt_shiptype='6'")
            tb, mb, ib = cur.fetchone()

            elapsed = time.time() - t0
            print(f"\n{'='*60}", flush=True)
            print(f"DONE in {elapsed/60:.1f} minutes!", flush=True)
            print(f" Processed: {len(cp['processed_ids'])}", flush=True)
            print(f" New MMSI: {stats['mmsi']}", flush=True)
            print(f" New IMO: {stats['imo']}", flush=True)
            print(f" New Flag: {stats['flag']}", flush=True)
            print(f" New LOA: {stats['loa']}", flush=True)
            print(f" Errors: {stats['errors']}", flush=True)
            print(f"\nGLOBAL: total={t} mmsi={m} imo={im} flag={fl} loa={lo}", flush=True)
            print(f"BULK: total={tb} mmsi={mb} imo={ib}", flush=True)
            print(f"{'='*60}", flush=True)

            await browser.close()

        if not args.loop:
            break

        # Loop mode: reset the checkpoint for the next full pass, then sleep.
        cp['processed_ids'] = []
        cp['stats'] = {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                       'no_data': 0, 'errors': 0}
        save_checkpoint(cp)
        print(f"\nSleeping {args.interval}h until next cycle...", flush=True)
        time.sleep(args.interval * 3600)

    conn.close()
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Script entry point — drive the async main() to completion.
    asyncio.run(main())
|