# NOTE(review): the lines below were web-export residue ("Raw Normal View
# History" header) and were not valid Python; kept as a comment so the file
# can actually be imported/run.
# Original path: montana/Русский/Логистика/mt_enrichment.py (576 lines, 22 KiB)

#!/usr/bin/env python3
"""
MT Enrichment Fetch MMSI, IMO, Flag, LOA, Beam, Draught for all vessels.
Uses FREE MarineTraffic detail API endpoints (no Pro subscription needed):
/en/vessels/{shipid}/general MMSI, IMO, Flag, LOA, Beam, Type, Year Built
/en/vessels/{shipid}/position Draught, Speed, Course
Requires Playwright login (headless=False for Cloudflare bypass).
Runs LOCALLY on PC. Speed: ~0.16s/vessel (10x faster than Reports API).
Usage:
python mt_enrichment.py # All vessels needing MMSI/IMO
python mt_enrichment.py --probe # Test 10 vessels
python mt_enrichment.py --limit 500 # Process 500 vessels
python mt_enrichment.py --loop # Repeat every 12h
python mt_enrichment.py --reset # Clear checkpoint, start fresh
"""
import asyncio, json, sys, os, time, struct, hmac, hashlib, base64, argparse
import psycopg2

# Run from the script's own directory so relative paths (the checkpoint file)
# resolve regardless of the caller's working directory.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Force UTF-8 output with replacement for unencodable characters; line
# buffering keeps log lines timely when output is redirected. hasattr guard:
# reconfigure() only exists on real text streams (Python 3.7+).
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)
if hasattr(sys.stderr, 'reconfigure'):
    sys.stderr.reconfigure(encoding='utf-8', errors='replace', line_buffering=True)

# ---- CONFIG ----
# SECURITY NOTE(review): login credentials, TOTP seed and DB password are
# hardcoded in source — these should move to environment variables or a
# secrets store before this file is shared further.
EMAIL = "operation@mrlogisticcorp.com"
PASSWORD = "NKh9i8Z!7fU9jfi"
TOTP_SECRET = "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH"
# Server talks to Postgres directly on 5432; the PC goes through a local SSH
# tunnel on 15432 (see _restart_ssh_tunnel).
IS_SERVER = sys.platform == 'linux'
DB_URL = os.environ.get('DATABASE_URL') or (
    'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:5432/seafare_db' if IS_SERVER
    else 'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db'
)
DELAY = 0.15  # seconds between API calls (~0.16s/vessel observed)
BATCH = 500  # commit every N vessels (fast enough for larger batches)
CHECKPOINT = 'mt_enrichment_checkpoint.json'  # resume file, lives next to the script
LOOP_INTERVAL_HOURS = 12  # default sleep between cycles in --loop mode
MAX_CONSECUTIVE_ERRORS = 20  # abort the run after this many back-to-back API errors
# ---- TOTP ----
def totp(secret, for_time=None):
    """Compute a 6-digit RFC 6238 TOTP code (HMAC-SHA1, 30-second step).

    Args:
        secret: Base32-encoded shared secret; spaces and lowercase allowed.
        for_time: Unix timestamp to compute the code for. Defaults to the
            current time; exposed so the algorithm can be verified against
            the RFC 6238 test vectors.

    Returns:
        The one-time code as a zero-padded 6-character string.
    """
    normalized = secret.upper().replace(' ', '')
    # b32decode requires len % 8 == 0; restore any '=' padding that was stripped.
    key = base64.b32decode(normalized + '=' * ((-len(normalized)) % 8))
    if for_time is None:
        for_time = time.time()
    counter = int(for_time) // 30  # 30-second time step (RFC 6238 default)
    digest = hmac.new(key, struct.pack('>Q', counter), hashlib.sha1).digest()
    # Dynamic truncation (RFC 4226 §5.3): low nibble of the last byte selects
    # a 4-byte window; mask the sign bit before taking the decimal code.
    offset = digest[-1] & 0x0f
    code = struct.unpack('>I', digest[offset:offset + 4])[0] & 0x7fffffff
    return str(code % 1_000_000).zfill(6)
# ---- DB HELPERS ----
def db_connect():
    """Open a psycopg2 connection to DB_URL with TCP keepalives enabled,
    so dropped SSH tunnels are detected instead of hanging forever."""
    keepalive_opts = {
        'keepalives': 1,
        'keepalives_idle': 30,
        'keepalives_interval': 10,
        'keepalives_count': 5,
    }
    return psycopg2.connect(DB_URL, connect_timeout=15, **keepalive_opts)
def _restart_ssh_tunnel():
    """Kill any local ssh.exe and respawn the 15432→5432 tunnel to the
    server.  No-op on the server itself, where no tunnel exists."""
    if IS_SERVER:
        return
    import subprocess
    # Best-effort kill of the old tunnel process (Windows taskkill).
    try:
        subprocess.run(['taskkill', '/F', '/IM', 'ssh.exe'], capture_output=True, timeout=5)
    except Exception:
        pass
    time.sleep(2)
    tunnel_cmd = [
        'ssh',
        '-o', 'ServerAliveInterval=5',
        '-o', 'ServerAliveCountMax=120',
        '-o', 'TCPKeepAlive=yes',
        '-o', 'StrictHostKeyChecking=no',
        '-L', '15432:127.0.0.1:5432',
        '-N', 'root@89.19.208.158',
    ]
    try:
        subprocess.Popen(
            tunnel_cmd,
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        print(f" [SSH] Tunnel restarted, waiting 5s...", flush=True)
        time.sleep(5)
    except Exception as e:
        print(f" [SSH] Failed: {e}", flush=True)
def db_reconnect(conn):
    """Replace a dead DB connection.

    Closes the old connection, retries db_connect() a few times, and on a
    PC falls back to restarting the SSH tunnel before a longer retry round.
    Returns a fresh (connection, cursor) pair or raises after all retries.
    """
    try:
        conn.close()
    except Exception:
        pass
    # Phase 1: five plain reconnect attempts, 3s apart.
    for n in range(1, 6):
        time.sleep(3)
        try:
            fresh = db_connect()
        except Exception as e:
            print(f" [DB] Attempt {n} failed: {e}", flush=True)
            continue
        print(f" [DB] Reconnected (attempt {n})", flush=True)
        return fresh, fresh.cursor()
    # Plain retries exhausted — on a PC the tunnel itself is the usual culprit.
    if not IS_SERVER:
        print(f" [DB] Restarting SSH tunnel...", flush=True)
        _restart_ssh_tunnel()
    # Phase 2: ten slower attempts; kick the tunnel once more halfway through.
    for n in range(1, 11):
        time.sleep(5)
        try:
            fresh = db_connect()
        except Exception as e:
            print(f" [DB] Post-restart attempt {n} failed: {e}", flush=True)
            if not IS_SERVER and n == 5:
                _restart_ssh_tunnel()
            continue
        print(f" [DB] Reconnected after tunnel restart (attempt {n})", flush=True)
        return fresh, fresh.cursor()
    raise Exception("DB reconnect failed")
def db_safe_execute(conn, cur, query, params=None):
    """Execute a query, transparently reconnecting once if the connection
    dropped.  Returns the (possibly replaced) (conn, cur) pair.

    Other psycopg2 errors (bad SQL, constraint violations) are rolled back
    and re-raised for the caller to handle.
    """
    try:
        cur.execute(query, params)
    except (psycopg2.InterfaceError, psycopg2.OperationalError) as e:
        # Connection-level failure: rebuild the connection and retry once.
        print(f" [DB] Lost on execute ({e}), reconnecting...", flush=True)
        conn, cur = db_reconnect(conn)
        cur.execute(query, params)
    except psycopg2.Error:
        # Query-level failure: clear the aborted transaction, then re-raise.
        try:
            conn.rollback()
        except Exception:
            pass
        raise
    return conn, cur
def db_safe_commit(conn):
    """Commit, reconnecting if the connection has dropped.  Returns the
    live connection (a fresh one if the commit was lost — any uncommitted
    work from the old connection is gone in that case)."""
    try:
        conn.commit()
    except (psycopg2.InterfaceError, psycopg2.OperationalError):
        print(f" [DB] Lost on commit, reconnecting...", flush=True)
        conn, _ = db_reconnect(conn)
    return conn
# ---- CHECKPOINT ----
def load_checkpoint():
    """Load the resume checkpoint from disk; return a fresh empty one when
    the file is missing or unreadable/corrupt."""
    empty = {
        'processed_ids': [],
        'stats': {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                  'no_data': 0, 'errors': 0},
        'last_run': None,
    }
    if not os.path.exists(CHECKPOINT):
        return empty
    try:
        with open(CHECKPOINT, encoding='utf-8') as fh:
            return json.load(fh)
    except (json.JSONDecodeError, IOError):
        print(f" [WARN] Corrupt checkpoint, starting fresh", flush=True)
        return empty
def save_checkpoint(cp):
    """Stamp cp['last_run'] with the current local time and persist the
    checkpoint dict to disk as JSON."""
    cp['last_run'] = time.strftime('%Y-%m-%dT%H:%M:%S')
    payload = json.dumps(cp)
    with open(CHECKPOINT, 'w', encoding='utf-8') as fh:
        fh.write(payload)
# ---- PLAYWRIGHT LOGIN ----
async def do_login(page, max_retries=3):
    """Log in to MarineTraffic through the Kpler SSO flow.

    Handles the cookie-consent banner, the two-step email/password form,
    and the Google Authenticator TOTP challenge.  Retries the whole flow
    up to max_retries times.

    Args:
        page: a Playwright Page.
        max_retries: full login attempts before giving up.

    Returns:
        True on success, False otherwise.

    Fix: the original used bare ``except:`` clauses in this coroutine,
    which also swallow ``asyncio.CancelledError`` (and ``KeyboardInterrupt``)
    and can prevent clean task cancellation; all are narrowed to
    ``except Exception``.
    """
    for attempt in range(max_retries):
        print(f"LOGIN (attempt {attempt+1}/{max_retries})...", flush=True)
        try:
            await page.goto('https://www.marinetraffic.com/en/users/login',
                            wait_until='domcontentloaded', timeout=30000)
        except Exception as e:
            print(f" Nav error: {e}", flush=True)
            continue
        await asyncio.sleep(3)
        # Dismiss cookie-consent banner if present (best effort).
        try:
            await page.click('button:has-text("AGREE")', timeout=3000)
        except Exception:
            pass
        # Step 1: email.
        try:
            await page.fill('input[name="username"]', EMAIL)
            await page.click('button[type="submit"]')
        except Exception as e:
            print(f" Email error: {e}", flush=True)
            continue
        await asyncio.sleep(3)
        # Step 2: password.
        try:
            await page.fill('input[type="password"]', PASSWORD)
            await page.click('button[type="submit"]')
        except Exception as e:
            print(f" Password error: {e}", flush=True)
            continue
        await asyncio.sleep(4)
        # Step 3: MFA, when the SSO redirects to the challenge page.
        if 'mfa' in page.url.lower() or 'auth.kpler' in page.url:
            try:
                await page.click('button:has-text("Google Authenticator")', timeout=3000)
                await asyncio.sleep(2)
            except Exception:
                pass
            # Wait for fresh TOTP window: if fewer than 8s remain in the
            # current 30s step, the code could expire mid-submit.
            elapsed = int(time.time()) % 30
            if 30 - elapsed < 8:
                wait = 30 - elapsed + 2
                print(f" TOTP: waiting {wait}s for fresh window...", flush=True)
                await asyncio.sleep(wait)
            otp = totp(TOTP_SECRET)
            print(f" TOTP: {otp}", flush=True)
            # Try known code-input selectors first.
            filled = False
            for selector in ['input[name="code"]', 'input[type="tel"]', 'input[inputmode="numeric"]']:
                try:
                    await page.fill(selector, otp, timeout=3000)
                    filled = True
                    break
                except Exception:
                    continue
            # Fallback: first visible text-like input on the page.
            if not filled:
                try:
                    inputs = page.locator('input:visible')
                    for i in range(await inputs.count()):
                        inp = inputs.nth(i)
                        inp_type = await inp.get_attribute('type') or 'text'
                        if inp_type in ('text', 'tel', 'number'):
                            await inp.fill(otp)
                            filled = True
                            break
                except Exception:
                    pass
            if filled:
                await page.click('button[type="submit"]')
                await asyncio.sleep(8)
        # Success = back on marinetraffic.com, off the auth.kpler SSO host.
        ok = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
        if ok:
            print(f" Login OK", flush=True)
            return True
    print(f" Login failed: {page.url[:80]}", flush=True)
    return False
# ---- FREE API FETCH ----
async def fetch_general(page, ship_id):
    """Fetch /en/vessels/{shipid}/general — MMSI, IMO, Flag, LOA, Beam, Type.

    Runs an in-page fetch() so the request carries the logged-in session
    cookies.  Returns the parsed JSON on success, or {'error': <msg>} on
    any HTTP or evaluation failure.

    Fix: ship_id is passed to page.evaluate() as a structured argument
    instead of being string-spliced into the JS source, so an unexpected
    ship_id value cannot inject script, and the JS snippet is constant.
    """
    js = """async (shipId) => {
      try {
        const r = await fetch('https://www.marinetraffic.com/en/vessels/' + shipId + '/general', {
          credentials: 'include', cache: 'no-store',
          headers: {'Accept':'application/json','X-Requested-With':'XMLHttpRequest'}
        });
        if (r.status !== 200) return {error: 'HTTP '+r.status};
        return await r.json();
      } catch(e) { return {error: e.message}; }
    }"""
    try:
        return await page.evaluate(js, ship_id)
    except Exception:
        return {'error': 'evaluate_failed'}
async def fetch_position(page, ship_id):
    """Fetch /en/vessels/{shipid}/position — Draught, Speed, Course.

    Runs an in-page fetch() so the request carries the logged-in session
    cookies.  Returns the parsed JSON on success, or {'error': <msg>} on
    any HTTP or evaluation failure.

    Fix: ship_id is passed to page.evaluate() as a structured argument
    instead of being string-spliced into the JS source, so an unexpected
    ship_id value cannot inject script, and the JS snippet is constant.
    """
    js = """async (shipId) => {
      try {
        const r = await fetch('https://www.marinetraffic.com/en/vessels/' + shipId + '/position', {
          credentials: 'include', cache: 'no-store',
          headers: {'Accept':'application/json','X-Requested-With':'XMLHttpRequest'}
        });
        if (r.status !== 200) return {error: 'HTTP '+r.status};
        return await r.json();
      } catch(e) { return {error: e.message}; }
    }"""
    try:
        return await page.evaluate(js, ship_id)
    except Exception:
        return {'error': 'evaluate_failed'}
# ---- MAIN ----
async def main():
    """CLI entry point: select vessels in mt_bulk_staging that are missing
    MMSI, IMO or LOA, log in to MarineTraffic with Playwright, enrich each
    vessel from the free per-vessel detail endpoints, and write results
    back in batches with checkpoint/resume support.
    """
    parser = argparse.ArgumentParser(description='MT Vessel Enrichment (free endpoints)')
    parser.add_argument('--probe', action='store_true', help='Test on 10 vessels')
    parser.add_argument('--limit', type=int, default=0, help='Max vessels to process')
    parser.add_argument('--delay', type=float, default=DELAY, help='Delay between API calls (s)')
    parser.add_argument('--loop', action='store_true', help='Repeat every 12h')
    parser.add_argument('--interval', type=float, default=LOOP_INTERVAL_HOURS, help='Hours between loops')
    parser.add_argument('--reset', action='store_true', help='Clear checkpoint, start fresh')
    parser.add_argument('--with-draught', action='store_true', help='Also fetch /position for draught')
    args = parser.parse_args()
    # DB connect; also print current enrichment coverage so the operator
    # sees the backlog before anything starts.
    try:
        conn = db_connect()
        cur = conn.cursor()
        cur.execute('SELECT count(*) FROM mt_bulk_staging')
        total_db = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NULL')
        no_mmsi = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE imo IS NULL')
        no_imo = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE flag IS NULL')
        no_flag = cur.fetchone()[0]
        cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE loa IS NULL')
        no_loa = cur.fetchone()[0]
        print(f"DB: {total_db} total | {no_mmsi} no MMSI | {no_imo} no IMO | "
              f"{no_flag} no Flag | {no_loa} no LOA", flush=True)
    except Exception as e:
        print(f"DB ERROR: {e}", flush=True)
        if not IS_SERVER:
            print("Start SSH tunnel: ssh -L 15432:127.0.0.1:5432 -N root@89.19.208.158")
        return
    if args.reset and os.path.exists(CHECKPOINT):
        os.remove(CHECKPOINT)
        print("Checkpoint cleared.", flush=True)
    while True:  # loop mode: one enrichment cycle per iteration
        cp = load_checkpoint()
        processed_set = set(cp['processed_ids'])  # set: O(1) skip checks below
        # Ensure stats has all needed keys (checkpoint may be from old format)
        default_stats = {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                         'no_data': 0, 'errors': 0}
        for k, v in default_stats.items():
            if k not in cp['stats']:
                cp['stats'][k] = v
        stats = cp['stats']
        # Vessels needing enrichment: no MMSI OR no IMO OR no LOA.
        # Ordering: missing-MMSI rows first, then gt_shiptype='6' rows
        # (presumably the priority ship class — confirm against the schema).
        query = """
        SELECT ship_id, name, gt_shiptype, mmsi, imo, flag, loa, draught
        FROM mt_bulk_staging
        WHERE mmsi IS NULL OR imo IS NULL OR loa IS NULL
        ORDER BY
        CASE WHEN mmsi IS NULL THEN 0 ELSE 1 END,
        CASE WHEN gt_shiptype = '6' THEN 0 ELSE 1 END
        """
        if args.limit:
            query += f" LIMIT {args.limit}"
        elif args.probe:
            query += " LIMIT 10"
        cur.execute(query)
        all_vessels = cur.fetchall()
        # Drop rows already handled in this cycle (checkpoint resume).
        vessels = [(sid, n, gt, mm, im, fl, lo, dr)
                   for sid, n, gt, mm, im, fl, lo, dr in all_vessels
                   if sid not in processed_set]
        total = len(vessels)
        skipped = len(all_vessels) - total
        print(f"\nVessels to process: {total} (skipped {skipped} from checkpoint)", flush=True)
        if total == 0:
            print("Nothing to do!", flush=True)
            if not args.loop:
                break
            # Loop mode: sleep out the interval, then reset the checkpoint
            # so the next cycle re-scans everything from scratch.
            print(f"Sleeping {args.interval}h until next cycle...", flush=True)
            time.sleep(args.interval * 3600)
            cp['processed_ids'] = []
            cp['stats'] = {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                           'no_data': 0, 'errors': 0}
            save_checkpoint(cp)
            continue
        calls_per_vessel = 2 if args.with_draught else 1
        # ETA from the observed ~0.16s per API call plus the configured delay.
        eta_sec = total * (args.delay + 0.16 * calls_per_vessel)
        print(f"Delay: {args.delay}s | ~{0.16*calls_per_vessel:.2f}s/vessel | "
              f"ETA: ~{eta_sec/60:.0f} min", flush=True)
        # Launch Playwright (headless=False: headed mode needed for the
        # Cloudflare bypass, per module docstring).
        from playwright.async_api import async_playwright
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=False,
                args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
            )
            ctx = await browser.new_context(viewport={'width': 1440, 'height': 900})
            page = await ctx.new_page()
            if not await do_login(page):
                print("LOGIN FAILED", flush=True)
                await browser.close()
                conn.close()
                return
            # Park on a lightweight same-origin page; in-page fetch() calls
            # from here carry the logged-in session cookies.
            await page.goto('https://www.marinetraffic.com/robots.txt',
                            wait_until='load', timeout=15000)
            await asyncio.sleep(2)
            consecutive_errors = 0  # trips the MAX_CONSECUTIVE_ERRORS abort
            batch_count = 0         # vessels since the last commit/checkpoint
            t0 = time.time()
            for i, (ship_id, name, gt, mmsi, imo, flag, loa, draught) in enumerate(vessels):
                # Fetch /general
                gen = await fetch_general(page, ship_id)
                # API sometimes returns a list instead of dict — treat as error
                if isinstance(gen, list):
                    gen = gen[0] if len(gen) == 1 and isinstance(gen[0], dict) else {'error': 'unexpected_list'}
                if gen.get('error'):
                    consecutive_errors += 1
                    stats['errors'] += 1
                    if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                        # A long run of failures usually means the session
                        # died — abort rather than burn through the list.
                        print(f"\n [FATAL] {consecutive_errors} consecutive errors, aborting.",
                              flush=True)
                        break
                    if i < 20 or (i + 1) % 100 == 0:
                        print(f" [{i+1}/{total}] {name} -> ERROR {gen['error']}", flush=True)
                    # Still mark as processed so this cycle won't retry it.
                    cp['processed_ids'].append(ship_id)
                    stats['no_data'] += 1
                    batch_count += 1
                    await asyncio.sleep(args.delay)
                    continue
                consecutive_errors = 0
                # Build UPDATE: normalize API values, treating 0/''/None as absent.
                api_mmsi = str(gen.get('mmsi', '')) if gen.get('mmsi') else None
                api_imo = str(gen.get('imo', '')) if gen.get('imo') and str(gen.get('imo')) != '0' else None
                api_flag = gen.get('countryCode') or None
                api_loa = gen.get('length') or None
                api_beam = gen.get('width') or None
                api_year = gen.get('yearBuilt') or None
                api_subtype = gen.get('subtype') or None
                ct = gen.get('commercial_type') or {}
                # NOTE(review): api_market is extracted but never written to
                # the DB below — confirm whether a column update is missing.
                api_market = ct.get('size_class_name') or None
                updates = []
                params = []
                # mmsi/imo are only set when currently NULL; the remaining
                # fields use COALESCE so a NULL from the API can never
                # clobber an existing DB value.
                if api_mmsi and not mmsi:
                    updates.append("mmsi = %s")
                    params.append(api_mmsi)
                    stats['mmsi'] += 1
                if api_imo and not imo:
                    updates.append("imo = %s")
                    params.append(api_imo)
                    stats['imo'] += 1
                if api_flag:
                    updates.append("flag = COALESCE(%s, flag)")
                    params.append(api_flag)
                    if not flag:
                        stats['flag'] += 1
                if api_loa:
                    updates.append("loa = COALESCE(%s, loa)")
                    params.append(api_loa)
                    if not loa:
                        stats['loa'] += 1
                if api_beam:
                    updates.append("beam = COALESCE(%s, beam)")
                    params.append(api_beam)
                if api_year:
                    updates.append("year_built = COALESCE(%s, year_built)")
                    params.append(api_year)
                if api_subtype:
                    updates.append("shiptype = COALESCE(%s, shiptype)")
                    params.append(api_subtype)
                # Fetch /position for draught (optional second API call)
                if args.with_draught and not draught:
                    pos = await fetch_position(page, ship_id)
                    if not pos.get('error'):
                        api_draught = pos.get('draught')
                        if api_draught:
                            updates.append("draught = COALESCE(%s, draught)")
                            params.append(api_draught)
                            stats['draught'] += 1
                if updates:
                    updates.append("scraped_at = NOW()")
                    params.append(ship_id)  # fills the WHERE placeholder — must stay last
                    try:
                        conn, cur = db_safe_execute(conn, cur,
                            f"UPDATE mt_bulk_staging SET {', '.join(updates)} WHERE ship_id = %s",
                            tuple(params))
                    except Exception as e:
                        stats['errors'] += 1
                        print(f" [DB] Update error: {e}", flush=True)
                cp['processed_ids'].append(ship_id)
                batch_count += 1
                # Progress: verbose for the first 20 vessels, then every 100th.
                if i < 20 or (i + 1) % 100 == 0:
                    print(f" [{i+1}/{total}] {name} -> "
                          f"MMSI={api_mmsi or mmsi or '?'} IMO={api_imo or imo or '?'} "
                          f"Flag={api_flag or flag or '?'} LOA={api_loa or loa or '?'}",
                          flush=True)
                # Batch commit + checkpoint
                if batch_count >= BATCH:
                    conn = db_safe_commit(conn)
                    cur = conn.cursor()  # old cursor is stale if commit triggered a reconnect
                    save_checkpoint(cp)
                    elapsed = time.time() - t0
                    rate = (i + 1) / elapsed if elapsed > 0 else 0
                    remaining = (total - i - 1) / rate if rate > 0 else 0
                    print(f"\n=== CHECKPOINT [{i+1}/{total}] {elapsed:.0f}s | "
                          f"mmsi={stats['mmsi']} imo={stats['imo']} "
                          f"flag={stats['flag']} loa={stats['loa']} "
                          f"err={stats['errors']} | "
                          f"ETA: {remaining/60:.0f}m ===\n", flush=True)
                    batch_count = 0
                await asyncio.sleep(args.delay)
            # Final commit
            conn = db_safe_commit(conn)
            cur = conn.cursor()
            save_checkpoint(cp)
            # Summary: overall coverage plus the gt_shiptype='6' subset
            # (printed as "BULK" below).
            cur.execute("SELECT count(*), count(mmsi), count(imo), count(flag), count(loa) "
                        "FROM mt_bulk_staging")
            t, m, im, fl, lo = cur.fetchone()
            cur.execute("SELECT count(*), count(mmsi), count(imo) "
                        "FROM mt_bulk_staging WHERE gt_shiptype='6'")
            tb, mb, ib = cur.fetchone()
            elapsed = time.time() - t0
            print(f"\n{'='*60}", flush=True)
            print(f"DONE in {elapsed/60:.1f} minutes!", flush=True)
            print(f" Processed: {len(cp['processed_ids'])}", flush=True)
            print(f" New MMSI: {stats['mmsi']}", flush=True)
            print(f" New IMO: {stats['imo']}", flush=True)
            print(f" New Flag: {stats['flag']}", flush=True)
            print(f" New LOA: {stats['loa']}", flush=True)
            print(f" Errors: {stats['errors']}", flush=True)
            print(f"\nGLOBAL: total={t} mmsi={m} imo={im} flag={fl} loa={lo}", flush=True)
            print(f"BULK: total={tb} mmsi={mb} imo={ib}", flush=True)
            print(f"{'='*60}", flush=True)
            await browser.close()
        if not args.loop:
            break
        # Loop mode: reset checkpoint for a full fresh pass next cycle.
        cp['processed_ids'] = []
        cp['stats'] = {'mmsi': 0, 'imo': 0, 'flag': 0, 'loa': 0, 'draught': 0,
                       'no_data': 0, 'errors': 0}
        save_checkpoint(cp)
        print(f"\nSleeping {args.interval}h until next cycle...", flush=True)
        time.sleep(args.interval * 3600)
    conn.close()
if __name__ == '__main__':
    # Entry point: run one enrichment pass (or --loop cycles) to completion.
    asyncio.run(main())