543 lines
20 KiB
Python
543 lines
20 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
MT Phase 2: Get MMSI + Ownership data for vessels in mt_bulk_staging.
|
||
|
|
|
||
|
|
KEY INSIGHT: page.request.get() = 403 (CF blocks programmatic requests)
|
||
|
|
page.evaluate(fetch()) = 200 (browser's own TLS/cookies pass CF)
|
||
|
|
|
||
|
|
Strategy:
|
||
|
|
1. Login to MT Pro
|
||
|
|
2. Navigate to MT map (establishes session)
|
||
|
|
3. For each vessel without MMSI:
|
||
|
|
a. Try page.evaluate(fetch) to discover vessel info endpoint
|
||
|
|
b. Alternatively use MT search box (type name → capture autocomplete)
|
||
|
|
4. Update mt_bulk_staging with MMSI + ownership
|
||
|
|
5. Checkpoint progress
|
||
|
|
|
||
|
|
Usage: python mt_phase2_mmsi.py [--limit N] [--gt6only]
|
||
|
|
"""
|
||
|
|
import asyncio, json, sys, os, time, re, struct, hmac, hashlib, base64, argparse
|
||
|
|
import psycopg2
|
||
|
|
|
||
|
|
# Run from the script's own directory so relative paths (checkpoint file,
# discovered-endpoint dumps) resolve consistently regardless of invocation cwd.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Force UTF-8 console output where the stream supports it (TextIOWrapper,
# Python 3.7+); replace unencodable characters instead of crashing.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8', errors='replace')
|
||
|
|
|
||
|
|
# --- MarineTraffic Pro account ---
# NOTE(review): credentials, the TOTP seed, and the DB password are hard-coded
# in plain text; consider moving them to environment variables or a secrets
# store before sharing/committing this file.
EMAIL = "operation@mrlogisticcorp.com"
PASSWORD = "NKh9i8Z!7fU9jfi"
TOTP_SECRET = "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH"

# --- Runtime configuration ---
DB_URL = 'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db'
DELAY = 1.5  # seconds between vessel requests (politeness / rate limiting)
BATCH = 100  # commit to DB every N vessels
CHECKPOINT_FILE = 'mt_phase2_checkpoint.json'  # resumable-progress file
|
||
|
|
|
||
|
|
|
||
|
|
def totp(secret, for_time=None):
    """Return the 6-digit RFC 6238 TOTP code for a base32 *secret*.

    Parameters
    ----------
    secret : str
        Base32-encoded shared secret; spaces and lowercase are tolerated.
    for_time : int | float | None
        Unix timestamp to compute the code for.  Defaults to the current
        time; passing a fixed value makes the function deterministic
        (useful for testing against RFC 6238 vectors).

    Returns
    -------
    str
        Zero-padded 6-digit code (SHA-1 HMAC, 30-second period).
    """
    s = secret.upper().replace(' ', '')
    # b32decode requires padding to a multiple of 8 characters.
    pad = (-len(s)) % 8
    key = base64.b32decode(s + '=' * pad)
    t = int(time.time() if for_time is None else for_time)
    counter = t // 30  # 30-second time step per RFC 6238
    msg = struct.pack('>Q', counter)
    h = hmac.new(key, msg, hashlib.sha1).digest()
    # Dynamic truncation (RFC 4226 section 5.3).
    offset = h[-1] & 0x0f
    code = struct.unpack('>I', h[offset:offset + 4])[0] & 0x7fffffff
    return str(code % 1000000).zfill(6)
|
||
|
|
|
||
|
|
|
||
|
|
async def do_login(page):
    """Log in to MT Pro through the two-step (username, then password) form.

    Handles the optional Google Authenticator 2FA step when the site
    redirects to an MFA page.  Fixed sleeps are used instead of explicit
    waits because the form is JS-rendered.

    Returns True when the final URL looks like a logged-in MT page.
    """
    print("Login to MT Pro...")
    await page.goto('https://www.marinetraffic.com/en/users/login',
                    wait_until='domcontentloaded', timeout=30000)
    await asyncio.sleep(3)
    # Step 1: username-only form, submit, then the password form appears.
    await page.fill('input[name="username"]', EMAIL)
    await page.click('button[type="submit"]')
    await asyncio.sleep(3)
    await page.fill('input[type="password"]', PASSWORD)
    await page.click('button[type="submit"]')
    await asyncio.sleep(4)
    # 2FA branch: the login flow redirects to an 'mfa' URL when a second
    # factor is required.
    if 'mfa-login-options' in page.url or 'mfa' in page.url.lower():
        print(" 2FA: Google Authenticator...")
        try:
            await page.click('button:has-text("Google Authenticator")', timeout=5000)
        except Exception:
            # Best effort: the method-picker button may be absent when
            # Google Authenticator is already the default method.
            pass
        await asyncio.sleep(2)
        otp = totp(TOTP_SECRET)
        print(f" TOTP: {otp}")
        await page.fill('input[name="code"]', otp)
        await page.click('button[type="submit"]')
        await asyncio.sleep(5)
    # Success heuristic: back on marinetraffic.com and not stuck on the
    # identity-provider domain.  NOTE(review): assumes auth.kpler.com hosts
    # the login flow -- confirm if MT changes identity providers.
    logged_in = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
    print(f" Logged in: {logged_in} URL: {page.url}")
    return logged_in
|
||
|
|
|
||
|
|
|
||
|
|
async def discover_vessel_endpoint(page):
    """Navigate to map, click a vessel, discover what API endpoints are called.

    Attaches a response listener, loads the Rotterdam map view, clicks a few
    positions near the center (where vessel markers are likely), and records
    every JSON-looking API response.  Results are dumped to
    mt_discovered_endpoints.json and also returned as a list of dicts
    ({'url', 'size', 'body'}) for the caller.
    """
    print("\nDiscovering vessel info endpoint...")
    discovered = []

    async def capture_all(response):
        # Response listener: keep anything that looks like an API payload.
        url = response.url
        # Skip static assets
        if any(x in url for x in ['.js', '.css', '.png', '.jpg', '.gif', '.woff', '.ico']):
            return
        # Capture ALL json/api responses
        ct = response.headers.get('content-type', '')
        if 'json' in ct or 'javascript' in ct or 'getData' in url or 'vessel' in url.lower():
            try:
                body = await response.body()
                text = body.decode('utf-8', errors='replace')
                # Only record bodies that parse-ably start as JSON.
                if text.startswith('{') or text.startswith('['):
                    discovered.append({'url': url, 'size': len(body), 'body': text[:500]})
                    print(f" API: {url[:100]} ({len(body)}b)")
            except Exception as e:
                # Best effort: response body may be gone by the time we read it.
                pass

    page.on('response', capture_all)

    # Navigate to map with some vessels
    print(" Going to Rotterdam map...")
    await page.goto(
        'https://www.marinetraffic.com/en/ais/home/centerx:4.5/centery:51.9/zoom:12',
        wait_until='load', timeout=35000)
    await asyncio.sleep(10)

    # Try clicking in the center of the map (where vessels likely are)
    print(" Attempting map click...")
    try:
        map_el = await page.query_selector('canvas, .mapboxgl-canvas, #mapContainer, .map-container')
        if map_el:
            box = await map_el.bounding_box()
            if box:
                cx = box['x'] + box['width'] / 2
                cy = box['y'] + box['height'] / 2
                # Three clicks at slightly different offsets to raise the
                # odds of hitting a vessel marker.
                await page.mouse.click(cx, cy)
                await asyncio.sleep(3)
                await page.mouse.click(cx - 50, cy)
                await asyncio.sleep(2)
                await page.mouse.click(cx + 50, cy - 30)
                await asyncio.sleep(2)
    except Exception as e:
        print(f" Click error: {e}")

    # Detach so later navigation doesn't keep appending to `discovered`.
    page.remove_listener('response', capture_all)

    if discovered:
        with open('mt_discovered_endpoints.json', 'w') as f:
            json.dump(discovered, f, indent=2)
        print(f" Saved {len(discovered)} API calls -> mt_discovered_endpoints.json")

    return discovered
|
||
|
|
|
||
|
|
|
||
|
|
async def fetch_vessel_info_browser(page, ship_id, name=''):
    """
    Use page.evaluate(fetch()) to call MT vessel info API from browser context.
    Browser's TLS fingerprint + cookies bypass Cloudflare.

    Probes several candidate endpoints for *ship_id* and returns a dict
    mapping each candidate URL to {'status': int, 'body': str} (body is
    truncated to 2000 chars) or {'status': 0, 'error': str} on fetch
    failure.  Returns {} if page.evaluate itself raises.  *name* is only
    used for log messages.
    """
    # Try multiple endpoint candidates
    js_code = """
    async (shipId) => {
        const candidates = [
            '/getData/get_vessel_info/ship_id:' + shipId,
            '/en/getData/get_vessel_info/ship_id:' + shipId,
            '/getData/get_data_json_4/ship_id:' + shipId,
            '/getData/get_vessel_photo/ship_id:' + shipId,
        ];
        const results = {};
        for (const url of candidates) {
            try {
                const resp = await fetch(url, {
                    credentials: 'include',
                    headers: {
                        'X-Requested-With': 'XMLHttpRequest',
                        'Accept': 'application/json, text/javascript, */*; q=0.01',
                        'Referer': window.location.href,
                    }
                });
                const text = await resp.text();
                results[url] = {status: resp.status, body: text.substring(0, 2000)};
            } catch(e) {
                results[url] = {status: 0, error: e.message};
            }
        }
        return results;
    }
    """
    try:
        # ship_id is passed as a string so the JS side can concatenate it.
        results = await page.evaluate(js_code, str(ship_id))
        return results
    except Exception as e:
        print(f" evaluate error for {ship_id} ({name}): {e}")
        return {}
|
||
|
|
|
||
|
|
|
||
|
|
async def search_vessel_by_name_browser(page, name):
    """
    Query MT search endpoints for *name* via an in-page fetch().

    Probes a few candidate search URLs from the browser context (so
    cookies/TLS pass Cloudflare) and returns a dict mapping each candidate
    URL to {'status': int, 'body': str} (body truncated to 3000 chars) or
    {'status': 0, 'error': str}.  Returns {} if page.evaluate raises.
    """
    # MT search endpoint (typically called by search input)
    js_code = """
    async (vesselName) => {
        // Try common MT search endpoints
        const candidates = [
            '/getData/get_vessels_in_area/search_by:name/name:' + encodeURIComponent(vesselName),
            '/getData/get_vessel_info/search:' + encodeURIComponent(vesselName),
            '/en/getData/get_vessels_in_area/search:' + encodeURIComponent(vesselName),
        ];
        const results = {};
        for (const url of candidates) {
            try {
                const resp = await fetch(url, {
                    credentials: 'include',
                    headers: {
                        'X-Requested-With': 'XMLHttpRequest',
                        'Accept': 'application/json, */*',
                        'Referer': window.location.href,
                    }
                });
                const text = await resp.text();
                results[url] = {status: resp.status, body: text.substring(0, 3000)};
            } catch(e) {
                results[url] = {status: 0, error: e.message};
            }
        }
        return results;
    }
    """
    try:
        return await page.evaluate(js_code, name)
    except Exception as e:
        print(f" search error for '{name}': {e}")
        return {}
|
||
|
|
|
||
|
|
|
||
|
|
async def search_via_searchbox(page, name, captured_results):
    """Fill MT search box, wait for autocomplete, return results.

    *captured_results* is a list that a page-level response listener (set up
    by the caller) appends to; this function types into the search box to
    trigger those network calls and returns a snapshot copy of the list.
    Returns [] when no search input can be located or typing fails.
    """
    # Find search input -- try a range of selectors since the MT UI markup
    # varies between page versions.
    selectors = [
        'input[placeholder*="Search"]',
        'input[placeholder*="search"]',
        'input[type="search"]',
        '.search-input input',
        '[data-testid*="search"] input',
        '#search input',
        '.navbar input',
    ]
    search_input = None
    for sel in selectors:
        try:
            el = await page.query_selector(sel)
            if el:
                search_input = el
                break
        except Exception:
            # Invalid/unsupported selector on this page version; try the next.
            pass

    if not search_input:
        return []

    try:
        await search_input.fill('')
        await asyncio.sleep(0.3)
        # Human-like typing (80ms/char) so the autocomplete fires; name is
        # truncated to 20 chars to keep the interaction short.
        await search_input.type(name[:20], delay=80)
        await asyncio.sleep(2.0)
        # Check captured_results for search responses
        return list(captured_results)
    except Exception as e:
        print(f" searchbox error: {e}")
        return []
|
||
|
|
|
||
|
|
|
||
|
|
def parse_vessel_from_response(text, ship_id=None):
    """Extract vessel identifiers from a MarineTraffic API response body.

    Parameters
    ----------
    text : str
        Raw response body.  Must be JSON (object or array) to be parsed;
        anything else yields {}.
    ship_id : optional
        Accepted for caller symmetry; not used during parsing.

    Returns
    -------
    dict
        Whatever of 'mmsi', 'imo', 'name', 'owner' could be found
        (identifier values coerced to str), or {} when nothing usable.

    Fixes vs. previous version:
    - no longer crashes (AttributeError) when data['data'] is a list/str;
    - top-level JSON arrays (returned by some search endpoints) are
      accepted by taking their first object element;
    - rows[0] is verified to be a dict before key access.
    """
    if not text or text[0] not in '{[':
        return {}
    try:
        data = json.loads(text)
    except Exception:
        return {}

    # Some endpoints return a top-level array of vessel objects; use the
    # first element so those responses are not silently dropped.
    if isinstance(data, list):
        if not data or not isinstance(data[0], dict):
            return {}
        data = data[0]
    if not isinstance(data, dict):
        return {}

    def _dict(obj):
        # Guard: MT sometimes puts a list/str where a dict is expected.
        return obj if isinstance(obj, dict) else {}

    result = {}

    # Direct top-level fields (either case seen in the wild).
    for key in ('MMSI', 'mmsi'):
        if key in data:
            result['mmsi'] = str(data[key])
    for key in ('IMO', 'imo'):
        if key in data:
            result['imo'] = str(data[key])

    # Nested tabular shape: data.rows (first row wins).
    inner = _dict(data.get('data'))
    rows = inner.get('rows', [])
    if isinstance(rows, list) and rows and isinstance(rows[0], dict):
        row = rows[0]
        if 'MMSI' in row:
            result['mmsi'] = str(row['MMSI'])
        if 'IMO' in row:
            result['imo'] = str(row['IMO'])
        if 'SHIPNAME' in row:
            result['name'] = row['SHIPNAME']

    # Nested object shape: data.vessel or top-level vessel.
    vessel = _dict(inner.get('vessel')) or _dict(data.get('vessel'))
    if vessel:
        if 'mmsi' in vessel:
            result['mmsi'] = str(vessel['mmsi'])
        if 'imo' in vessel:
            result['imo'] = str(vessel['imo'])

    # Ownership (top-level keys only, as in responses observed so far).
    for key in ('BENEFICIAL_OWNER', 'beneficial_owner', 'owner'):
        if key in data:
            result['owner'] = str(data[key])

    return result
|
||
|
|
|
||
|
|
|
||
|
|
def load_checkpoint():
    """Load resumable progress from CHECKPOINT_FILE.

    Returns the saved checkpoint dict, or a fresh default state when no
    checkpoint exists yet.  Uses EAFP (open + FileNotFoundError) instead of
    exists()+open to avoid a check-then-use race.  A present-but-corrupt
    checkpoint still raises, deliberately, so progress is not silently
    discarded.
    """
    try:
        with open(CHECKPOINT_FILE) as f:
            return json.load(f)
    except FileNotFoundError:
        return {'processed_ship_ids': [], 'found_mmsi': 0, 'not_found': 0}
|
||
|
|
|
||
|
|
|
||
|
|
def save_checkpoint(cp):
    """Atomically persist checkpoint dict *cp* to CHECKPOINT_FILE.

    Writes to a temp file and os.replace()s it into place, so a crash or
    kill mid-write cannot leave a truncated/corrupt checkpoint behind --
    important for a long-running, resumable scrape.
    """
    tmp = CHECKPOINT_FILE + '.tmp'
    with open(tmp, 'w') as f:
        json.dump(cp, f)
    os.replace(tmp, CHECKPOINT_FILE)  # atomic on POSIX and Windows
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
    """Phase-2 driver: resolve MMSI/ownership for vessels in mt_bulk_staging.

    Flow: parse CLI args -> load checkpoint -> select MMSI-less vessels from
    Postgres -> log in to MT with Playwright -> for each vessel try three
    approaches in order (direct in-page fetch, search box, map click near the
    vessel's position) -> update the DB with periodic commits + checkpoints.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--limit', type=int, default=0, help='Max vessels to process (0=all)')
    parser.add_argument('--gt6only', action='store_true', help='Only process GT_SHIPTYPE=6 (bulk)')
    parser.add_argument('--probe', action='store_true', help='Just probe endpoints and exit')
    args = parser.parse_args()

    cp = load_checkpoint()
    processed_ids = set(cp.get('processed_ship_ids', []))
    print(f"Checkpoint: {len(processed_ids)} already processed, "
          f"{cp.get('found_mmsi', 0)} MMSI found")

    # Load vessels from DB
    conn = psycopg2.connect(DB_URL)
    cur = conn.cursor()

    query = """
        SELECT ship_id, name, lat, lon, gt_shiptype, dwt, flag
        FROM mt_bulk_staging
        WHERE mmsi IS NULL
    """
    if args.gt6only:
        query += " AND gt_shiptype = '6'"
    query += " ORDER BY gt_shiptype, dwt DESC NULLS LAST"
    if args.limit:
        # NOTE(review): LIMIT is interpolated via f-string; safe here only
        # because argparse guarantees args.limit is an int.
        query += f" LIMIT {args.limit}"

    cur.execute(query)
    vessels = cur.fetchall()
    print(f"Vessels to process: {len(vessels)} (MMSI=NULL in DB)")

    # Filter already processed (checkpoint may be ahead of the DB's mmsi column)
    vessels = [(sid, nm, lat, lon, gt, dwt, flag)
               for (sid, nm, lat, lon, gt, dwt, flag) in vessels
               if sid not in processed_ids]
    print(f"After checkpoint filter: {len(vessels)}")

    # Imported lazily so DB-only code paths don't require playwright installed.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        # headless=False: presumably chosen because Cloudflare challenges
        # pass more reliably with a visible browser -- confirm before
        # switching to headless.
        browser = await p.chromium.launch(
            headless=False,
            args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
        )
        context = await browser.new_context(
            viewport={'width': 1280, 'height': 900},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        )
        page = await context.new_page()

        # Login
        logged_in = await do_login(page)
        if not logged_in:
            print("ERROR: Login failed!")
            await browser.close()
            return

        # Stabilize session
        await asyncio.sleep(5)
        print("\nSession stabilized.")

        # ---- PROBE MODE: just discover endpoints ----
        if args.probe:
            discovered = await discover_vessel_endpoint(page)
            await browser.close()
            return

        # ---- Navigate to map (establishes AIS session) ----
        print("Loading map for session warmup...")
        await page.goto(
            'https://www.marinetraffic.com/en/ais/home/centerx:4.5/centery:51.9/zoom:10',
            wait_until='load', timeout=35000)
        await asyncio.sleep(8)
        print(f" Map loaded: {page.url}")

        # ---- Set up search response capture ----
        # Listener shared by approaches 2 and 3; cleared before each use.
        search_captures = []

        async def capture_search_responses(response):
            url = response.url
            if any(x in url for x in ['getData', 'vessel', 'search', 'query']):
                if not any(x in url for x in ['.js', '.css', '.png', '.jpg']):
                    try:
                        body = await response.body()
                        text = body.decode('utf-8', errors='replace')
                        if text.startswith('{') or text.startswith('['):
                            search_captures.append({'url': url, 'body': text})
                    except Exception:
                        # Body may already be disposed; skip this response.
                        pass

        page.on('response', capture_search_responses)

        # ---- Process vessels ----
        found_mmsi = cp.get('found_mmsi', 0)
        not_found = cp.get('not_found', 0)
        updated = 0
        batch_ids = []

        total = len(vessels)
        for i, (ship_id, name, lat, lon, gt, dwt, flag) in enumerate(vessels):
            print(f"\n[{i+1}/{total}] {name} (SHIP_ID={ship_id}, GT={gt}, DWT={dwt}, {flag})")

            search_captures.clear()
            result = {}

            # ---- Approach 1: page.evaluate fetch on vessel info endpoint ----
            fetch_results = await fetch_vessel_info_browser(page, ship_id, name)
            for url, resp_data in fetch_results.items():
                status = resp_data.get('status', 0)
                body = resp_data.get('body', '')
                if status == 200 and body.startswith('{'):
                    parsed = parse_vessel_from_response(body, ship_id)
                    if parsed.get('mmsi'):
                        result = parsed
                        print(f" Endpoint {url.split('/')[-1]}: MMSI={parsed['mmsi']}")
                        break
                else:
                    short = body[:80].replace('\n', ' ')
                    print(f" {url.split('/')[-1]}: {status} -> {short}")

            # ---- Approach 2: search box (if no MMSI yet) ----
            if not result.get('mmsi') and name:
                print(f" Trying search box for '{name}'...")
                search_captures.clear()
                await search_via_searchbox(page, name, search_captures)
                await asyncio.sleep(1.5)

                for cap in search_captures:
                    parsed = parse_vessel_from_response(cap['body'], ship_id)
                    if parsed.get('mmsi'):
                        result = parsed
                        print(f" Search box: MMSI={parsed['mmsi']}")
                        break

            # ---- Approach 3: navigate to vessel lat/lon, listen for click API ----
            if not result.get('mmsi') and lat and lon:
                # Navigate to vessel position, try JS click on vessel marker
                tile_url = (f'https://www.marinetraffic.com/en/ais/home'
                            f'/centerx:{lon}/centery:{lat}/zoom:14')
                try:
                    search_captures.clear()
                    await page.goto(tile_url, wait_until='load', timeout=25000)
                    await asyncio.sleep(8)

                    # Try clicking center of map and nearby positions
                    canvas = await page.query_selector('canvas, .mapboxgl-canvas')
                    if canvas:
                        box = await canvas.bounding_box()
                        if box:
                            cx = box['x'] + box['width'] / 2
                            cy = box['y'] + box['height'] / 2
                            # Grid of offsets around center: the vessel marker
                            # should be near the map's focal point.
                            for dx, dy in [(0, 0), (20, 0), (-20, 0), (0, 20), (0, -20),
                                           (30, 30), (-30, -30), (15, -15), (-15, 15)]:
                                await page.mouse.click(cx + dx, cy + dy)
                                await asyncio.sleep(0.5)
                    await asyncio.sleep(2)

                    for cap in search_captures:
                        parsed = parse_vessel_from_response(cap['body'], ship_id)
                        if parsed.get('mmsi'):
                            result = parsed
                            print(f" Map click: MMSI={parsed['mmsi']}")
                            break
                except Exception as e:
                    print(f" Map nav error: {e}")

            # ---- Update DB ----
            if result.get('mmsi'):
                mmsi = result['mmsi']
                imo = result.get('imo')
                owner = result.get('owner')
                operator = result.get('operator')
                # '?' placeholders rewritten to psycopg2's '%s' style; values
                # are passed as parameters (no string interpolation).
                cur.execute("""
                    UPDATE mt_bulk_staging
                    SET mmsi=?, imo=COALESCE(?, imo), owner=COALESCE(?, owner),
                        operator=COALESCE(?, operator), scraped_at=NOW()
                    WHERE ship_id=?
                """.replace('?', '%s'), (mmsi, imo, owner, operator, ship_id))
                found_mmsi += 1
                updated += 1
                print(f" -> UPDATED: MMSI={mmsi} IMO={imo} owner={owner}")
            else:
                not_found += 1
                print(f" -> No MMSI found")

            # Track processed
            processed_ids.add(ship_id)
            batch_ids.append(ship_id)

            # Commit every BATCH vessels
            if len(batch_ids) >= BATCH:
                conn.commit()
                cp['processed_ship_ids'] = list(processed_ids)
                cp['found_mmsi'] = found_mmsi
                cp['not_found'] = not_found
                save_checkpoint(cp)
                print(f"\n === CHECKPOINT: {i+1}/{total} processed, "
                      f"{found_mmsi} MMSI found, {not_found} not found ===")
                batch_ids.clear()

            await asyncio.sleep(DELAY)

        # Final commit
        conn.commit()
        cp['processed_ship_ids'] = list(processed_ids)
        cp['found_mmsi'] = found_mmsi
        cp['not_found'] = not_found
        save_checkpoint(cp)

        print(f"\n=== DONE ===")
        print(f" Processed: {len(vessels)}")
        print(f" MMSI found: {found_mmsi}")
        print(f" Not found: {not_found}")
        print(f" Updated in DB: {updated}")

        # Summary
        cur.execute("SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NOT NULL")
        total_with_mmsi = cur.fetchone()[0]
        print(f" Total with MMSI in DB: {total_with_mmsi}")

        conn.close()
        await browser.close()
        print("\nPhase 2 complete!")
|
||
|
|
|
||
|
|
|
||
|
|
# Guard the entry point so importing this module (e.g. for testing or reuse
# of totp/parse helpers) does not launch the whole scrape.
if __name__ == '__main__':
    asyncio.run(main())
|