509 lines
19 KiB
Python
509 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
MT MMSI Lookup — Get MMSI for vessels in mt_bulk_staging by SHIP_ID
|
|
|
|
Two approaches:
|
|
1. quicksearch_shipid: filter reports by SHIP_ID to get vessel with MMSI
|
|
2. /search/searchAsset?what=vessel&term=NAME: search by name to get MMSI
|
|
|
|
Then for ownership: navigate to
|
|
/en/ais/details/ships/shipid:ID/mmsi:MMSI/imo:IMO/vessel:NAME#ownership
|
|
and capture API response with owner data.
|
|
|
|
Usage:
|
|
python mt_mmsi_lookup.py --probe # test endpoint approaches on a few vessels
|
|
python mt_mmsi_lookup.py --limit N # process N vessels (start with bulk GT=6)
|
|
python mt_mmsi_lookup.py # process all vessels without MMSI
|
|
"""
|
|
import asyncio, json, sys, os, time, re, struct, hmac, hashlib, base64, argparse
|
|
import psycopg2
|
|
|
|
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
|
if hasattr(sys.stdout, 'reconfigure'):
|
|
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
|
if hasattr(sys.stderr, 'reconfigure'):
|
|
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
|
|
|
|
EMAIL = "operation@mrlogisticcorp.com"
|
|
PASSWORD = "NKh9i8Z!7fU9jfi"
|
|
TOTP_SECRET = "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH"
|
|
DB_URL = 'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db'
|
|
DELAY = 0.8 # seconds between lookups
|
|
CKPT_FILE = 'mt_mmsi_checkpoint.json'
|
|
BATCH = 200 # commit every N vessels
|
|
|
|
# Columns for reports endpoint
|
|
COLUMNS = 'flag,shipname,imo,ship_type,lat_of_latest_position,lon_of_latest_position,speed'
|
|
|
|
|
|
def totp(secret):
|
|
s = secret.upper().replace(' ', '')
|
|
pad = (-len(s)) % 8
|
|
key = base64.b32decode(s + '=' * pad)
|
|
counter = int(time.time()) // 30
|
|
msg = struct.pack('>Q', counter)
|
|
h = hmac.new(key, msg, hashlib.sha1).digest()
|
|
offset = h[-1] & 0x0f
|
|
code = struct.unpack('>I', h[offset:offset + 4])[0] & 0x7fffffff
|
|
return str(code % 1000000).zfill(6)
|
|
|
|
|
|
async def do_login(page):
|
|
print("Login to MT Pro...")
|
|
await page.goto('https://www.marinetraffic.com/en/users/login',
|
|
wait_until='domcontentloaded', timeout=30000)
|
|
await asyncio.sleep(3)
|
|
await page.fill('input[name="username"]', EMAIL)
|
|
await page.click('button[type="submit"]')
|
|
await asyncio.sleep(3)
|
|
await page.fill('input[type="password"]', PASSWORD)
|
|
await page.click('button[type="submit"]')
|
|
await asyncio.sleep(4)
|
|
if 'mfa-login-options' in page.url or 'mfa' in page.url.lower():
|
|
print(" 2FA: Google Authenticator...")
|
|
try:
|
|
await page.click('button:has-text("Google Authenticator")', timeout=5000)
|
|
except Exception:
|
|
pass
|
|
await asyncio.sleep(2)
|
|
otp = totp(TOTP_SECRET)
|
|
print(f" TOTP: {otp}")
|
|
await page.fill('input[name="code"]', otp)
|
|
await page.click('button[type="submit"]')
|
|
await asyncio.sleep(5)
|
|
logged_in = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
|
|
print(f" Logged in: {logged_in} URL: {page.url}")
|
|
return logged_in
|
|
|
|
|
|
async def lookup_mmsi_by_shipid(page, ship_id):
|
|
"""
|
|
Filter reports endpoint by SHIP_ID using quicksearch_shipid param.
|
|
Returns {mmsi, imo, name, ...} or None.
|
|
"""
|
|
url = (f'https://www.marinetraffic.com/en/reports/?asset_type=vessels'
|
|
f'&columns={COLUMNS}&quicksearch_shipid={ship_id}')
|
|
js = f"""
|
|
async () => {{
|
|
try {{
|
|
const resp = await fetch({json.dumps(url)}, {{
|
|
credentials: 'include',
|
|
headers: {{
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
'Accept': 'application/json',
|
|
'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels',
|
|
}}
|
|
}});
|
|
const text = await resp.text();
|
|
const parsed = JSON.parse(text);
|
|
return {{
|
|
status: resp.status,
|
|
rows: (parsed.data || []).length,
|
|
data: (parsed.data || []).slice(0, 5),
|
|
}};
|
|
}} catch(e) {{
|
|
return {{status: 0, error: e.message}};
|
|
}}
|
|
}}
|
|
"""
|
|
try:
|
|
result = await page.evaluate(js)
|
|
return result
|
|
except Exception as e:
|
|
return {'error': str(e)}
|
|
|
|
|
|
async def lookup_mmsi_by_name(page, name):
|
|
"""
|
|
Use MT search autocomplete endpoint to find vessel by name.
|
|
Returns list of matching vessels with MMSI.
|
|
"""
|
|
encoded_name = name[:30].replace('"', '').replace("'", '')
|
|
url = f'https://www.marinetraffic.com/search/searchAsset?what=vessel&term={encoded_name}'
|
|
js = f"""
|
|
async () => {{
|
|
try {{
|
|
const resp = await fetch({json.dumps(url)}, {{
|
|
credentials: 'include',
|
|
headers: {{
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
'Accept': 'application/json',
|
|
'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels',
|
|
}}
|
|
}});
|
|
const text = await resp.text();
|
|
const parsed = JSON.parse(text);
|
|
return {{status: resp.status, data: parsed}};
|
|
}} catch(e) {{
|
|
return {{status: 0, error: e.message}};
|
|
}}
|
|
}}
|
|
"""
|
|
try:
|
|
result = await page.evaluate(js)
|
|
return result
|
|
except Exception as e:
|
|
return {'error': str(e)}
|
|
|
|
|
|
async def get_ownership_page(page, ship_id, mmsi, imo, name):
|
|
"""
|
|
Navigate to vessel #ownership page and capture API response.
|
|
Returns ownership dict or None.
|
|
"""
|
|
# Build URL with all identifiers
|
|
name_slug = re.sub(r'[^A-Z0-9]', '_', name.upper())[:30]
|
|
imo_part = f'/imo:{imo}' if imo else ''
|
|
url = (f'https://www.marinetraffic.com/en/ais/details/ships'
|
|
f'/shipid:{ship_id}/mmsi:{mmsi}{imo_part}/vessel:{name_slug}')
|
|
|
|
ownership_data = {}
|
|
captured = []
|
|
|
|
async def capture_ownership(response):
|
|
r_url = response.url
|
|
# Look for ownership API call
|
|
if 'ownership' in r_url.lower() or 'company' in r_url.lower():
|
|
try:
|
|
body = await response.body()
|
|
text = body.decode('utf-8', errors='replace')
|
|
if text.startswith('{') or text.startswith('['):
|
|
captured.append({'url': r_url, 'body': text})
|
|
except Exception:
|
|
pass
|
|
# Also capture any vessel info API calls
|
|
elif 'getData' in r_url and not any(x in r_url for x in ['.js', '.css']):
|
|
try:
|
|
body = await response.body()
|
|
text = body.decode('utf-8', errors='replace')
|
|
if 'MMSI' in text or 'owner' in text.lower():
|
|
captured.append({'url': r_url, 'body': text[:3000]})
|
|
except Exception:
|
|
pass
|
|
|
|
page.on('response', capture_ownership)
|
|
try:
|
|
await page.goto(url, wait_until='load', timeout=25000)
|
|
await asyncio.sleep(5)
|
|
except Exception as e:
|
|
print(f" goto error: {e}")
|
|
page.remove_listener('response', capture_ownership)
|
|
|
|
# Also try via page.evaluate on ownership endpoint
|
|
if not captured:
|
|
js_ownership = f"""
|
|
async () => {{
|
|
const urls = [
|
|
'/getData/get_vessel_info/shipid:{ship_id}/mmsi:{mmsi}',
|
|
'/en/getData/get_vessel_ownership/mmsi:{mmsi}',
|
|
'/getData/get_vessel_ownership/mmsi:{mmsi}',
|
|
];
|
|
const results = [];
|
|
for (const u of urls) {{
|
|
try {{
|
|
const r = await fetch(u, {{
|
|
credentials: 'include',
|
|
headers: {{
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
'Accept': 'application/json',
|
|
}}
|
|
}});
|
|
const text = await r.text();
|
|
results.push({{url: u, status: r.status, body: text.substring(0, 2000)}});
|
|
}} catch(e) {{
|
|
results.push({{url: u, error: e.message}});
|
|
}}
|
|
}}
|
|
return results;
|
|
}}
|
|
"""
|
|
try:
|
|
ownership_results = await page.evaluate(js_ownership)
|
|
for r in ownership_results:
|
|
if r.get('status') == 200 and r.get('body', '').startswith('{'):
|
|
captured.append(r)
|
|
except Exception:
|
|
pass
|
|
|
|
# Parse captured responses
|
|
for cap in captured:
|
|
text = cap.get('body', '')
|
|
try:
|
|
data = json.loads(text)
|
|
except Exception:
|
|
continue
|
|
# Extract ownership fields
|
|
for k in ['BENEFICIAL_OWNER', 'beneficial_owner', 'manager']:
|
|
if data.get(k):
|
|
ownership_data['owner'] = str(data[k])
|
|
break
|
|
for k in ['OPERATOR', 'operator', 'charterer']:
|
|
if data.get(k):
|
|
ownership_data['operator'] = str(data[k])
|
|
break
|
|
# Try nested
|
|
rows = data.get('data', {}).get('rows', []) or data.get('rows', [])
|
|
if rows and isinstance(rows, list):
|
|
row = rows[0]
|
|
for k in ['MANAGER', 'BENEFICIAL_OWNER', 'owner']:
|
|
if row.get(k):
|
|
ownership_data['owner'] = str(row[k])
|
|
break
|
|
for k in ['OPERATOR', 'operator']:
|
|
if row.get(k):
|
|
ownership_data['operator'] = str(row[k])
|
|
break
|
|
|
|
# Also check current page URL - if MT loaded vessel page, URL contains mmsi
|
|
current_url = page.url
|
|
if 'mmsi:' in current_url and mmsi not in current_url:
|
|
# Extract MMSI from URL (for cross-check)
|
|
m = re.search(r'mmsi:(\d+)', current_url)
|
|
if m:
|
|
ownership_data['mmsi_from_url'] = m.group(1)
|
|
|
|
return ownership_data if ownership_data else None
|
|
|
|
|
|
async def main():
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('--probe', action='store_true', help='Test approaches on sample vessels')
|
|
parser.add_argument('--limit', type=int, default=0, help='Max vessels to process')
|
|
parser.add_argument('--gt6only', action='store_true', help='Only bulk carriers (GT=6)')
|
|
parser.add_argument('--ownership', action='store_true', help='Also scrape ownership pages')
|
|
args = parser.parse_args()
|
|
|
|
conn = psycopg2.connect(DB_URL)
|
|
cur = conn.cursor()
|
|
|
|
from playwright.async_api import async_playwright
|
|
|
|
async with async_playwright() as p:
|
|
browser = await p.chromium.launch(
|
|
headless=False,
|
|
args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
|
|
)
|
|
context = await browser.new_context(
|
|
viewport={'width': 1440, 'height': 900},
|
|
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
|
'(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
|
)
|
|
page = await context.new_page()
|
|
|
|
# Login
|
|
logged_in = await do_login(page)
|
|
if not logged_in:
|
|
print("ERROR: Login failed!")
|
|
await browser.close()
|
|
conn.close()
|
|
return
|
|
|
|
await asyncio.sleep(5)
|
|
|
|
# Warm up session on data page
|
|
await page.goto('https://www.marinetraffic.com/en/data/?asset_type=vessels',
|
|
wait_until='load', timeout=40000)
|
|
await asyncio.sleep(5)
|
|
print(f"Session warmed up: {page.url}")
|
|
|
|
# ---- PROBE MODE ----
|
|
if args.probe:
|
|
print("\n=== PROBE: Testing MMSI lookup approaches ===")
|
|
|
|
# Get a few test vessels
|
|
cur.execute("""
|
|
SELECT ship_id, name, gt_shiptype, dwt, flag
|
|
FROM mt_bulk_staging
|
|
WHERE mmsi IS NULL AND lat IS NOT NULL
|
|
ORDER BY gt_shiptype, dwt DESC NULLS LAST
|
|
LIMIT 10
|
|
""")
|
|
test_vessels = cur.fetchall()
|
|
print(f"Test vessels: {len(test_vessels)}")
|
|
|
|
for ship_id, name, gt, dwt, flag in test_vessels:
|
|
print(f"\n [{gt}] {name} (SHIP_ID={ship_id}, DWT={dwt}, {flag})")
|
|
|
|
# Approach 1: quicksearch_shipid
|
|
r1 = await lookup_mmsi_by_shipid(page, ship_id)
|
|
rows = r1.get('data', [])
|
|
if rows:
|
|
row = rows[0]
|
|
print(f" quicksearch_shipid: MMSI={row.get('MMSI')} IMO={row.get('IMO')} name={row.get('SHIPNAME')}")
|
|
print(f" Full row: {json.dumps(row)[:300]}")
|
|
else:
|
|
print(f" quicksearch_shipid: {r1.get('rows',0)} rows, error={r1.get('error','')}")
|
|
|
|
await asyncio.sleep(0.5)
|
|
|
|
# Approach 2: name search
|
|
if name:
|
|
r2 = await lookup_mmsi_by_name(page, name)
|
|
search_data = r2.get('data', [])
|
|
if isinstance(search_data, list) and search_data:
|
|
print(f" name search: {len(search_data)} results")
|
|
print(f" First result: {json.dumps(search_data[0])[:300]}")
|
|
elif isinstance(search_data, dict):
|
|
print(f" name search dict: {json.dumps(search_data)[:300]}")
|
|
else:
|
|
print(f" name search: no results, error={r2.get('error','')}")
|
|
|
|
await asyncio.sleep(0.5)
|
|
|
|
await browser.close()
|
|
conn.close()
|
|
return
|
|
|
|
# ---- MAIN SCRAPE ----
|
|
# Load checkpoint
|
|
processed_ids = set()
|
|
if os.path.exists(CKPT_FILE):
|
|
with open(CKPT_FILE) as f:
|
|
cp = json.load(f)
|
|
processed_ids = set(cp.get('processed', []))
|
|
print(f"Checkpoint: {len(processed_ids)} already processed")
|
|
|
|
# Load vessels to process
|
|
query = """
|
|
SELECT ship_id, name, gt_shiptype, dwt, flag, lat, lon, mmsi
|
|
FROM mt_bulk_staging
|
|
WHERE mmsi IS NULL
|
|
"""
|
|
if args.gt6only:
|
|
query += " AND gt_shiptype = '6'"
|
|
query += " ORDER BY gt_shiptype, dwt DESC NULLS LAST"
|
|
if args.limit:
|
|
query += f" LIMIT {args.limit}"
|
|
|
|
cur.execute(query)
|
|
vessels = [(r[0], r[1], r[2], r[3], r[4], r[5], r[6])
|
|
for r in cur.fetchall()
|
|
if r[0] not in processed_ids]
|
|
print(f"Vessels to process: {len(vessels)}")
|
|
|
|
found_mmsi = 0
|
|
not_found_mmsi = 0
|
|
found_owner = 0
|
|
batch_count = 0
|
|
total = len(vessels)
|
|
|
|
for i, (ship_id, name, gt, dwt, flag, lat, lon) in enumerate(vessels):
|
|
print(f"\n[{i+1}/{total}] [{gt}] {name} ({flag}, DWT={dwt})")
|
|
|
|
mmsi = None
|
|
imo = None
|
|
|
|
# ---- Try quicksearch_shipid ----
|
|
r = await lookup_mmsi_by_shipid(page, ship_id)
|
|
rows = r.get('data', [])
|
|
if rows:
|
|
row = rows[0]
|
|
# Verify it's the same vessel (name match or position match)
|
|
api_name = (row.get('SHIPNAME') or '').upper()
|
|
our_name = (name or '').upper()
|
|
match_score = 0
|
|
if api_name and our_name:
|
|
if api_name == our_name:
|
|
match_score = 3
|
|
elif our_name[:10] in api_name or api_name[:10] in our_name:
|
|
match_score = 2
|
|
else:
|
|
match_score = 0
|
|
|
|
if match_score >= 2 or (len(rows) == 1):
|
|
mmsi = str(row.get('MMSI', '')) or None
|
|
imo = str(row.get('IMO', '')) or None
|
|
print(f" quicksearch: MMSI={mmsi} IMO={imo} (match={match_score})")
|
|
else:
|
|
print(f" quicksearch: {len(rows)} rows, name mismatch: {api_name} vs {our_name}")
|
|
|
|
# ---- Try name search if no MMSI yet ----
|
|
if not mmsi and name:
|
|
r2 = await lookup_mmsi_by_name(page, name)
|
|
search_data = r2.get('data', [])
|
|
if isinstance(search_data, list):
|
|
# Find best match by name
|
|
for item in search_data[:5]:
|
|
item_name = str(item.get('name', item.get('NAME', item.get('title', '')))).upper()
|
|
if item_name == name.upper() or name.upper()[:15] in item_name:
|
|
# Extract MMSI
|
|
for k in ['mmsi', 'MMSI', 'id']:
|
|
if item.get(k) and str(item[k]).isdigit():
|
|
mmsi = str(item[k])
|
|
break
|
|
if mmsi:
|
|
print(f" name search: MMSI={mmsi} ({item_name})")
|
|
break
|
|
if not mmsi:
|
|
print(f" name search: no match")
|
|
|
|
# ---- Update DB ----
|
|
if mmsi:
|
|
found_mmsi += 1
|
|
cur.execute("""
|
|
UPDATE mt_bulk_staging
|
|
SET mmsi=%s, imo=COALESCE(%s, imo), scraped_at=NOW()
|
|
WHERE ship_id=%s
|
|
""", (mmsi, imo, ship_id))
|
|
|
|
# ---- Ownership (if --ownership flag) ----
|
|
if args.ownership and imo:
|
|
print(f" Getting ownership...")
|
|
own = await get_ownership_page(page, ship_id, mmsi, imo, name or 'VESSEL')
|
|
if own:
|
|
found_owner += 1
|
|
cur.execute("""
|
|
UPDATE mt_bulk_staging
|
|
SET owner=COALESCE(%s, owner),
|
|
operator=COALESCE(%s, operator),
|
|
scraped_at=NOW()
|
|
WHERE ship_id=%s
|
|
""", (own.get('owner'), own.get('operator'), ship_id))
|
|
print(f" owner={own.get('owner')} op={own.get('operator')}")
|
|
else:
|
|
print(f" No ownership data found")
|
|
await asyncio.sleep(1.0)
|
|
else:
|
|
not_found_mmsi += 1
|
|
print(f" No MMSI found")
|
|
|
|
processed_ids.add(ship_id)
|
|
batch_count += 1
|
|
|
|
# Commit and checkpoint
|
|
if batch_count >= BATCH:
|
|
conn.commit()
|
|
with open(CKPT_FILE, 'w') as f:
|
|
json.dump({'processed': list(processed_ids)}, f)
|
|
print(f"\n=== CHECKPOINT {i+1}/{total}: "
|
|
f"found={found_mmsi}, not_found={not_found_mmsi}, owners={found_owner} ===")
|
|
batch_count = 0
|
|
|
|
await asyncio.sleep(DELAY)
|
|
|
|
# Final commit
|
|
conn.commit()
|
|
with open(CKPT_FILE, 'w') as f:
|
|
json.dump({'processed': list(processed_ids)}, f)
|
|
|
|
# Summary
|
|
cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NOT NULL')
|
|
total_mmsi = cur.fetchone()[0]
|
|
cur.execute("SELECT count(*) FROM mt_bulk_staging WHERE gt_shiptype='6' AND mmsi IS NOT NULL")
|
|
bulk_mmsi = cur.fetchone()[0]
|
|
|
|
print(f"\n=== DONE ===")
|
|
print(f" Processed: {len(vessels)}")
|
|
print(f" MMSI found this run: {found_mmsi}")
|
|
print(f" Not found: {not_found_mmsi}")
|
|
print(f" Owners found: {found_owner}")
|
|
print(f" Total with MMSI in DB: {total_mmsi}")
|
|
print(f" Bulk carriers with MMSI: {bulk_mmsi}")
|
|
|
|
conn.close()
|
|
await browser.close()
|
|
print("\nLookup complete!")
|
|
|
|
|
|
asyncio.run(main())
|