montana/Русский/Логистика/mt_mmsi_lookup.py

509 lines
19 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
MT MMSI Lookup - get the MMSI for vessels in mt_bulk_staging by SHIP_ID.
Two approaches:
1. quicksearch_shipid: filter reports by SHIP_ID to get vessel with MMSI
2. /search/searchAsset?what=vessel&term=NAME: search by name to get MMSI
Then for ownership: navigate to
/en/ais/details/ships/shipid:ID/mmsi:MMSI/imo:IMO/vessel:NAME#ownership
and capture API response with owner data.
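Usage:
python mt_mmsi_lookup.py --probe # test both lookup approaches on a few sample vessels
python mt_mmsi_lookup.py --limit N # process N vessels (start with bulk GT=6)
python mt_mmsi_lookup.py --gt6only # only vessels with gt_shiptype = '6' (bulk carriers)
python mt_mmsi_lookup.py --ownership # also scrape the ownership page once an MMSI and IMO are known
python mt_mmsi_lookup.py # process all vessels without MMSI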
"""
import asyncio, json, sys, os, time, re, struct, hmac, hashlib, base64, argparse
import psycopg2
# Work from the script's own directory so the relative CKPT_FILE path below
# always resolves next to this file, regardless of the caller's cwd.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Force UTF-8 with replacement on both output streams so printing non-ASCII
# vessel names cannot raise UnicodeEncodeError on narrow consoles.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8', errors='replace')

# SECURITY: credentials are read from the environment instead of being
# hard-coded. The values that used to live in this file were committed to
# version control and must be treated as compromised and rotated.
_REQUIRED_ENV = ('MT_EMAIL', 'MT_PASSWORD', 'MT_TOTP_SECRET', 'MT_DB_URL')
EMAIL = os.environ.get('MT_EMAIL', '')
PASSWORD = os.environ.get('MT_PASSWORD', '')
TOTP_SECRET = os.environ.get('MT_TOTP_SECRET', '')  # base32 seed for totp()
DB_URL = os.environ.get('MT_DB_URL', '')  # postgresql://user:pass@host:port/db

# Fail fast with a clear message when run as a script without configuration.
_missing_env = [name for name in _REQUIRED_ENV if not os.environ.get(name)]
if _missing_env and __name__ == '__main__':
    sys.exit('Missing required environment variables: ' + ', '.join(_missing_env))

DELAY = 0.8  # seconds between lookups
CKPT_FILE = 'mt_mmsi_checkpoint.json'
BATCH = 200  # commit every N vessels

# Columns for reports endpoint
COLUMNS = 'flag,shipname,imo,ship_type,lat_of_latest_position,lon_of_latest_position,speed'
def totp(secret, at=None, step=30, digits=6):
    """Return the time-based one-time password (RFC 6238, HMAC-SHA1) for *secret*.

    Args:
        secret: Base32-encoded shared secret. Case, spaces, dashes and any
            existing ``=`` padding are ignored.
        at: Unix timestamp to compute the code for; defaults to the current time.
        step: Length of one time window in seconds.
        digits: Number of digits in the returned code.

    Returns:
        The code as a zero-padded decimal string of length *digits*.

    Raises:
        binascii.Error: if *secret* is not valid base32.
    """
    normalized = secret.upper().replace(' ', '').replace('-', '').rstrip('=')
    # b32decode requires the input length to be a multiple of 8.
    key = base64.b32decode(normalized + '=' * (-len(normalized) % 8))
    moment = time.time() if at is None else at
    counter = int(moment) // step
    digest = hmac.new(key, struct.pack('>Q', counter), hashlib.sha1).digest()
    # Dynamic truncation: the low nibble of the last byte selects a 4-byte
    # window, whose top bit is masked off to get a non-negative 31-bit integer.
    offset = digest[-1] & 0x0F
    truncated = struct.unpack('>I', digest[offset:offset + 4])[0] & 0x7FFFFFFF
    return str(truncated % 10 ** digits).zfill(digits)
async def do_login(page):
    """Log in to MarineTraffic through the browser UI, answering the TOTP prompt if shown.

    The flow is: open the login page, submit EMAIL, submit PASSWORD, and - when
    the resulting URL contains ``mfa`` - submit a code generated from
    TOTP_SECRET. Fixed sleeps are used between steps to let each page settle.

    Args:
        page: Playwright page to drive.

    Returns:
        True when the final URL is on marinetraffic.com and not on the
        ``auth.kpler`` host, otherwise False.
    """
    print("Login to MT Pro...")
    await page.goto('https://www.marinetraffic.com/en/users/login',
                    wait_until='domcontentloaded', timeout=30000)
    await asyncio.sleep(3)

    await page.fill('input[name="username"]', EMAIL)
    await page.click('button[type="submit"]')
    await asyncio.sleep(3)

    await page.fill('input[type="password"]', PASSWORD)
    await page.click('button[type="submit"]')
    await asyncio.sleep(4)

    # A case-insensitive 'mfa' check also covers the 'mfa-login-options' URL.
    if 'mfa' in page.url.lower():
        print(" 2FA: Google Authenticator...")
        try:
            await page.click('button:has-text("Google Authenticator")', timeout=5000)
        except Exception as exc:
            # Best effort: the flow continues to the code field even when the
            # method button could not be clicked, but say so instead of hiding it.
            print(f" 2FA method button not clicked ({type(exc).__name__}); continuing")
        await asyncio.sleep(2)
        # The one-time code is a credential: never echo it to stdout or logs.
        await page.fill('input[name="code"]', totp(TOTP_SECRET))
        print(" TOTP submitted")
        await page.click('button[type="submit"]')
        await asyncio.sleep(5)

    logged_in = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
    print(f" Logged in: {logged_in} URL: {page.url}")
    return logged_in
# JavaScript run inside the logged-in page so the request carries the browser
# session. It receives [url, summarize] as its single argument, so no Python
# value is ever spliced into the JS source text.
_FETCH_JSON_JS = """
async ([url, summarize]) => {
    try {
        const resp = await fetch(url, {
            credentials: 'include',
            headers: {
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'application/json',
                'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels',
            }
        });
        const parsed = JSON.parse(await resp.text());
        if (!summarize) {
            return {status: resp.status, data: parsed};
        }
        const rows = parsed.data || [];
        return {status: resp.status, rows: rows.length, data: rows.slice(0, 5)};
    } catch (e) {
        return {status: 0, error: e.message};
    }
}
"""


async def _fetch_json(page, url, summarize=False):
    """Fetch *url* from inside *page* and return the decoded result dict.

    With ``summarize`` the response's ``data`` array is reduced to its length
    (``rows``) and its first five entries (``data``); otherwise the whole
    parsed body is returned under ``data``. Failures never raise: a fetch or
    JSON error yields ``{'status': 0, 'error': ...}`` and a failure of
    ``page.evaluate`` itself yields ``{'error': ...}``.
    """
    try:
        return await page.evaluate(_FETCH_JSON_JS, [url, summarize])
    except Exception as e:
        return {'error': str(e)}


async def lookup_mmsi_by_shipid(page, ship_id):
    """
    Query the reports endpoint filtered by SHIP_ID (quicksearch_shipid param).

    Returns a dict ``{'status', 'rows', 'data'}`` where ``data`` holds at most
    the first five result rows, or a dict with an ``'error'`` key on failure.
    """
    from urllib.parse import quote

    url = ('https://www.marinetraffic.com/en/reports/?asset_type=vessels'
           f'&columns={COLUMNS}&quicksearch_shipid={quote(str(ship_id), safe="")}')
    return await _fetch_json(page, url, summarize=True)


async def lookup_mmsi_by_name(page, name):
    """
    Query the MT search autocomplete endpoint for vessels matching *name*.

    Only the first 30 characters of the name are used and quote characters are
    dropped, as before; the remainder is percent-encoded so characters such as
    ``&``, ``#`` or ``+`` cannot truncate or alter the query string.

    Returns ``{'status', 'data'}`` with the parsed response body under
    ``data``, or a dict with an ``'error'`` key on failure.
    """
    from urllib.parse import quote

    term = name[:30].replace('"', '').replace("'", '')
    url = ('https://www.marinetraffic.com/search/searchAsset'
           f'?what=vessel&term={quote(term, safe="")}')
    return await _fetch_json(page, url)
# Fallback executed inside the page: fetch each candidate URL with the session
# cookies and hand back the full response text for parsing on the Python side.
_OWNERSHIP_FALLBACK_JS = """
async (urls) => {
    const results = [];
    for (const u of urls) {
        try {
            const r = await fetch(u, {
                credentials: 'include',
                headers: {
                    'X-Requested-With': 'XMLHttpRequest',
                    'Accept': 'application/json',
                }
            });
            results.push({url: u, status: r.status, body: await r.text()});
        } catch (e) {
            results.push({url: u, error: e.message});
        }
    }
    return results;
}
"""


def _first_present(mapping, keys):
    """Return str() of the first truthy value found under *keys*, else None."""
    for key in keys:
        if mapping.get(key):
            return str(mapping[key])
    return None


def _merge_ownership(payload, ownership_data):
    """Copy owner/operator fields from one decoded JSON payload into *ownership_data*.

    Top-level keys are read first, then the first entry of ``data.rows`` (or
    ``rows``); later writes overwrite earlier ones. Payloads or rows that are
    not dicts are ignored instead of raising AttributeError.
    """
    if not isinstance(payload, dict):
        return
    owner = _first_present(payload, ('BENEFICIAL_OWNER', 'beneficial_owner', 'manager'))
    if owner:
        ownership_data['owner'] = owner
    operator = _first_present(payload, ('OPERATOR', 'operator', 'charterer'))
    if operator:
        ownership_data['operator'] = operator

    nested = payload.get('data')
    rows = (nested.get('rows') if isinstance(nested, dict) else None) or payload.get('rows')
    if isinstance(rows, list) and rows and isinstance(rows[0], dict):
        row = rows[0]
        owner = _first_present(row, ('MANAGER', 'BENEFICIAL_OWNER', 'owner'))
        if owner:
            ownership_data['owner'] = owner
        operator = _first_present(row, ('OPERATOR', 'operator'))
        if operator:
            ownership_data['operator'] = operator


async def get_ownership_page(page, ship_id, mmsi, imo, name):
    """
    Open the vessel details page and collect owner/operator data from the
    JSON responses it triggers.

    Responses whose URL mentions ``ownership``/``company``, or ``getData``
    responses mentioning ``MMSI``/``owner``, are captured while the page loads.
    If nothing was captured, three candidate ``getData`` URLs are fetched
    directly from inside the page.

    Args:
        page: Playwright page with a logged-in session.
        ship_id: MarineTraffic SHIP_ID of the vessel.
        mmsi: MMSI used in the URL and for the cross-check.
        imo: IMO number, or a falsy value to omit it from the URL.
        name: Vessel name; upper-cased and slugified for the URL.

    Returns:
        A dict with any of ``owner``, ``operator`` and ``mmsi_from_url`` (set
        when the final page URL carries a different MMSI), or None when
        nothing was found.
    """
    # Build URL with all identifiers
    name_slug = re.sub(r'[^A-Z0-9]', '_', name.upper())[:30]
    imo_part = f'/imo:{imo}' if imo else ''
    url = ('https://www.marinetraffic.com/en/ais/details/ships'
           f'/shipid:{ship_id}/mmsi:{mmsi}{imo_part}/vessel:{name_slug}')
    ownership_data = {}
    captured = []

    async def capture_ownership(response):
        r_url = response.url
        lowered = r_url.lower()
        is_ownership = 'ownership' in lowered or 'company' in lowered
        is_vessel_info = ('getData' in r_url
                          and not any(x in r_url for x in ('.js', '.css')))
        if not (is_ownership or is_vessel_info):
            return
        try:
            text = (await response.body()).decode('utf-8', errors='replace')
        except Exception:
            # Best effort: a body that cannot be read is simply not captured.
            return
        if is_ownership:
            if text.startswith(('{', '[')):
                captured.append({'url': r_url, 'body': text})
        elif 'MMSI' in text or 'owner' in text.lower():
            # Keep the full body: a truncated one would fail json.loads below.
            captured.append({'url': r_url, 'body': text})

    page.on('response', capture_ownership)
    try:
        await page.goto(url, wait_until='load', timeout=25000)
        await asyncio.sleep(5)
    except Exception as e:
        print(f" goto error: {e}")
    finally:
        # Always detach, otherwise listeners pile up across vessels.
        page.remove_listener('response', capture_ownership)

    # Also try the candidate ownership endpoints directly
    if not captured:
        fallback_urls = [
            f'/getData/get_vessel_info/shipid:{ship_id}/mmsi:{mmsi}',
            f'/en/getData/get_vessel_ownership/mmsi:{mmsi}',
            f'/getData/get_vessel_ownership/mmsi:{mmsi}',
        ]
        try:
            fallback_results = await page.evaluate(_OWNERSHIP_FALLBACK_JS, fallback_urls)
        except Exception as e:
            print(f" ownership fallback error: {e}")
            fallback_results = []
        for r in fallback_results or []:
            if r.get('status') == 200 and r.get('body', '').startswith('{'):
                captured.append(r)

    # Parse captured responses
    for cap in captured:
        try:
            payload = json.loads(cap.get('body', ''))
        except Exception:
            continue
        _merge_ownership(payload, ownership_data)

    # If the site landed on a vessel page with a different MMSI, record it
    # so the caller can cross-check.
    current_url = page.url
    if 'mmsi:' in current_url and str(mmsi) not in current_url:
        m = re.search(r'mmsi:(\d+)', current_url)
        if m:
            ownership_data['mmsi_from_url'] = m.group(1)
    return ownership_data if ownership_data else None
def _clean_id(value):
    """Return *value* as a non-empty string, or None when it is missing.

    None, empty strings and zero all count as missing, so a JSON null can no
    longer end up in the database as the literal string 'None'.
    """
    if value is None:
        return None
    text = str(value).strip()
    return text if text and text != '0' else None


def _load_checkpoint(path):
    """Return the set of already-processed ship ids stored in *path* (empty if absent)."""
    if not os.path.exists(path):
        return set()
    with open(path, encoding='utf-8') as f:
        processed = set(json.load(f).get('processed', []))
    print(f"Checkpoint: {len(processed)} already processed")
    return processed


def _save_checkpoint(path, processed_ids):
    """Write the checkpoint via a temp file + rename so a crash cannot leave it half-written."""
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump({'processed': list(processed_ids)}, f)
    os.replace(tmp_path, path)


def _pick_quicksearch_match(rows, name):
    """Return (mmsi, imo) from the first quicksearch row if it plausibly is our vessel.

    The row is accepted on an exact name match, on a 10-character prefix
    overlap, or when it is the only row returned.
    """
    row = rows[0]
    if not isinstance(row, dict):
        return None, None
    api_name = (row.get('SHIPNAME') or '').upper()
    our_name = (name or '').upper()
    match_score = 0
    if api_name and our_name:
        if api_name == our_name:
            match_score = 3
        elif our_name[:10] in api_name or api_name[:10] in our_name:
            match_score = 2
    if match_score >= 2 or len(rows) == 1:
        mmsi = _clean_id(row.get('MMSI'))
        imo = _clean_id(row.get('IMO'))
        print(f" quicksearch: MMSI={mmsi} IMO={imo} (match={match_score})")
        return mmsi, imo
    print(f" quicksearch: {len(rows)} rows, name mismatch: {api_name} vs {our_name}")
    return None, None


def _pick_name_match(items, name):
    """Return the MMSI of the first of the top five search hits whose name matches, else None."""
    wanted = name.upper()
    for item in items[:5]:
        if not isinstance(item, dict):
            continue
        item_name = str(item.get('name', item.get('NAME', item.get('title', '')))).upper()
        if item_name != wanted and wanted[:15] not in item_name:
            continue
        # NOTE(review): 'id' is accepted as an MMSI fallback; confirm it is not
        # the SHIP_ID in the search response before trusting those values.
        for key in ('mmsi', 'MMSI', 'id'):
            value = item.get(key)
            if value and str(value).isdigit():
                print(f" name search: MMSI={value} ({item_name})")
                return str(value)
    return None


async def _run_probe(page, cur):
    """Try both lookup approaches on up to ten staged vessels and print the raw results."""
    print("\n=== PROBE: Testing MMSI lookup approaches ===")
    # Get a few test vessels
    cur.execute("""
        SELECT ship_id, name, gt_shiptype, dwt, flag
        FROM mt_bulk_staging
        WHERE mmsi IS NULL AND lat IS NOT NULL
        ORDER BY gt_shiptype, dwt DESC NULLS LAST
        LIMIT 10
    """)
    test_vessels = cur.fetchall()
    print(f"Test vessels: {len(test_vessels)}")
    for ship_id, name, gt, dwt, flag in test_vessels:
        print(f"\n [{gt}] {name} (SHIP_ID={ship_id}, DWT={dwt}, {flag})")
        # Approach 1: quicksearch_shipid
        r1 = await lookup_mmsi_by_shipid(page, ship_id)
        rows = r1.get('data') or []
        if rows:
            row = rows[0]
            print(f" quicksearch_shipid: MMSI={row.get('MMSI')} IMO={row.get('IMO')} name={row.get('SHIPNAME')}")
            print(f" Full row: {json.dumps(row)[:300]}")
        else:
            print(f" quicksearch_shipid: {r1.get('rows',0)} rows, error={r1.get('error','')}")
        await asyncio.sleep(0.5)
        # Approach 2: name search
        if name:
            r2 = await lookup_mmsi_by_name(page, name)
            search_data = r2.get('data', [])
            if isinstance(search_data, list) and search_data:
                print(f" name search: {len(search_data)} results")
                print(f" First result: {json.dumps(search_data[0])[:300]}")
            elif isinstance(search_data, dict):
                print(f" name search dict: {json.dumps(search_data)[:300]}")
            else:
                print(f" name search: no results, error={r2.get('error','')}")
            await asyncio.sleep(0.5)


async def _run_scrape(page, conn, cur, args):
    """Look up the MMSI (and optionally ownership) for every pending vessel and store it."""
    processed_ids = _load_checkpoint(CKPT_FILE)

    # Load vessels to process
    query = """
        SELECT ship_id, name, gt_shiptype, dwt, flag, lat, lon, mmsi
        FROM mt_bulk_staging
        WHERE mmsi IS NULL
    """
    if args.gt6only:
        query += " AND gt_shiptype = '6'"
    query += " ORDER BY gt_shiptype, dwt DESC NULLS LAST"
    cur.execute(query)
    vessels = [r[:7] for r in cur.fetchall() if r[0] not in processed_ids]
    # Apply --limit AFTER dropping checkpointed vessels: limiting in SQL made a
    # re-run select the same already-processed rows and do nothing.
    if args.limit > 0:
        vessels = vessels[:args.limit]
    print(f"Vessels to process: {len(vessels)}")

    found_mmsi = 0
    not_found_mmsi = 0
    found_owner = 0
    batch_count = 0
    total = len(vessels)
    for i, (ship_id, name, gt, dwt, flag, lat, lon) in enumerate(vessels):
        print(f"\n[{i+1}/{total}] [{gt}] {name} ({flag}, DWT={dwt})")
        mmsi = None
        imo = None

        # ---- Try quicksearch_shipid ----
        r = await lookup_mmsi_by_shipid(page, ship_id)
        rows = r.get('data') or []
        if isinstance(rows, list) and rows:
            mmsi, imo = _pick_quicksearch_match(rows, name)

        # ---- Try name search if no MMSI yet ----
        if not mmsi and name:
            r2 = await lookup_mmsi_by_name(page, name)
            search_data = r2.get('data', [])
            if isinstance(search_data, list):
                mmsi = _pick_name_match(search_data, name)
            if not mmsi:
                print(" name search: no match")

        # ---- Update DB ----
        if mmsi:
            found_mmsi += 1
            cur.execute("""
                UPDATE mt_bulk_staging
                SET mmsi=%s, imo=COALESCE(%s, imo), scraped_at=NOW()
                WHERE ship_id=%s
            """, (mmsi, imo, ship_id))
            # ---- Ownership (if --ownership flag) ----
            if args.ownership and imo:
                print(" Getting ownership...")
                own = await get_ownership_page(page, ship_id, mmsi, imo, name or 'VESSEL')
                if own:
                    found_owner += 1
                    cur.execute("""
                        UPDATE mt_bulk_staging
                        SET owner=COALESCE(%s, owner),
                            operator=COALESCE(%s, operator),
                            scraped_at=NOW()
                        WHERE ship_id=%s
                    """, (own.get('owner'), own.get('operator'), ship_id))
                    print(f" owner={own.get('owner')} op={own.get('operator')}")
                else:
                    print(" No ownership data found")
                await asyncio.sleep(1.0)
        else:
            not_found_mmsi += 1
            print(" No MMSI found")

        processed_ids.add(ship_id)
        batch_count += 1
        # Commit first, then checkpoint, so the checkpoint never claims work
        # that was not persisted.
        if batch_count >= BATCH:
            conn.commit()
            _save_checkpoint(CKPT_FILE, processed_ids)
            print(f"\n=== CHECKPOINT {i+1}/{total}: "
                  f"found={found_mmsi}, not_found={not_found_mmsi}, owners={found_owner} ===")
            batch_count = 0
        await asyncio.sleep(DELAY)

    # Final commit
    conn.commit()
    _save_checkpoint(CKPT_FILE, processed_ids)

    # Summary
    cur.execute('SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NOT NULL')
    total_mmsi = cur.fetchone()[0]
    cur.execute("SELECT count(*) FROM mt_bulk_staging WHERE gt_shiptype='6' AND mmsi IS NOT NULL")
    bulk_mmsi = cur.fetchone()[0]
    print("\n=== DONE ===")
    print(f" Processed: {len(vessels)}")
    print(f" MMSI found this run: {found_mmsi}")
    print(f" Not found: {not_found_mmsi}")
    print(f" Owners found: {found_owner}")
    print(f" Total with MMSI in DB: {total_mmsi}")
    print(f" Bulk carriers with MMSI: {bulk_mmsi}")


async def main():
    """Parse CLI flags, log in through a Chromium window and run probe or scrape mode.

    Flags: ``--probe`` (print raw lookup results for sample vessels),
    ``--limit N`` (cap the number of vessels), ``--gt6only`` (only
    gt_shiptype '6') and ``--ownership`` (also scrape ownership pages).
    The database connection and the browser are closed on every exit path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--probe', action='store_true', help='Test approaches on sample vessels')
    parser.add_argument('--limit', type=int, default=0, help='Max vessels to process')
    parser.add_argument('--gt6only', action='store_true', help='Only bulk carriers (GT=6)')
    parser.add_argument('--ownership', action='store_true', help='Also scrape ownership pages')
    args = parser.parse_args()

    # Imported before connecting so a missing dependency cannot leak a connection.
    from playwright.async_api import async_playwright

    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=False,
                args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
            )
            try:
                context = await browser.new_context(
                    viewport={'width': 1440, 'height': 900},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                               '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
                )
                page = await context.new_page()

                # Login
                if not await do_login(page):
                    print("ERROR: Login failed!")
                    return
                await asyncio.sleep(5)

                # Warm up session on data page
                await page.goto('https://www.marinetraffic.com/en/data/?asset_type=vessels',
                                wait_until='load', timeout=40000)
                await asyncio.sleep(5)
                print(f"Session warmed up: {page.url}")

                if args.probe:
                    await _run_probe(page, cur)
                    return
                await _run_scrape(page, conn, cur, args)
            finally:
                await browser.close()
    finally:
        conn.close()
    print("\nLookup complete!")
# Only start the scraper when executed as a script, so importing this module
# (for example to reuse totp()) does not open a browser or touch the database.
if __name__ == "__main__":
    asyncio.run(main())