montana/Русский/Логистика/mt_reports_scraper.py

523 lines
20 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
MT Reports Scraper — paginate /en/reports/ to get ALL vessels with MMSI + Ownership.

Endpoint discovered: https://www.marinetraffic.com/en/reports/?asset_type=vessels&columns=...
Returns JSON with: SHIP_ID, MMSI, IMO, SHIPNAME, FLAG, LAT, LON, SPEED, COURSE, TYPE_SUMMARY
+ ownership columns (manager, operator, beneficial_owner, etc.) if available in Pro account.
Uses page.evaluate(fetch()) from browser context to bypass Cloudflare.

Usage:
    python mt_reports_scraper.py              # all vessels, auto-paginate
    python mt_reports_scraper.py --probe      # just discover fields + pagination
    python mt_reports_scraper.py --limit 500  # stop after 500 vessels
    python mt_reports_scraper.py --type 6     # bulk carriers only
"""
import asyncio, json, sys, os, time, re, struct, hmac, hashlib, base64, argparse
import psycopg2

# Run relative paths (e.g. the checkpoint file) from the script's own directory.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Force UTF-8 console output where supported (avoids UnicodeEncodeError on Windows).
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
if hasattr(sys.stderr, 'reconfigure'):
    sys.stderr.reconfigure(encoding='utf-8', errors='replace')

# SECURITY NOTE: credentials were hard-coded in source. Environment variables
# now take precedence; the literal fallbacks are kept only for backward
# compatibility. These secrets should be rotated and removed from the file.
EMAIL = os.environ.get('MT_EMAIL', "operation@mrlogisticcorp.com")
PASSWORD = os.environ.get('MT_PASSWORD', "NKh9i8Z!7fU9jfi")
TOTP_SECRET = os.environ.get('MT_TOTP_SECRET', "MNWTEPTFJZBUC32GJFEWY6LVKQ2GGYKH")
DB_URL = os.environ.get(
    'MT_DB_URL',
    'postgresql://seafare:SF_m0ntana_2026@127.0.0.1:15432/seafare_db')

# The columns we want from MT reports.
# Ownership columns (manager, operator) require the MT Pro Ownership service.
BASE_COLUMNS = (
    'flag,shipname,imo,ship_type,time_of_latest_position,'
    'lat_of_latest_position,lon_of_latest_position,'
    'speed,course,reported_destination'
)
OWNERSHIP_COLUMNS = 'manager,operator'  # MT Pro ownership fields
ALL_COLUMNS = BASE_COLUMNS + ',' + OWNERSHIP_COLUMNS

BATCH_SIZE = 500   # commit every N rows
PAGE_DELAY = 2.0   # seconds between pages

# Checkpoint file name (NOTE(review): not read or written anywhere in this
# file — presumably reserved for resume support; confirm before removing).
CKPT_FILE = 'mt_reports_checkpoint.json'
def totp(secret, for_time=None):
    """Compute a 6-digit RFC 6238 TOTP code (HMAC-SHA-1, 30-second period).

    secret   -- base32-encoded shared secret; spaces and lowercase tolerated,
                missing '=' padding is added automatically.
    for_time -- optional Unix timestamp to compute the code for. Defaults to
                the current time (backward compatible); passing a fixed value
                makes the function testable against the RFC 6238 vectors.

    Returns the zero-padded 6-digit code as a string.
    """
    s = secret.upper().replace(' ', '')
    pad = (-len(s)) % 8                      # b32decode requires len % 8 == 0
    key = base64.b32decode(s + '=' * pad)
    t = time.time() if for_time is None else for_time
    counter = int(t) // 30                   # 30-second time step (RFC 6238)
    msg = struct.pack('>Q', counter)
    h = hmac.new(key, msg, hashlib.sha1).digest()
    offset = h[-1] & 0x0f                    # RFC 4226 dynamic truncation
    code = struct.unpack('>I', h[offset:offset + 4])[0] & 0x7fffffff
    return str(code % 1000000).zfill(6)
async def do_login(page):
    """Log in to MarineTraffic Pro through the given Playwright page.

    Walks the two-step login form (username submit, then password submit),
    and if the post-login URL looks like an MFA page, selects the Google
    Authenticator option and submits a TOTP code derived from TOTP_SECRET.

    Returns True when the final URL is on marinetraffic.com and not on the
    auth.kpler identity provider — used as the "logged in" heuristic.
    Fixed sleeps throughout give the SPA time to render each step.
    """
    print("Login to MT Pro...")
    await page.goto('https://www.marinetraffic.com/en/users/login',
                    wait_until='domcontentloaded', timeout=30000)
    await asyncio.sleep(3)
    # Step 1: username-only form, then submit to reveal the password field.
    await page.fill('input[name="username"]', EMAIL)
    await page.click('button[type="submit"]')
    await asyncio.sleep(3)
    # Step 2: password form.
    await page.fill('input[type="password"]', PASSWORD)
    await page.click('button[type="submit"]')
    await asyncio.sleep(4)
    if 'mfa-login-options' in page.url or 'mfa' in page.url.lower():
        print(" 2FA: Google Authenticator...")
        try:
            # Best-effort: the chooser may be skipped if a method is remembered.
            await page.click('button:has-text("Google Authenticator")', timeout=5000)
        except Exception:
            pass
        await asyncio.sleep(2)
        otp = totp(TOTP_SECRET)
        print(f" TOTP: {otp}")
        await page.fill('input[name="code"]', otp)
        await page.click('button[type="submit"]')
        await asyncio.sleep(5)
    # Heuristic success check: redirected back to MT, not stuck on the IdP.
    logged_in = 'marinetraffic.com' in page.url and 'auth.kpler' not in page.url
    print(f" Logged in: {logged_in} URL: {page.url}")
    return logged_in
async def fetch_reports_page(page, columns, vessel_type=None, page_num=1,
                             page_size=100, extra_filters=''):
    """
    Fetch one page of vessel data from /en/reports/ via page.evaluate(fetch()).

    page          -- Playwright page with an authenticated MT session.
    columns       -- comma-separated column list for the endpoint.
    vessel_type   -- optional typefilter value (falsy = no filter).
    page_num      -- 1-based page number.
    page_size     -- rows per page.
    extra_filters -- extra query-string fragment appended verbatim.

    Returns the dict produced in the browser ({status, url, body} on success,
    {status: 0, url, error} on fetch failure) or None if evaluate() raised.
    """
    url = (f'https://www.marinetraffic.com/en/reports/?asset_type=vessels'
           f'&columns={columns}')
    if vessel_type:
        url += f'&typefilter={vessel_type}'
    url += f'&page={page_num}&pageSize={page_size}'
    if extra_filters:
        url += f'&{extra_filters}'
    # The fetch runs inside the page so Cloudflare sees a first-party request
    # carrying the session cookies (credentials: 'include').
    # BUG FIX: the headers object previously declared 'X-Requested-With'
    # twice; duplicate keys in a JS object literal silently keep the last one.
    js_code = f"""
    async () => {{
        try {{
            const resp = await fetch({json.dumps(url)}, {{
                credentials: 'include',
                headers: {{
                    'X-Requested-With': 'XMLHttpRequest',
                    'Accept': 'application/json, text/javascript, */*; q=0.01',
                    'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels',
                }}
            }});
            const text = await resp.text();
            return {{status: resp.status, url: {json.dumps(url)}, body: text}};
        }} catch(e) {{
            return {{status: 0, url: {json.dumps(url)}, error: e.message}};
        }}
    }}
    """
    try:
        result = await page.evaluate(js_code)
        return result
    except Exception as e:
        print(f" evaluate error page {page_num}: {e}")
        return None
async def probe_pagination(page):
    """Try different pagination approaches to find what works.

    Fires the same /en/reports/ request with several pagination parameter
    styles and prints status, row count, reported total, and first ship name
    for each, so the operator can see which scheme the endpoint honors.
    """
    print("\n=== Probing pagination ===")
    # Try different URL param formats for pagination
    # NOTE: no typefilter to get any vessel data
    # DataTables.js format: draw=N&start=N&length=N is very common
    param_tests = [
        'page=1&pageSize=100',
        'page=2&pageSize=100',
        'draw=1&start=0&length=100',
        'draw=2&start=100&length=100',
        'draw=3&start=200&length=100',
    ]
    for params in param_tests:
        url = (f'https://www.marinetraffic.com/en/reports/?asset_type=vessels'
               f'&columns={BASE_COLUMNS}&{params}')
        # In-page fetch with session cookies; doubled braces are literal JS braces.
        js = f"""
        async () => {{
            const resp = await fetch({json.dumps(url)}, {{
                credentials: 'include',
                headers: {{
                    'X-Requested-With': 'XMLHttpRequest',
                    'Accept': 'application/json',
                    'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels',
                }}
            }});
            const text = await resp.text();
            // Return full body so we can see total count and all rows
            try {{
                const parsed = JSON.parse(text);
                return {{
                    status: resp.status,
                    total: parsed.total || parsed.totalCount || parsed.recordsTotal || '?',
                    rows: (parsed.data || []).length,
                    firstShip: (parsed.data || [])[0] ? (parsed.data[0].SHIPNAME || '') : '',
                    keys: Object.keys((parsed.data || [])[0] || {{}}),
                    raw: text.substring(0, 300),
                }};
            }} catch(e) {{
                return {{status: resp.status, error: e.message, raw: text.substring(0, 300)}};
            }}
        }}
        """
        try:
            r = await page.evaluate(js)
            status = r.get('status', 0)
            print(f" {params}: status={status} rows={r.get('rows','?')} "
                  f"total={r.get('total','?')} first={r.get('firstShip','?')}")
            if r.get('keys'):
                print(f" Keys: {r['keys']}")
            if r.get('error'):
                # Non-JSON response: show the error plus a slice of the raw body.
                print(f" Error: {r['error']}")
                print(f" Raw: {r.get('raw','')[:200]}")
        except Exception as e:
            print(f" {params}: error {e}")
        # Small delay between probes to avoid hammering the endpoint.
        await asyncio.sleep(0.5)
async def probe_ownership_columns(page):
    """Try to fetch ownership columns and see what's returned.

    Requests 10 bulk-carrier rows (typefilter=6) with each candidate
    ownership column list appended to BASE_COLUMNS, then reports which
    OWNER/OPERATOR/MANAGER/CHARTER keys actually came back — this tells us
    which fields the Pro account's Ownership service exposes.
    """
    print("\n=== Probing ownership columns ===")
    ownership_variants = [
        'manager',
        'operator',
        'beneficial_owner',
        'registered_owner',
        'commercial_manager',
        'charterer',
        'manager,operator',
        'manager,operator,beneficial_owner',
    ]
    for cols in ownership_variants:
        url = (f'https://www.marinetraffic.com/en/reports/?asset_type=vessels'
               f'&columns={BASE_COLUMNS},{cols}&typefilter=6&page=1&pageSize=10')
        js = f"""
        async () => {{
            const resp = await fetch({json.dumps(url)}, {{
                credentials: 'include',
                headers: {{
                    'X-Requested-With': 'XMLHttpRequest',
                    'Accept': 'application/json',
                    'Referer': 'https://www.marinetraffic.com/en/data/?asset_type=vessels',
                }}
            }});
            const text = await resp.text();
            return {{status: resp.status, body: text.substring(0, 2000)}};
        }}
        """
        try:
            r = await page.evaluate(js)
            status = r.get('status', 0)
            body = r.get('body', '')
            if status == 200 and body.startswith('{'):
                # (was json.loads(r.get('body', body)) — same value, simplified)
                parsed = json.loads(body)
                data = parsed.get('data', [])
                if data:
                    keys = list(data[0].keys())
                    # Check if any ownership field is present
                    own_keys = [k for k in keys if any(x in k.upper()
                                for x in ['OWNER', 'OPERATOR', 'MANAGER', 'CHARTER'])]
                    print(f" cols={cols}: {len(data)} rows, own_keys={own_keys}")
                    if own_keys:
                        # BUG FIX: the original f-string referenced `k` after a
                        # comprehension (comprehension vars don't leak in Py3),
                        # raising a NameError that the broad except swallowed,
                        # so the sample was never printed. Build the dict first.
                        sample = {k: data[0].get(k) for k in own_keys}
                        print(f" Sample: {sample}")
                else:
                    print(f" cols={cols}: {status} no data rows")
            else:
                print(f" cols={cols}: {status} -> {body[:100]}")
        except Exception as e:
            print(f" cols={cols}: error {e}")
        await asyncio.sleep(0.5)
def _first_truthy(row, keys):
    """Return the first truthy row[k] for k in keys, else None."""
    for k in keys:
        if row.get(k):
            return row[k]
    return None


def _set_numeric(r, row, dest, keys, cast):
    """Store cast(row[k]) under r[dest] for the first truthy key in keys.

    A value that fails the cast is silently dropped (key left unset) and the
    search stops, mirroring the original try/except-pass behavior.
    """
    for k in keys:
        if row.get(k):
            try:
                r[dest] = cast(row[k])
            except Exception:
                pass
            return


def parse_vessel_row(row):
    """Extract standardized vessel data from an MT reports row.

    Accepts a raw row dict from the /en/reports/ JSON and returns a dict with
    normalized keys (mmsi, imo, ship_id, name, flag, gt_shiptype, shiptype,
    destination always present; dwt/lat/lon/speed/course/owner/operator only
    when a usable value was found). Returns {} for non-dict input.

    Replaces the original's pointless single-element `for k in ['MMSI']:`
    loops and repeated cast boilerplate with direct assignments and helpers.
    """
    if not isinstance(row, dict):
        return {}
    r = {}
    # Identity — normalized to strings; falsy/missing values become None.
    r['mmsi'] = str(row['MMSI']) if row.get('MMSI') else None
    r['imo'] = str(row['IMO']) if row.get('IMO') else None
    r['ship_id'] = str(row['SHIP_ID']) if row.get('SHIP_ID') else None
    r['name'] = row.get('SHIPNAME') or row.get('NAME') or ''
    # Flag
    r['flag'] = row.get('CODE2') or row.get('FLAG') or row.get('COUNTRY') or ''
    # Type
    r['gt_shiptype'] = str(row.get('TYPE_COLOR') or row.get('TYPE_ID') or '')
    r['shiptype'] = row.get('TYPE_SUMMARY') or ''
    # Numeric fields — NOTE: a 0 value is falsy and therefore skipped,
    # matching the original `if row.get(k)` checks.
    _set_numeric(r, row, 'dwt', ['DWT', 'DEADWEIGHT'], int)
    _set_numeric(r, row, 'lat', ['LAT', 'lat_of_latest_position'], float)
    _set_numeric(r, row, 'lon', ['LON', 'lon_of_latest_position'], float)
    _set_numeric(r, row, 'speed', ['SPEED'], float)
    _set_numeric(r, row, 'course', ['COURSE'], float)
    r['destination'] = row.get('DESTINATION') or row.get('reported_destination') or ''
    # Ownership — try various column name formats
    owner = _first_truthy(row, ['MANAGER', 'manager', 'COMMERCIAL_MANAGER',
                                'BENEFICIAL_OWNER'])
    if owner:
        r['owner'] = str(owner)
    operator = _first_truthy(row, ['OPERATOR', 'operator', 'CHARTERER'])
    if operator:
        r['operator'] = str(operator)
    return r
async def main():
    """CLI entry point.

    Parses arguments, opens the Postgres connection (scrape mode only), logs
    in to MarineTraffic Pro through a headed Playwright Chromium, then either
    probes the reports endpoint (--probe) or paginates it and upserts vessels
    into mt_bulk_staging in batches of BATCH_SIZE.

    BUG FIX: in --probe mode `conn` stays None, but the original called
    `conn.close()` unconditionally (both on login failure and at the end of
    probe mode), crashing with AttributeError. Both sites are now guarded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--probe', action='store_true', help='Probe pagination + ownership columns')
    parser.add_argument('--type', type=int, default=0, help='Vessel type filter (e.g. 6=bulk)')
    parser.add_argument('--limit', type=int, default=0, help='Max vessels to collect (0=all)')
    parser.add_argument('--page_size', type=int, default=100, help='Rows per page (default 100)')
    parser.add_argument('--max_pages', type=int, default=0, help='Max pages (0=all)')
    parser.add_argument('--columns', type=str, default=ALL_COLUMNS, help='Columns to fetch')
    args = parser.parse_args()

    # DB is only needed when we actually persist rows; probe mode skips it.
    conn = None
    cur = None
    if not args.probe:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()

    from playwright.async_api import async_playwright
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,  # headed browser: required for the login/Cloudflare flow
            args=['--no-sandbox', '--disable-blink-features=AutomationControlled']
        )
        context = await browser.new_context(
            viewport={'width': 1440, 'height': 900},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        )
        page = await context.new_page()

        # ---- Login ----
        logged_in = await do_login(page)
        if not logged_in:
            print("ERROR: Login failed!")
            await browser.close()
            if conn is not None:  # None in --probe mode
                conn.close()
            return
        await asyncio.sleep(5)

        # ---- Load data page to establish session ----
        data_url = 'https://www.marinetraffic.com/en/data/?asset_type=vessels'
        print(f"\nLoading data page: {data_url}")
        await page.goto(data_url, wait_until='load', timeout=40000)
        await asyncio.sleep(5)
        print(f" Data page loaded: {page.url}")

        # ---- PROBE MODE ----
        if args.probe:
            await probe_pagination(page)
            await probe_ownership_columns(page)
            await browser.close()
            if conn is not None:  # always None here, but keep the guard uniform
                conn.close()
            return

        # ---- SCRAPE MODE ----
        vessel_type = args.type if args.type else None
        page_size = args.page_size
        max_pages = args.max_pages
        limit = args.limit
        columns = args.columns
        total_collected = 0
        total_pages = 0
        current_page = 1
        # key -> parsed vessel; cleared after each DB batch, so dedup is only
        # within a batch — the ON CONFLICT upsert handles cross-batch repeats.
        all_vessels = {}
        print(f"\nStarting scrape: type={vessel_type}, pageSize={page_size}, "
              f"max_pages={max_pages}, limit={limit}")
        while True:
            result = await fetch_reports_page(
                page, columns, vessel_type, current_page, page_size)
            if not result:
                print(f" Page {current_page}: no result, stopping")
                break
            status = result.get('status', 0)
            body = result.get('body', '')
            if status != 200:
                print(f" Page {current_page}: status={status}, stopping")
                if body:
                    print(f" Body: {body[:200]}")
                break
            if not body.startswith('{'):
                print(f" Page {current_page}: non-JSON response, stopping")
                print(f" Body: {body[:200]}")
                break
            try:
                parsed = json.loads(body)
            except Exception as e:
                print(f" Page {current_page}: parse error {e}")
                break
            rows = parsed.get('data', [])
            total_count = (parsed.get('total') or parsed.get('totalCount') or
                           parsed.get('count') or 0)
            if not rows:
                print(f" Page {current_page}: empty data, stopping")
                break
            # Process rows
            new_this_page = 0
            for row in rows:
                v = parse_vessel_row(row)
                key = v.get('mmsi') or v.get('ship_id') or v.get('name')
                if key and key not in all_vessels:
                    all_vessels[key] = v
                    new_this_page += 1
                    total_collected += 1
            print(f" Page {current_page}: {len(rows)} rows, {new_this_page} new, "
                  f"total={total_count}, collected={total_collected}")
            # Show the actual columns/sample once so the operator can verify them
            if current_page == 1 and rows:
                print(f" Columns: {list(rows[0].keys())}")
                print(f" Sample: {json.dumps(rows[0])[:300]}")
            # Commit batch to DB
            if total_collected % BATCH_SIZE == 0 and total_collected > 0:
                _upsert_vessels(cur, list(all_vessels.values()))
                conn.commit()
                all_vessels.clear()
                print(f" Committed batch, total in DB now...")
            total_pages += 1
            # Stop conditions
            if limit and total_collected >= limit:
                print(f" Reached limit {limit}, stopping")
                break
            if max_pages and total_pages >= max_pages:
                print(f" Reached max_pages {max_pages}, stopping")
                break
            if total_count and total_collected >= total_count:
                print(f" Collected all {total_count} vessels, stopping")
                break
            if len(rows) < page_size:
                print(f" Last page (fewer than pageSize rows), stopping")
                break
            current_page += 1
            await asyncio.sleep(PAGE_DELAY)

        # Final commit of any rows still buffered
        if all_vessels:
            _upsert_vessels(cur, list(all_vessels.values()))
            conn.commit()

        # Final stats
        cur.execute('SELECT count(*) FROM mt_bulk_staging')
        total_in_db = cur.fetchone()[0]
        cur.execute("SELECT count(*) FROM mt_bulk_staging WHERE mmsi IS NOT NULL")
        with_mmsi = cur.fetchone()[0]
        cur.execute("SELECT count(*) FROM mt_bulk_staging WHERE owner IS NOT NULL")
        with_owner = cur.fetchone()[0]
        print(f"\n=== DONE ===")
        print(f" Collected this run: {total_collected}")
        print(f" Pages processed: {total_pages}")
        print(f" mt_bulk_staging total: {total_in_db}")
        print(f" With MMSI: {with_mmsi}")
        print(f" With owner: {with_owner}")
        conn.close()
        await browser.close()
    print("\nReports scraper complete!")
def _upsert_vessels(cur, vessels):
"""Upsert list of vessel dicts into mt_bulk_staging."""
inserted = 0
for v in vessels:
ship_id = v.get('ship_id') or v.get('mmsi') or v.get('name', '')[:20]
if not ship_id:
continue
try:
cur.execute("""
INSERT INTO mt_bulk_staging
(ship_id, name, flag, dwt, gt_shiptype, type_category,
lat, lon, speed, course, destination, mmsi, imo, owner, operator,
scraped_at)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, NOW())
ON CONFLICT (ship_id) DO UPDATE SET
mmsi = COALESCE(EXCLUDED.mmsi, mt_bulk_staging.mmsi),
imo = COALESCE(EXCLUDED.imo, mt_bulk_staging.imo),
owner = COALESCE(EXCLUDED.owner, mt_bulk_staging.owner),
operator = COALESCE(EXCLUDED.operator, mt_bulk_staging.operator),
lat = COALESCE(EXCLUDED.lat, mt_bulk_staging.lat),
lon = COALESCE(EXCLUDED.lon, mt_bulk_staging.lon),
flag = COALESCE(EXCLUDED.flag, mt_bulk_staging.flag),
name = COALESCE(EXCLUDED.name, mt_bulk_staging.name),
scraped_at = NOW()
""", (
ship_id,
v.get('name'),
v.get('flag'),
v.get('dwt'),
v.get('gt_shiptype'),
'bulk' if v.get('gt_shiptype') == '6' else 'general',
v.get('lat'),
v.get('lon'),
v.get('speed'),
v.get('course'),
v.get('destination'),
v.get('mmsi'),
v.get('imo'),
v.get('owner'),
v.get('operator'),
))
inserted += 1
except Exception as e:
print(f" DB error for {ship_id}: {e}")
return inserted
if __name__ == '__main__':
    # Guard the entry point so importing this module doesn't launch a scrape.
    asyncio.run(main())