255 lines
7.7 KiB
Python
255 lines
7.7 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
# -*- coding: utf-8 -*-
|
||
|
|
"""
|
||
|
|
Continuous monitor for mt_detail_sweep.py and mt_green_fleet.py.

Runs for 12 hours, checks every 5 minutes, and auto-restarts dead processes.

NOTE: On this Windows + Git-Bash system:
  bash /tmp = C:\\Users\\0\\AppData\\Local\\Temp
  Python must use Windows paths for os.path.getmtime etc.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import subprocess
|
||
|
|
import time
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
# Directory the scrapers are launched from.
WORK_DIR = r"c:\project\Cursor\Логистика"

# Windows directory that Git-Bash exposes as /tmp.
WINDOWS_TEMP = r"C:\Users\0\AppData\Local\Temp"

# Each scraper log has two spellings of the same file: the bash path is used
# for shell redirection, the Windows path for Python file-system calls.
LOG_DETAIL_BASH = "/tmp/mt_detail_live.txt"
LOG_DETAIL_WIN = os.path.join(WINDOWS_TEMP, "mt_detail_live.txt")
LOG_GREEN_BASH = "/tmp/green_fleet_live.txt"
LOG_GREEN_WIN = os.path.join(WINDOWS_TEMP, "green_fleet_live.txt")

# Timing, all in seconds unless the name says otherwise.
START_TIME = time.time()   # moment the monitor module was loaded
DURATION_HOURS = 12        # total monitoring window
CHECK_INTERVAL = 5 * 60    # pause between health checks
STALE_THRESHOLD = 10 * 60  # log untouched this long = process considered dead
RESTART_WAIT = 2 * 60      # settle time after a restart before the next check

# Number of regular health checks performed so far (incremented by main()).
check_count = 0
|
||
|
|
|
||
|
|
|
||
|
|
def ts():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    now = datetime.now()
    return format(now, "%Y-%m-%d %H:%M:%S")
|
||
|
|
|
||
|
|
|
||
|
|
def log(msg):
    """Print *msg* to stdout prefixed with a ``[timestamp]`` and flush at once.

    Flushing on every call keeps the output current when stdout is
    redirected to a file or pipe.
    """
    stamped = "[{}] {}".format(ts(), msg)
    print(stamped, flush=True)
|
||
|
|
|
||
|
|
|
||
|
|
def log_age_seconds(win_path):
    """Return how many seconds ago the file at *win_path* was last modified.

    Args:
        win_path: Path to the log file as Python sees it (a Windows path).

    Returns:
        Seconds elapsed since the file's modification time as a float, or
        ``-1`` when the file cannot be stat'ed (missing, inaccessible, ...).
    """
    try:
        modified_at = os.stat(win_path).st_mtime
    except OSError:
        # Callers use a negative age as the "log missing" marker.
        return -1
    return time.time() - modified_at
|
||
|
|
|
||
|
|
|
||
|
|
def log_tail(win_path, n=3):
    """Return the last *n* non-blank lines of a log file.

    The file is streamed line by line and only the most recent *n* non-blank
    lines are kept, so memory use does not grow with the size of the log.

    Args:
        win_path: Path to the log file as Python sees it (a Windows path).
        n: Maximum number of lines to return. Zero or a negative value
            yields no lines (previously ``n=0`` returned the whole file
            because ``result[-0:]`` is the full list).

    Returns:
        A list of right-stripped lines, oldest first. The placeholder
        ``["<empty log>"]`` is returned when the file holds no non-blank
        line, and ``["<log not found>"]`` when it cannot be opened or read.
    """
    from collections import deque

    # deque rejects a negative maxlen, so clamp; maxlen=0 simply keeps nothing.
    tail = deque(maxlen=max(n, 0))
    saw_content = False
    try:
        # errors="replace" keeps a partially written or mis-encoded log readable.
        with open(win_path, "r", encoding="utf-8", errors="replace") as f:
            for raw in f:
                if raw.strip():
                    saw_content = True
                    tail.append(raw.rstrip())
    except OSError:
        return ["<log not found>"]
    return list(tail) if saw_content else ["<empty log>"]
|
||
|
|
|
||
|
|
|
||
|
|
def find_pids(script_name):
    """Return the PIDs of ``python.exe`` processes running *script_name*.

    Runs ``wmic process where name='python.exe' get processid,commandline``
    and scans its output for lines whose text contains *script_name*.

    Args:
        script_name: Substring to look for in each process's command line,
            e.g. ``"mt_detail_sweep.py"``.

    Returns:
        A list of integer PIDs, in the order wmic reported them. An empty
        list is returned both when nothing matches and when the wmic call
        fails (the failure is logged as a warning).
    """
    try:
        result = subprocess.run(
            ["wmic", "process", "where", "name='python.exe'",
             "get", "processid,commandline"],
            capture_output=True, text=True, timeout=15,
            encoding="utf-8", errors="replace",
        )
        pids = []
        for line in result.stdout.splitlines():
            if script_name not in line:
                continue
            # wmic lists the requested columns alphabetically, so each row is
            # "<CommandLine> <ProcessId>". Only the trailing field is the PID;
            # treating every numeric token as one would turn command-line
            # arguments such as a port number into PIDs that may get killed.
            fields = line.split()
            if fields and fields[-1].isdigit():
                pids.append(int(fields[-1]))
        return pids
    except Exception as e:
        # Best effort: the monitor must keep running if wmic is missing,
        # times out, or prints something unexpected.
        log(f" [WARN] find_pids({script_name}) error: {e}")
        return []
|
||
|
|
|
||
|
|
|
||
|
|
def kill_pid(pid):
    """Force-terminate the process *pid* through PowerShell ``Stop-Process``.

    Nothing is returned. "Killed PID" is logged whenever the PowerShell
    call completes without raising; its exit status is not inspected and
    ``-ErrorAction SilentlyContinue`` hides PowerShell-side errors. Any
    exception from launching or waiting on PowerShell (including the 10 s
    timeout) is logged as a warning instead of propagating.
    """
    try:
        ps_command = f"Stop-Process -Id {pid} -Force -ErrorAction SilentlyContinue"
        argv = ["powershell", "-Command", ps_command]
        subprocess.run(argv, capture_output=True, timeout=10)
        log(f" Killed PID {pid}")
    except Exception as exc:
        log(f" [WARN] kill_pid({pid}): {exc}")
|
||
|
|
|
||
|
|
|
||
|
|
def start_scraper(script, log_bash_path):
    """Launch *script* in the background via ``bash -c``, appending to its log.

    The shell line changes into ``WORK_DIR``, runs ``python -u <script>`` and
    appends both stdout and stderr to *log_bash_path* (a bash-style path).
    The wrapper's own stdout/stderr are discarded.

    Args:
        script: File name of the scraper script to run.
        log_bash_path: Log file path as bash sees it, e.g. ``/tmp/...``.

    Returns:
        The ``subprocess.Popen`` object of the bash wrapper (not waited on),
        or ``None`` if the launch raised; the failure is logged.
    """
    log(f" Launching: {script} -> {log_bash_path}")
    try:
        shell_line = f'cd "{WORK_DIR}" && python -u {script} >> "{log_bash_path}" 2>&1'
        popen_options = {
            "stdout": subprocess.DEVNULL,
            "stderr": subprocess.DEVNULL,
            "cwd": WORK_DIR,
        }
        wrapper = subprocess.Popen(["bash", "-c", shell_line], **popen_options)
        # The PID reported here belongs to bash, not to the python child.
        log(f" Launched bash wrapper PID={wrapper.pid} for {script}")
        return wrapper
    except Exception as exc:
        log(f" [ERROR] Failed to start {script}: {exc}")
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def check_and_maybe_restart(label, script, log_bash, log_win):
    """Check one scraper's health and restart it when it looks dead.

    A restart (kill every matching PID, wait 3 s, relaunch) happens when:
      - no python.exe process is found for *script*, or
      - a process exists and the log was last modified more than
        ``STALE_THRESHOLD`` seconds ago.

    A running process whose log file is missing is only reported, never
    restarted. When no restart was needed and more than one instance is
    running, every instance except the first one listed is killed.

    Args:
        label: Short tag used in log messages.
        script: Script file name to look for and to launch.
        log_bash: Log path as bash sees it (used for the relaunch redirect).
        log_win: The same log as a Windows path (used for the age check).

    Returns:
        True if the scraper was restarted, False otherwise.
    """
    running = find_pids(script)
    age_sec = log_age_seconds(log_win)
    log_present = age_sec >= 0
    age_minutes = age_sec / 60 if log_present else -1

    process_part = f"RUNNING PIDs={running}" if running else "NO PROCESS"
    log_part = f"log_age={age_minutes:.1f}m" if log_present else "log=MISSING"
    log(f" [{label}] {process_part} | {log_part}")

    # Pick the restart reason, if any; None means "leave the scraper alone".
    restart_reason = None
    if not running:
        restart_reason = "no process found"
    elif not log_present:
        # NOTE(review): there is no grace-period tracking here, so a live
        # process that never creates its log is skipped on every check.
        log(f" [{label}] Process alive but log missing — may be starting up, skipping")
    elif age_sec > STALE_THRESHOLD:
        restart_reason = f"log stale {age_minutes:.1f}m > {STALE_THRESHOLD//60}m"

    if restart_reason is not None:
        log(f" !! [{label}] RESTARTING — {restart_reason}")
        for pid in running:
            kill_pid(pid)
        time.sleep(3)  # pause between the kills and the relaunch
        start_scraper(script, log_bash)
        return True

    # Healthy (or still starting): keep the first listed instance only.
    extras = running[1:]
    if extras:
        log(f" [WARN] [{label}] {len(running)} instances — killing duplicates")
        for pid in extras:
            kill_pid(pid)

    return False
|
||
|
|
|
||
|
|
|
||
|
|
def print_report():
    """Log a status summary: elapsed/remaining time and each log's recent tail.

    For both scraper logs the report shows the log's age in minutes (or
    ``MISSING``) followed by its last four non-empty lines.
    """
    hours_elapsed = (time.time() - START_TIME) / 3600
    hours_left = max(0, DURATION_HOURS - hours_elapsed)
    rule = "=" * 65

    log(rule)
    log(f"30-MIN REPORT | elapsed={hours_elapsed:.1f}h | remaining={hours_left:.1f}h")
    log(rule)

    # The second label carries a trailing space so both are the same width.
    tracked_logs = (
        ("mt_detail_sweep", LOG_DETAIL_WIN),
        ("mt_green_fleet ", LOG_GREEN_WIN),
    )
    for label, win_path in tracked_logs:
        age = log_age_seconds(win_path)
        if age >= 0:
            age_str = f"{age/60:.1f}m"
        else:
            age_str = "MISSING"
        log(f" [{label}] log_age={age_str}")
        for line in log_tail(win_path, 4):
            log(f" {line}")

    log(rule)
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Run the monitor loop for ``DURATION_HOURS`` hours.

    Sequence:
      1. Log a start banner and an initial status report.
      2. Run an initial health check of both scrapers.
      3. Until the deadline, health-check both scrapers every
         ``CHECK_INTERVAL`` seconds; after any restart, wait ``RESTART_WAIT``
         seconds before the interval timer starts again.
      4. Log a full report once at least 30 minutes have passed since the
         previous one (evaluated after each check).

    The initial report counts as a report for step 4. Previously the report
    timer was pre-dated by 30 minutes *and* a report was printed explicitly,
    which produced a second "30-MIN REPORT" right after the first check.

    Updates the module-level ``check_count``.
    """
    global check_count

    banner = "=" * 65
    report_interval = 1800  # seconds between full reports (30 minutes)
    # (label, script, bash log path, Windows log path) for each scraper.
    scrapers = (
        ("detail", "mt_detail_sweep.py", LOG_DETAIL_BASH, LOG_DETAIL_WIN),
        ("green", "mt_green_fleet.py", LOG_GREEN_BASH, LOG_GREEN_WIN),
    )

    log(banner)
    log("SCRAPER MONITOR STARTED")
    log(f"Duration={DURATION_HOURS}h | Check={CHECK_INTERVAL//60}m | Stale>{STALE_THRESHOLD//60}m = restart")
    log(f"Work dir: {WORK_DIR}")
    log(f"Windows TEMP: {WINDOWS_TEMP}")
    log(banner)

    end_time = START_TIME + DURATION_HOURS * 3600

    # Initial status report; stamp it so the next one is due 30 minutes later.
    print_report()
    last_report_time = time.time()

    # Initial check. The list is built in full so every scraper is checked
    # even when an earlier one was restarted.
    log("--- Initial health check ---")
    restarted = [check_and_maybe_restart(*entry) for entry in scrapers]
    if any(restarted):
        log(f"Initial restarts done. Waiting {RESTART_WAIT}s before first regular check...")
        time.sleep(RESTART_WAIT)

    last_check_time = time.time()

    while time.time() < end_time:
        # Sleep until the next check is due, in 10 s chunks so the deadline
        # (and Ctrl+C) is noticed promptly.
        next_check = last_check_time + CHECK_INTERVAL
        while time.time() < next_check and time.time() < end_time:
            time.sleep(10)

        if time.time() >= end_time:
            break

        check_count += 1
        log(f"--- Check #{check_count} | {(time.time()-START_TIME)/3600:.1f}h elapsed ---")

        restarted = [check_and_maybe_restart(*entry) for entry in scrapers]
        if any(restarted):
            log(f"Restart(s) performed. Waiting {RESTART_WAIT}s...")
            time.sleep(RESTART_WAIT)

        last_check_time = time.time()

        # Full report every 30 minutes.
        now = time.time()
        if now - last_report_time >= report_interval:
            print_report()
            last_report_time = now

    log(banner)
    log(f"MONITOR FINISHED — {DURATION_HOURS}h complete")
    log(banner)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point. Ctrl+C is the expected way to stop the monitor
    # early: log it and exit with status 0 instead of printing a traceback.
    try:
        main()
    except KeyboardInterrupt:
        log("Monitor stopped by user (Ctrl+C).")
        raise SystemExit(0)
|