From 6005601bd8a7911c4ad67ceefd0dcdd81c312d54 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 14 May 2026 05:40:15 +0000 Subject: [PATCH 1/2] Add hardened service and 24x7 operations tooling for Python monitor Agent-Logs-Url: https://github.com/nexuspcs/ConnectivityMonitor/sessions/5be8d40f-972c-4648-b122-7c01afdd1f28 Co-authored-by: nexuspcs <69493073+nexuspcs@users.noreply.github.com> --- README.md | 6 + python/README.md | 108 ++++++++++++++ python/connectivity-monitor@.service | 21 ++- python/ops/archive_artifacts.py | 92 ++++++++++++ python/ops/health_probe.py | 132 ++++++++++++++++++ python/ops/recover_service.sh | 26 ++++ .../connectivity-monitor-archive@.service | 11 ++ .../connectivity-monitor-archive@.timer | 10 ++ .../connectivity-monitor-healthcheck@.service | 12 ++ .../connectivity-monitor-healthcheck@.timer | 11 ++ 10 files changed, 426 insertions(+), 3 deletions(-) create mode 100644 python/ops/archive_artifacts.py create mode 100644 python/ops/health_probe.py create mode 100644 python/ops/recover_service.sh create mode 100644 python/systemd/connectivity-monitor-archive@.service create mode 100644 python/systemd/connectivity-monitor-archive@.timer create mode 100644 python/systemd/connectivity-monitor-healthcheck@.service create mode 100644 python/systemd/connectivity-monitor-healthcheck@.timer diff --git a/README.md b/README.md index 9511f58..8f32d03 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,12 @@ sudo systemctl start connectivity-monitor@YOUR_USER sudo systemctl status connectivity-monitor@YOUR_USER ``` +For 24/7 Raspberry Pi operation, the Python directory also includes: + +- `python/systemd/connectivity-monitor-healthcheck@.service|.timer` (scheduled API health probe) +- `python/systemd/connectivity-monitor-archive@.service|.timer` (daily archive/prune of logs/reports) +- `python/ops/recover_service.sh` (one-command recovery + validation) + > 📖 Full details: 
[python/README.md](python/README.md) --- diff --git a/python/README.md b/python/README.md index 5b9b263..5613ac9 100644 --- a/python/README.md +++ b/python/README.md @@ -102,6 +102,16 @@ sudo systemctl status connectivity-monitor@YOUR_USER sudo journalctl -u connectivity-monitor@YOUR_USER -f ``` +### Hardened 24/7 service behavior + +The included `connectivity-monitor@.service` is hardened for always-on usage: + +- Restarts automatically (`Restart=always`) with short backoff +- Waits for `network-online.target` on boot +- Adds startup/shutdown timeout protections +- Sets file descriptor/task limits for long-running operation +- Mounts system directories read-only (`ProtectSystem=full`) and keeps `~/ConnectivityMonitor` writable + ## Raspberry Pi Setup The Python version works natively on Raspberry Pi: @@ -118,6 +128,104 @@ python3 -m connectivity_monitor --headless --web-port 8080 For always-on monitoring, set it up as a systemd service (see above). Then access the dashboard from any device on your network at `http://:8080`. +## Raspberry Pi 24/7 Production Setup + +### 1) Keep the Pi address stable (queryable anytime) + +Use one of these: + +- DHCP reservation on your router (recommended), or +- Static IP on Raspberry Pi OS + +Then use a stable DNS name on your LAN (for example, `connectivity-monitor.local`) if available. 
+ +### 2) Install hardened service + timers + +```bash +cd ~/ConnectivityMonitor/python + +# Main monitor service +sudo cp connectivity-monitor@.service /etc/systemd/system/ + +# Health-check and archive timer units +sudo cp systemd/connectivity-monitor-healthcheck@.service /etc/systemd/system/ +sudo cp systemd/connectivity-monitor-healthcheck@.timer /etc/systemd/system/ +sudo cp systemd/connectivity-monitor-archive@.service /etc/systemd/system/ +sudo cp systemd/connectivity-monitor-archive@.timer /etc/systemd/system/ + +sudo systemctl daemon-reload + +# Enable monitor + safety timers +sudo systemctl enable connectivity-monitor@YOUR_USER +sudo systemctl start connectivity-monitor@YOUR_USER +sudo systemctl enable --now connectivity-monitor-healthcheck@YOUR_USER.timer +sudo systemctl enable --now connectivity-monitor-archive@YOUR_USER.timer +``` + +### 3) Reverse proxy for controlled remote access (TLS/auth) + +Keep the monitor on localhost and publish through a reverse proxy (Nginx/Caddy/Traefik) to add: + +- HTTPS/TLS certificates +- Basic auth or SSO +- IP allow-listing/rate limits + +Proxy upstream target: `http://127.0.0.1:8080` + +## Operational Safety Checks + +### API health probe + alert trigger + +`ops/health_probe.py` checks `/api/status` and exits non-zero when: + +- Endpoint is unreachable, or the monitor itself reports an outage (`is_down`) +- Health score is below threshold +- Packet loss exceeds threshold + +This is scheduled every minute by `connectivity-monitor-healthcheck@.timer` and visible in `journalctl`. + +Optional auto-reboot trigger example: + +```bash +python3 ops/health_probe.py \ + --url http://127.0.0.1:8080/api/status \ + --min-health 60 --max-loss 10 \ + --reboot-after-failures 15 \ + --allow-reboot +``` + +### Log/report persistence and archival + +The monitor writes logs and reports under `~/ConnectivityMonitor`. + +`ops/archive_artifacts.py` can archive and prune old data, and is scheduled daily by `connectivity-monitor-archive@.timer`. 
+ +## One-command Recovery and Validation + +Use: + +```bash +bash ~/ConnectivityMonitor/python/ops/recover_service.sh YOUR_USER +``` + +This command: + +- Reloads systemd units +- Re-enables and restarts the monitor service +- Prints service status and recent journal logs +- Validates API response from `/api/status` + +## Soak Test Checklist (48–72h) + +Before calling deployment production-ready, run for 48–72 hours and verify: + +- Service survives reboot (`systemctl is-enabled` + post-reboot status) +- Auto-restart behavior works when process is killed +- API remains queryable (`/api/status`, `/api/history`, `/api/drops`, `/api/targets`, `/api/heatmap`) +- Logs and reports continue to generate +- Health-check timer executes and records status +- Daily archival timer creates archives and prunes old ones + ## Package Structure ``` diff --git a/python/connectivity-monitor@.service b/python/connectivity-monitor@.service index 328b147..3d2b5e9 100644 --- a/python/connectivity-monitor@.service +++ b/python/connectivity-monitor@.service @@ -2,14 +2,29 @@ Description=Connectivity Monitor v4.0 After=network-online.target Wants=network-online.target +StartLimitIntervalSec=300 +StartLimitBurst=10 [Service] Type=simple User=%i +Group=%i +WorkingDirectory=%h/ConnectivityMonitor/python +Environment=PYTHONUNBUFFERED=1 ExecStart=/usr/bin/python3 -m connectivity_monitor --headless -WorkingDirectory=%h -Restart=on-failure -RestartSec=10 +Restart=always +RestartSec=5 +TimeoutStartSec=30 +TimeoutStopSec=30 +UMask=0027 +LimitNOFILE=65536 +TasksMax=512 +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=full +ReadWritePaths=%h/ConnectivityMonitor +StandardOutput=journal +StandardError=journal [Install] WantedBy=multi-user.target diff --git a/python/ops/archive_artifacts.py b/python/ops/archive_artifacts.py new file mode 100644 index 0000000..42b34a8 --- /dev/null +++ b/python/ops/archive_artifacts.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +"""Archive logs/reports into compressed 
tar files and prune old archives.""" + +import argparse +import datetime +import os +import tarfile +import time + + +def _collect_files(path, older_than_days): + if not os.path.isdir(path): + return [] + now = time.time() + min_age = older_than_days * 86400 + files = [] + for name in os.listdir(path): + full = os.path.join(path, name) + if not os.path.isfile(full): + continue + if now - os.path.getmtime(full) >= min_age: + files.append(full) + return sorted(files) + + +def _archive_group(files, archive_path, base_dir): + if not files: + return 0 + os.makedirs(os.path.dirname(archive_path), exist_ok=True) + with tarfile.open(archive_path, "w:gz") as tar: + for file_path in files: + rel = os.path.relpath(file_path, base_dir) + tar.add(file_path, arcname=rel) + return len(files) + + +def _prune_archives(archive_dir, keep_days): + if keep_days <= 0 or not os.path.isdir(archive_dir): + return 0 + now = time.time() + max_age = keep_days * 86400 + removed = 0 + for name in os.listdir(archive_dir): + if not name.endswith(".tar.gz"): + continue + full = os.path.join(archive_dir, name) + if os.path.isfile(full) and (now - os.path.getmtime(full)) > max_age: + os.remove(full) + removed += 1 + return removed + + +def main(): + parser = argparse.ArgumentParser(description="Archive ConnectivityMonitor logs/reports") + parser.add_argument("--base-dir", default=os.path.expanduser("~/ConnectivityMonitor")) + parser.add_argument("--older-than-days", type=int, default=1) + parser.add_argument("--delete-after-archive", action="store_true") + parser.add_argument("--keep-archive-days", type=int, default=30) + args = parser.parse_args() + + ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + logs_dir = os.path.join(args.base_dir, "logs") + reports_dir = os.path.join(args.base_dir, "reports") + archive_dir = os.path.join(args.base_dir, "archive") + + logs_files = _collect_files(logs_dir, args.older_than_days) + reports_files = _collect_files(reports_dir, args.older_than_days) + + 
logs_archive = os.path.join(archive_dir, "logs_{}.tar.gz".format(ts)) + reports_archive = os.path.join(archive_dir, "reports_{}.tar.gz".format(ts)) + + logs_count = _archive_group(logs_files, logs_archive, args.base_dir) + reports_count = _archive_group(reports_files, reports_archive, args.base_dir) + + if args.delete_after_archive: + for file_path in logs_files + reports_files: + os.remove(file_path) + + pruned = _prune_archives(archive_dir, args.keep_archive_days) + + print( + "Archived logs={}, reports={}, deleted_source={}, pruned_archives={}".format( + logs_count, + reports_count, + bool(args.delete_after_archive), + pruned, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/python/ops/health_probe.py b/python/ops/health_probe.py new file mode 100644 index 0000000..39ed7b9 --- /dev/null +++ b/python/ops/health_probe.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +"""Health probe for Connectivity Monitor API with optional recovery actions.""" + +import argparse +import datetime +import json +import os +import subprocess +import sys +import urllib.error +import urllib.request + + +def _now(): + return datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z" + + +def _read_state(path): + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return {"consecutive_failures": 0, "last_status": "unknown"} + + +def _write_state(path, state): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(state, f, indent=2) + + +def _fetch_status(url, timeout): + req = urllib.request.Request(url, headers={"User-Agent": "ConnectivityMonitor/4.0 health-probe"}) + with urllib.request.urlopen(req, timeout=timeout) as response: + if response.status != 200: + raise RuntimeError("Non-200 status: {}".format(response.status)) + payload = json.loads(response.read().decode("utf-8")) + return payload + + +def _evaluate(payload, min_health, max_loss): + health = 
payload.get("health_score") + loss = payload.get("loss") + is_down = payload.get("is_down") + + problems = [] + if is_down: + problems.append("monitor reports outage") + if isinstance(health, (int, float)) and health < min_health: + problems.append("health_score {} < {}".format(health, min_health)) + if isinstance(loss, (int, float)) and loss > max_loss: + problems.append("loss {} > {}".format(loss, max_loss)) + return problems + + +def _maybe_reboot(enabled, command, reason): + if not enabled: + return + print("[{}] WARNING: {}".format(_now(), reason)) + print("[{}] WARNING: Executing reboot command: {}".format(_now(), " ".join(command))) + try: + subprocess.run(command, check=True) + except Exception as exc: + print("[{}] ERROR: Reboot command failed: {}".format(_now(), exc), file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser(description="Connectivity Monitor health probe") + parser.add_argument("--url", default="http://127.0.0.1:8080/api/status") + parser.add_argument("--timeout", type=int, default=5) + parser.add_argument("--min-health", type=float, default=60.0) + parser.add_argument("--max-loss", type=float, default=10.0) + parser.add_argument( + "--state-file", + default=os.path.expanduser("~/ConnectivityMonitor/health_probe_state.json"), + ) + parser.add_argument("--reboot-after-failures", type=int, default=0) + parser.add_argument("--allow-reboot", action="store_true") + parser.add_argument( + "--reboot-cmd", + default="/usr/bin/systemctl reboot", + help="Command used when reboot is enabled (string split on spaces)", + ) + args = parser.parse_args() + + state = _read_state(args.state_file) + failures = int(state.get("consecutive_failures", 0)) + + try: + payload = _fetch_status(args.url, args.timeout) + issues = _evaluate(payload, args.min_health, args.max_loss) + if issues: + failures += 1 + state["last_status"] = "degraded" + state["last_error"] = "; ".join(issues) + print("[{}] WARNING: {}".format(_now(), state["last_error"])) + else: 
+ failures = 0 + state["last_status"] = "healthy" + state["last_error"] = "" + print( + "[{}] OK: health_score={}, loss={}, total_pings={}".format( + _now(), payload.get("health_score"), payload.get("loss"), payload.get("total_pings") + ) + ) + except (urllib.error.URLError, urllib.error.HTTPError, ValueError, RuntimeError) as exc: + failures += 1 + state["last_status"] = "unreachable" + state["last_error"] = str(exc) + print("[{}] ERROR: {}".format(_now(), exc), file=sys.stderr) + + state["consecutive_failures"] = failures + state["last_checked"] = _now() + _write_state(args.state_file, state) + + if args.reboot_after_failures > 0 and failures >= args.reboot_after_failures: + _maybe_reboot( + args.allow_reboot, + args.reboot_cmd.split(), + "Consecutive failures ({}) reached reboot threshold ({})".format( + failures, args.reboot_after_failures + ), + ) + sys.exit(2) + + if failures > 0: + sys.exit(1) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/python/ops/recover_service.sh b/python/ops/recover_service.sh new file mode 100644 index 0000000..f6fa01b --- /dev/null +++ b/python/ops/recover_service.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " + exit 1 +fi + +USER_NAME="$1" +SERVICE="connectivity-monitor@${USER_NAME}" + +echo "Reloading systemd units..." +sudo systemctl daemon-reload + +echo "Enabling and restarting ${SERVICE}..." 
+sudo systemctl enable "${SERVICE}" +sudo systemctl restart "${SERVICE}" + +echo "Service status:" +sudo systemctl --no-pager --full status "${SERVICE}" | cat + +echo "Recent logs:" +sudo journalctl --no-pager -u "${SERVICE}" -n 50 | cat + +echo "API health:" +curl -fsS "http://127.0.0.1:8080/api/status" | python3 -m json.tool diff --git a/python/systemd/connectivity-monitor-archive@.service b/python/systemd/connectivity-monitor-archive@.service new file mode 100644 index 0000000..db32f00 --- /dev/null +++ b/python/systemd/connectivity-monitor-archive@.service @@ -0,0 +1,11 @@ +[Unit] +Description=Archive Connectivity Monitor logs/reports for %i +After=connectivity-monitor@%i.service + +[Service] +Type=oneshot +User=%i +Group=%i +WorkingDirectory=%h/ConnectivityMonitor/python +Environment=PYTHONUNBUFFERED=1 +ExecStart=/usr/bin/python3 %h/ConnectivityMonitor/python/ops/archive_artifacts.py --base-dir %h/ConnectivityMonitor --older-than-days 1 --delete-after-archive --keep-archive-days 30 diff --git a/python/systemd/connectivity-monitor-archive@.timer b/python/systemd/connectivity-monitor-archive@.timer new file mode 100644 index 0000000..581cdf5 --- /dev/null +++ b/python/systemd/connectivity-monitor-archive@.timer @@ -0,0 +1,10 @@ +[Unit] +Description=Archive Connectivity Monitor logs/reports daily for %i + +[Timer] +OnCalendar=*-*-* 03:00:00 +Persistent=true +Unit=connectivity-monitor-archive@%i.service + +[Install] +WantedBy=timers.target diff --git a/python/systemd/connectivity-monitor-healthcheck@.service b/python/systemd/connectivity-monitor-healthcheck@.service new file mode 100644 index 0000000..a4c6fea --- /dev/null +++ b/python/systemd/connectivity-monitor-healthcheck@.service @@ -0,0 +1,12 @@ +[Unit] +Description=Connectivity Monitor API health probe for %i +After=connectivity-monitor@%i.service +Requires=connectivity-monitor@%i.service + +[Service] +Type=oneshot +User=%i +Group=%i +WorkingDirectory=%h/ConnectivityMonitor/python 
+Environment=PYTHONUNBUFFERED=1 +ExecStart=/usr/bin/python3 %h/ConnectivityMonitor/python/ops/health_probe.py --url http://127.0.0.1:8080/api/status --min-health 60 --max-loss 10 --state-file %h/ConnectivityMonitor/health_probe_state.json diff --git a/python/systemd/connectivity-monitor-healthcheck@.timer b/python/systemd/connectivity-monitor-healthcheck@.timer new file mode 100644 index 0000000..2656181 --- /dev/null +++ b/python/systemd/connectivity-monitor-healthcheck@.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Run Connectivity Monitor API health probe every minute for %i + +[Timer] +OnBootSec=2m +OnUnitActiveSec=1m +AccuracySec=15s +Unit=connectivity-monitor-healthcheck@%i.service + +[Install] +WantedBy=timers.target From f5586b91fbcb143b4d8eef1e2f1be27eb171f97c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 14 May 2026 05:43:29 +0000 Subject: [PATCH 2/2] Address validation feedback for ops scripts and healthcheck configuration Agent-Logs-Url: https://github.com/nexuspcs/ConnectivityMonitor/sessions/5be8d40f-972c-4648-b122-7c01afdd1f28 Co-authored-by: nexuspcs <69493073+nexuspcs@users.noreply.github.com> --- python/README.md | 19 ++++++++++++ python/ops/archive_artifacts.py | 15 +++++++-- python/ops/health_probe.py | 24 +++++++++++--- python/ops/recover_service.sh | 31 +++++++++++++++++-- .../connectivity-monitor-healthcheck@.service | 5 ++- 5 files changed, 83 insertions(+), 11 deletions(-) diff --git a/python/README.md b/python/README.md index 5613ac9..18362a5 100644 --- a/python/README.md +++ b/python/README.md @@ -138,6 +138,12 @@ Use one of these: - Static IP on Raspberry Pi OS Then use a stable DNS name on your LAN (for example, `connectivity-monitor.local`) if available. +Note: `.local` hostname discovery depends on mDNS (for example `avahi-daemon`) being enabled on the Pi/network. 
+ +```bash +sudo systemctl status avahi-daemon +sudo systemctl enable --now avahi-daemon +``` ### 2) Install hardened service + timers @@ -162,6 +168,17 @@ sudo systemctl enable --now connectivity-monitor-healthcheck@YOUR_USER.timer sudo systemctl enable --now connectivity-monitor-archive@YOUR_USER.timer ``` +If your monitor runs on a non-default web port, override the health-check unit port: + +```bash +sudo systemctl edit connectivity-monitor-healthcheck@YOUR_USER.service +# Add: +# [Service] +# Environment=WEB_PORT=9090 +sudo systemctl daemon-reload +sudo systemctl restart connectivity-monitor-healthcheck@YOUR_USER.timer +``` + ### 3) Reverse proxy for controlled remote access (TLS/auth) Keep the monitor on localhost and publish through a reverse proxy (Nginx/Caddy/Traefik) to add: @@ -194,6 +211,8 @@ python3 ops/health_probe.py \ --allow-reboot ``` +Auto-reboot requires root privileges (or an explicit sudo policy that allows the reboot command non-interactively). + ### Log/report persistence and archival The monitor writes logs and reports under `~/ConnectivityMonitor`. 
diff --git a/python/ops/archive_artifacts.py b/python/ops/archive_artifacts.py index 42b34a8..996dc98 100644 --- a/python/ops/archive_artifacts.py +++ b/python/ops/archive_artifacts.py @@ -58,7 +58,7 @@ def main(): parser.add_argument("--keep-archive-days", type=int, default=30) args = parser.parse_args() - ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%SZ") logs_dir = os.path.join(args.base_dir, "logs") reports_dir = os.path.join(args.base_dir, "reports") archive_dir = os.path.join(args.base_dir, "archive") @@ -72,20 +72,29 @@ def main(): logs_count = _archive_group(logs_files, logs_archive, args.base_dir) reports_count = _archive_group(reports_files, reports_archive, args.base_dir) + delete_errors = [] if args.delete_after_archive: for file_path in logs_files + reports_files: - os.remove(file_path) + try: + os.remove(file_path) + except OSError as exc: + delete_errors.append("{} ({})".format(file_path, exc)) pruned = _prune_archives(archive_dir, args.keep_archive_days) print( - "Archived logs={}, reports={}, deleted_source={}, pruned_archives={}".format( + "Archived logs={}, reports={}, deleted_source={}, pruned_archives={}, delete_errors={}".format( logs_count, reports_count, bool(args.delete_after_archive), pruned, + len(delete_errors), ) ) + if delete_errors: + for err in delete_errors: + print("Delete error: {}".format(err)) + raise SystemExit(1) if __name__ == "__main__": diff --git a/python/ops/health_probe.py b/python/ops/health_probe.py index 39ed7b9..0f9d8e7 100644 --- a/python/ops/health_probe.py +++ b/python/ops/health_probe.py @@ -5,14 +5,17 @@ import datetime import json import os +import shlex import subprocess import sys import urllib.error import urllib.request +APP_VERSION = "4.0" + def _now(): - return datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z" + return datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0).isoformat() def 
_read_state(path): @@ -24,13 +27,17 @@ def _read_state(path): def _write_state(path, state): - os.makedirs(os.path.dirname(path), exist_ok=True) + parent = os.path.dirname(path) + if parent: + os.makedirs(parent, exist_ok=True) with open(path, "w", encoding="utf-8") as f: json.dump(state, f, indent=2) def _fetch_status(url, timeout): - req = urllib.request.Request(url, headers={"User-Agent": "ConnectivityMonitor/4.0 health-probe"}) + req = urllib.request.Request( + url, headers={"User-Agent": "ConnectivityMonitor/{} health-probe".format(APP_VERSION)} + ) with urllib.request.urlopen(req, timeout=timeout) as response: if response.status != 200: raise RuntimeError("Non-200 status: {}".format(response.status)) @@ -56,8 +63,15 @@ def _evaluate(payload, min_health, max_loss): def _maybe_reboot(enabled, command, reason): if not enabled: return + safe_cmd = " ".join(shlex.quote(part) for part in command) print("[{}] WARNING: {}".format(_now(), reason)) - print("[{}] WARNING: Executing reboot command: {}".format(_now(), " ".join(command))) + print("[{}] WARNING: Executing reboot command: {}".format(_now(), safe_cmd)) + if os.geteuid() != 0: + print( + "[{}] ERROR: Reboot requires root privileges or sudo/NOPASSWD policy.".format(_now()), + file=sys.stderr, + ) + return try: subprocess.run(command, check=True) except Exception as exc: @@ -116,7 +130,7 @@ def main(): if args.reboot_after_failures > 0 and failures >= args.reboot_after_failures: _maybe_reboot( args.allow_reboot, - args.reboot_cmd.split(), + shlex.split(args.reboot_cmd), "Consecutive failures ({}) reached reboot threshold ({})".format( failures, args.reboot_after_failures ), diff --git a/python/ops/recover_service.sh b/python/ops/recover_service.sh index f6fa01b..32f007c 100644 --- a/python/ops/recover_service.sh +++ b/python/ops/recover_service.sh @@ -2,12 +2,36 @@ set -euo pipefail if [[ $# -lt 1 ]]; then - echo "Usage: $0 " + echo "Usage: $0 [web_port]" exit 1 fi USER_NAME="$1" 
SERVICE="connectivity-monitor@${USER_NAME}" +PORT="${2:-}" + +if [[ -z "${PORT}" ]]; then + CONFIG_PATH="/home/${USER_NAME}/ConnectivityMonitor/monitor_config.json" + if [[ -f "${CONFIG_PATH}" ]]; then + if ! PORT="$(python3 - "${CONFIG_PATH}" <<'PY' +import json +import sys + +try: + with open(sys.argv[1], "r", encoding="utf-8") as f: + print(json.load(f).get("web_port", 8080)) +except Exception as exc: + print(f"ERROR: could not parse config: {exc}", file=sys.stderr) + sys.exit(1) +PY + )"; then + echo "Falling back to port 8080 due to config parse failure." >&2 + PORT="8080" + fi + else + PORT="8080" + fi +fi echo "Reloading systemd units..." sudo systemctl daemon-reload @@ -23,4 +47,7 @@ echo "Recent logs:" sudo journalctl --no-pager -u "${SERVICE}" -n 50 | cat echo "API health:" -curl -fsS "http://127.0.0.1:8080/api/status" | python3 -m json.tool +if ! curl -fsS "http://127.0.0.1:${PORT}/api/status" | python3 -m json.tool; then + echo "ERROR: API health check failed on port ${PORT}" >&2 + exit 2 +fi diff --git a/python/systemd/connectivity-monitor-healthcheck@.service b/python/systemd/connectivity-monitor-healthcheck@.service index a4c6fea..b972cb0 100644 --- a/python/systemd/connectivity-monitor-healthcheck@.service +++ b/python/systemd/connectivity-monitor-healthcheck@.service @@ -9,4 +9,7 @@ User=%i Group=%i WorkingDirectory=%h/ConnectivityMonitor/python Environment=PYTHONUNBUFFERED=1 -ExecStart=/usr/bin/python3 %h/ConnectivityMonitor/python/ops/health_probe.py --url http://127.0.0.1:8080/api/status --min-health 60 --max-loss 10 --state-file %h/ConnectivityMonitor/health_probe_state.json +Environment=WEB_PORT=8080 +EnvironmentFile=-/etc/default/connectivity-monitor +EnvironmentFile=-/etc/default/connectivity-monitor-%i +ExecStart=/usr/bin/python3 %h/ConnectivityMonitor/python/ops/health_probe.py --url http://127.0.0.1:${WEB_PORT}/api/status --min-health 60 --max-loss 10 --state-file %h/ConnectivityMonitor/health_probe_state.json