From 49f2fc443887fba0aeddf216bc966d0d47fbc483 Mon Sep 17 00:00:00 2001 From: zhangshaozhi Date: Sun, 17 May 2026 20:26:16 +0800 Subject: [PATCH 01/31] fix: make application runnable in production - Dockerfile: include README.md in COPY statement - main.py: add static file serving for built frontend (SPA routing support) - App.vue: wrap template with Naive UI message/dialog/notification providers Co-Authored-By: Claude Sonnet 4.6 --- Dockerfile | 2 +- protoforge/main.py | 10 ++++++++++ web/src/App.vue | 8 +++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index babbf5f..2528ade 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ WORKDIR /app RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/* -COPY pyproject.toml . +COPY pyproject.toml README.md ./ COPY protoforge/ protoforge/ COPY --from=frontend-builder /app/web/dist /app/static diff --git a/protoforge/main.py b/protoforge/main.py index 894c5bc..0e43951 100644 --- a/protoforge/main.py +++ b/protoforge/main.py @@ -1,8 +1,11 @@ import logging from contextlib import asynccontextmanager +from pathlib import Path from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse +from fastapi.staticfiles import StaticFiles from protoforge.api.v1.router import router from protoforge.core.engine import SimulationEngine @@ -184,12 +187,19 @@ def create_app() -> FastAPI: @app.get("/") async def root(): + index = Path("/app/static/index.html") + if index.exists(): + return FileResponse(index) return { "name": "ProtoForge", "version": "0.1.0", "description": "物联网协议仿真与测试平台", } + static_dir = Path("/app/static") + if static_dir.exists(): + app.mount("/assets", StaticFiles(directory=static_dir / "assets"), name="assets") + @app.get("/health") async def health(): return {"status": "ok"} diff --git a/web/src/App.vue b/web/src/App.vue index ae6d315..22eabe9 100644 --- a/web/src/App.vue +++ b/web/src/App.vue @@ -1,4 +1,7 @@ From 550d8e20b98481a5390be716104142d6aec87770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Wed, 20 May 2026 19:57:06 +0800 Subject: [PATCH 17/31] feat(ai): support ai --- ai/predict.py | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100755 ai/predict.py diff --git a/ai/predict.py b/ai/predict.py new file mode 100755 index 0000000..b70f822 --- /dev/null +++ b/ai/predict.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +import requests +import numpy as np +from datetime import datetime, timedelta + +VM_URL = "http://localhost:8428" +DEVICE_ID = "fanuc-cnc" +METRIC = f'feed_rate{{device_id="{DEVICE_ID}"}}' + +def fetch_history(minutes=30): + """从VM拉取历史数据""" + end = datetime.now() + start = end - timedelta(minutes=minutes) + resp = requests.get(f"{VM_URL}/api/v1/query_range", params={ + "query": METRIC, + "start": start.timestamp(), + "end": end.timestamp(), + "step": "1s", + }) + result = resp.json()["data"]["result"] + if not result: + return [], [] + values = result[0]["values"] + ts = [float(v[0]) for v in values] + ys = [float(v[1]) for v in values] + return ts, ys + +def predict_next(ts, ys, horizon=60): + """ + 用FFT检测主频,拟合正弦波,外推未来horizon秒 + 适合周期性信号 + """ + if len(ys) < 60: + return [], [] + + ys = np.array(ys) + n = len(ys) + dt = 1.0 # 1秒采样 + + # FFT找主频 + fft = np.fft.rfft(ys - ys.mean()) + freqs = np.fft.rfftfreq(n, d=dt) + dominant_idx = np.argmax(np.abs(fft[1:])) + 1 + dominant_freq = freqs[dominant_idx] + period = 1.0 / dominant_freq if dominant_freq > 0 else 60 + + # 拟合:y = A*sin(2π/T * t + φ) + offset + from scipy.optimize import curve_fit + t_rel = np.arange(n, dtype=float) + offset = ys.mean() + amplitude = (ys.max() - ys.min()) / 2 + + def sine_model(t, A, T, phi, C): + return A * np.sin(2 * np.pi / T * t + phi) + C + + try: + popt, _ = curve_fit( + sine_model, t_rel, ys, + p0=[amplitude, period, 0, offset], + maxfev=5000 + ) + # 外推 + t_future = np.arange(n, n + horizon, dtype=float) + y_pred = sine_model(t_future, *popt) + ts_future = [ts[-1] + i + 1 for i in range(horizon)] + return ts_future, y_pred.tolist() + except Exception: + # 拟合失败降级为线性 + slope = (ys[-1] - ys[-10]) / 10 + ts_future = [ts[-1] + i + 1 for i in range(horizon)] + y_pred = [ys[-1] + slope * (i + 1) for i in range(horizon)] + return ts_future, y_pred + +def write_predictions(ts_future, y_pred, metric_name="protoforge_feed_rate_predicted"): + """写回VictoriaMetrics""" + lines = [] + for t, y in zip(ts_future, y_pred): + ts_ms = int(t * 1000) + lines.append(f'{metric_name}{{device_id="{DEVICE_ID}"}} {y:.2f} {ts_ms}') + payload = "\n".join(lines) + requests.post(f"{VM_URL}/api/v1/import/prometheus", data=payload) + +def run_once(): + ts, ys = fetch_history(minutes=30) + if len(ys) < 60: + print("数据不足") + return + ts_future, y_pred = predict_next(ts, ys, horizon=120) + write_predictions(ts_future, y_pred) + print(f"写入 {len(y_pred)} 个预测点,预测到 +{len(y_pred)}s") + +if __name__ == "__main__": + import time + while True: + run_once() + time.sleep(30) # 每30秒重新预测一次 From e8f70d09c27d53bff6c8f310b7a2064496632a33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Wed, 20 May 2026 20:29:13 +0800 Subject: [PATCH 18/31] feat(predict_v2): add predict_v2 python file --- ai/predict_v2.py | 206 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100755 ai/predict_v2.py diff --git a/ai/predict_v2.py b/ai/predict_v2.py new file mode 100755 index 0000000..df5dd97 --- /dev/null +++ b/ai/predict_v2.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- +""" +ProtoForge 预测服务 v2 +从 VictoriaMetrics 拉取历史数据,用 FFT + 正弦拟合预测未来值,写回 VM。 +预测值时间戳为未来时间,Grafana 中预测线出现在实测线右侧延伸处。 +""" + +import logging +import time +from datetime import datetime, timedelta + +import numpy as np +import requests +from scipy.optimize import curve_fit + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) +logger = logging.getLogger(__name__) + +# ── 配置 ────────────────────────────────────────────────────────────────────── +VM_URL = "http://localhost:8428" + +# 要预测的指标列表,每项:(查询表达式, 写回指标名) +PREDICT_TARGETS = [ + ('feed_rate{device_id="fanuc-cnc"}', "feed_rate_predicted"), + ('spindle_speed{device_id="fanuc-cnc"}', "spindle_speed_predicted"), + ('spindle_current{device_id="fanuc-cnc"}', "spindle_current_predicted"), + ('vibration_x{device_id="fanuc-cnc"}', "vibration_x_predicted"), + ('vibration_y{device_id="fanuc-cnc"}', "vibration_y_predicted"), + ('vibration_z{device_id="fanuc-cnc"}', "vibration_z_predicted"), +] + +HISTORY_MINUTES = 30 # 拉取多少分钟历史数据用于拟合 +HORIZON_SECONDS = 120 # 预测未来多少秒 +POLL_INTERVAL = 30 # 每隔多少秒重新预测一次 +MIN_POINTS = 120 # 至少需要多少个历史点才开始预测 +# ───────────────────────────────────────────────────────────────────────────── + + +def fetch_history(query: str, minutes: int = HISTORY_MINUTES): + """从 VictoriaMetrics 拉取历史时序数据,返回 (timestamps, values)。""" + now = datetime.now() + start = now - timedelta(minutes=minutes) + try: + resp = requests.get( + f"{VM_URL}/api/v1/query_range", + params={ + "query": query, + "start": start.timestamp(), + "end": now.timestamp(), + "step": "1s", + }, + timeout=10, + ) + resp.raise_for_status() + except requests.RequestException as e: + logger.error("拉取数据失败 query=%s: %s", query, e) + return [], [] + + result = resp.json().get("data", {}).get("result", []) + if not result: + return [], [] + + values = result[0]["values"] + ts = [float(v[0]) for v in values] + ys = [float(v[1]) for v in values] + return ts, ys + + +def _sine_model(t, A, T, phi, C): + return A * np.sin(2 * np.pi / T * t + phi) + C + + +def predict_next(ts: list, ys: list, horizon: int = HORIZON_SECONDS): + """ + 用 FFT 检测主频,拟合正弦波,外推未来 horizon 秒。 + 返回 (future_timestamps, predicted_values),时间戳均在最后一个真实点之后。 + 降级策略:拟合失败时用最近 10 点线性外推。 + """ + ys_arr = np.array(ys) + n = len(ys_arr) + + # ── FFT 找主频 ──────────────────────────────────────────────────────────── + fft_vals = np.fft.rfft(ys_arr - ys_arr.mean()) + freqs = np.fft.rfftfreq(n, d=1.0) # d=1 表示 1 秒采样间隔 + # 跳过直流分量(index 0) + dominant_idx = int(np.argmax(np.abs(fft_vals[1:]))) + 1 + dominant_freq = freqs[dominant_idx] + period = 1.0 / dominant_freq if dominant_freq > 0 else 60.0 + period = float(np.clip(period, 5.0, 3600.0)) # 限制在合理范围 + + # ── 正弦拟合 ────────────────────────────────────────────────────────────── + t_rel = np.arange(n, dtype=float) + amplitude = (ys_arr.max() - ys_arr.min()) / 2.0 + offset = float(ys_arr.mean()) + + # 最后一个真实数据点的 Unix 时间戳(秒) + last_ts = ts[-1] + + try: + popt, _ = curve_fit( + _sine_model, + t_rel, + ys_arr, + p0=[amplitude, period, 0.0, offset], + bounds=( + [0, 5.0, -np.pi, ys_arr.min()], + [np.inf, 3600.0, np.pi, ys_arr.max()], + ), + maxfev=8000, + ) + t_future = np.arange(n, n + horizon, dtype=float) + y_pred = _sine_model(t_future, *popt) + # 裁剪到历史数据值域,避免外推飞出合理范围 + y_pred = np.clip(y_pred, ys_arr.min() * 0.5, ys_arr.max() * 1.5) + + # 未来时间戳:last_ts + 1s, +2s, ..., +horizon s + ts_future = [last_ts + i + 1 for i in range(horizon)] + logger.debug("正弦拟合成功 period=%.1fs amplitude=%.2f", popt[1], popt[0]) + return ts_future, y_pred.tolist() + + except Exception as e: + logger.warning("正弦拟合失败,降级为线性外推: %s", e) + tail = min(10, n) + slope = (ys_arr[-1] - ys_arr[-tail]) / tail + ts_future = [last_ts + i + 1 for i in range(horizon)] + y_pred = [float(ys_arr[-1] + slope * (i + 1)) for i in range(horizon)] + return ts_future, y_pred + + +def write_predictions(ts_future: list, y_pred: list, metric_name: str, extra_labels: dict = None): + """ + 将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。 + 时间戳为毫秒级 Unix 时间戳,对应未来时间点。 + """ + label_str = "" + if extra_labels: + parts = [f'{k}="{v}"' for k, v in extra_labels.items()] + label_str = "{" + ",".join(parts) + "}" + + lines = [] + for t, y in zip(ts_future, y_pred): + ts_ms = int(t * 1000) + lines.append(f"{metric_name}{label_str} {y:.4f} {ts_ms}") + + payload = "\n".join(lines) + try: + resp = requests.post( + f"{VM_URL}/api/v1/import/prometheus", + data=payload, + timeout=10, + ) + resp.raise_for_status() + except requests.RequestException as e: + logger.error("写入预测数据失败 metric=%s: %s", metric_name, e) + + +def _parse_labels(query: str) -> dict: + """从查询表达式中解析标签,如 feed_rate{device_id="fanuc-cnc"} → {"device_id": "fanuc-cnc"}""" + labels = {} + if "{" not in query: + return labels + label_part = query[query.index("{") + 1: query.index("}")] + for item in label_part.split(","): + if "=" in item: + k, v = item.split("=", 1) + labels[k.strip()] = v.strip().strip('"') + return labels + + +def run_once(): + now_str = datetime.now().strftime("%H:%M:%S") + for query, pred_metric in PREDICT_TARGETS: + ts, ys = fetch_history(query) + if len(ys) < MIN_POINTS: + logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) + continue + + ts_future, y_pred = predict_next(ts, ys, horizon=HORIZON_SECONDS) + if not ts_future: + continue + + extra_labels = _parse_labels(query) + write_predictions(ts_future, y_pred, pred_metric, extra_labels) + + future_time = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") + logger.info( + "[%s] %-40s → %-35s 写入 %d 点,预测至 %s", + now_str, query, pred_metric, len(y_pred), future_time, + ) + + +def main(): + logger.info( + "预测服务启动 VM=%s 预测窗口=%ds 轮询间隔=%ds", + VM_URL, HORIZON_SECONDS, POLL_INTERVAL, + ) + while True: + run_once() + time.sleep(POLL_INTERVAL) + + +if __name__ == "__main__": + main() From 57df20284645347aba7de4e2399640e05b6d0b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Wed, 20 May 2026 21:13:52 +0800 Subject: [PATCH 19/31] fix --- ai/predict_v2.py | 536 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 450 insertions(+), 86 deletions(-) diff --git a/ai/predict_v2.py b/ai/predict_v2.py index df5dd97..bc425c8 100755 --- a/ai/predict_v2.py +++ b/ai/predict_v2.py @@ -1,56 +1,93 @@ # -*- coding: utf-8 -*- """ -ProtoForge 预测服务 v2 -从 VictoriaMetrics 拉取历史数据,用 FFT + 正弦拟合预测未来值,写回 VM。 -预测值时间戳为未来时间,Grafana 中预测线出现在实测线右侧延伸处。 +ProtoForge 预测服务 v3 + +修复点: +1. 解决 HORIZON_SECONDS > POLL_INTERVAL 时,多轮预测窗口重叠导致 Grafana 出现毛刺/竖线问题。 +2. 每轮写入新预测前,删除同一个预测 metric 的旧预测序列,只保留最新一轮预测。 +3. 预测时间戳按整秒写入,避免毫秒时间戳和 Grafana step 不对齐。 +4. 拟合使用真实 timestamp 相对时间,不再假设历史数据严格 1 秒等间隔。 +5. 对历史数据做排序、去重、NaN/Inf 清洗。 """ import logging +import math +import re import time from datetime import datetime, timedelta +from typing import Dict, List, Tuple import numpy as np import requests from scipy.optimize import curve_fit + logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", ) + logger = logging.getLogger(__name__) + # ── 配置 ────────────────────────────────────────────────────────────────────── + VM_URL = "http://localhost:8428" -# 要预测的指标列表,每项:(查询表达式, 写回指标名) PREDICT_TARGETS = [ - ('feed_rate{device_id="fanuc-cnc"}', "feed_rate_predicted"), - ('spindle_speed{device_id="fanuc-cnc"}', "spindle_speed_predicted"), - ('spindle_current{device_id="fanuc-cnc"}', "spindle_current_predicted"), - ('vibration_x{device_id="fanuc-cnc"}', "vibration_x_predicted"), - ('vibration_y{device_id="fanuc-cnc"}', "vibration_y_predicted"), - ('vibration_z{device_id="fanuc-cnc"}', "vibration_z_predicted"), + ('feed_rate{device_id="fanuc-cnc"}', "feed_rate_predicted"), + ('spindle_speed{device_id="fanuc-cnc"}', "spindle_speed_predicted"), + ('spindle_current{device_id="fanuc-cnc"}', "spindle_current_predicted"), + ('vibration_x{device_id="fanuc-cnc"}', "vibration_x_predicted"), + ('vibration_y{device_id="fanuc-cnc"}', "vibration_y_predicted"), + ('vibration_z{device_id="fanuc-cnc"}', "vibration_z_predicted"), ] -HISTORY_MINUTES = 30 # 拉取多少分钟历史数据用于拟合 -HORIZON_SECONDS = 120 # 预测未来多少秒 -POLL_INTERVAL = 30 # 每隔多少秒重新预测一次 -MIN_POINTS = 120 # 至少需要多少个历史点才开始预测 +HISTORY_MINUTES = 30 +HORIZON_SECONDS = 120 +POLL_INTERVAL = 30 +MIN_POINTS = 120 +QUERY_STEP = "1s" + +# 关键修复:每轮写入前删除旧预测,避免 120s 预测窗口和 30s 轮询周期重叠 +CLEAR_OLD_PREDICTIONS = True + +# 如果删除旧预测失败,是否跳过本轮写入。 +# 建议 True,避免继续叠加脏数据。 +SKIP_WRITE_IF_CLEAR_FAILED = True + +# 给新预测数据加一个稳定标签,方便 Grafana 查询过滤。 +# Grafana 可以查询:feed_rate_predicted{device_id="fanuc-cnc",forecast="latest"} +EXTRA_PREDICT_LABELS = { + "forecast": "latest", + "source": "protoforge", +} + +# 正弦周期限制 +MIN_PERIOD_SECONDS = 5.0 +MAX_PERIOD_SECONDS = 3600.0 + # ───────────────────────────────────────────────────────────────────────────── -def fetch_history(query: str, minutes: int = HISTORY_MINUTES): - """从 VictoriaMetrics 拉取历史时序数据,返回 (timestamps, values)。""" +def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]: + """ + 从 VictoriaMetrics 拉取历史时序数据。 + 返回: + timestamps: Unix 秒级时间戳 + values: float 数值 + """ now = datetime.now() start = now - timedelta(minutes=minutes) + try: resp = requests.get( f"{VM_URL}/api/v1/query_range", params={ "query": query, "start": start.timestamp(), - "end": now.timestamp(), - "step": "1s", + "end": now.timestamp(), + "step": QUERY_STEP, }, timeout=10, ) @@ -59,148 +96,475 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES): logger.error("拉取数据失败 query=%s: %s", query, e) return [], [] - result = resp.json().get("data", {}).get("result", []) + try: + result = resp.json().get("data", {}).get("result", []) + except Exception as e: + logger.error("解析 VM 返回失败 query=%s: %s", query, e) + return [], [] + if not result: return [], [] - values = result[0]["values"] - ts = [float(v[0]) for v in values] - ys = [float(v[1]) for v in values] + values = result[0].get("values", []) + if not values: + return [], [] + + ts = [] + ys = [] + + for item in values: + if len(item) < 2: + continue + + try: + t = float(item[0]) + y = float(item[1]) + except Exception: + continue + + if not math.isfinite(t) or not math.isfinite(y): + continue + + ts.append(t) + ys.append(y) + return ts, ys -def _sine_model(t, A, T, phi, C): - return A * np.sin(2 * np.pi / T * t + phi) + C +def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]: + """ + 清洗历史数据: + 1. 转换为整秒时间戳 + 2. 排序 + 3. 同一秒多个值时保留最后一个 + 4. 插值补齐中间缺失秒 + """ + if not ts or not ys or len(ts) != len(ys): + return np.array([]), np.array([]) + + data = {} + + for t, y in zip(ts, ys): + try: + sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(sec) or not math.isfinite(val): + continue + + data[sec] = val + + if not data: + return np.array([]), np.array([]) + + sorted_items = sorted(data.items(), key=lambda x: x[0]) + ts_clean = np.array([x[0] for x in sorted_items], dtype=float) + ys_clean = np.array([x[1] for x in sorted_items], dtype=float) -def predict_next(ts: list, ys: list, horizon: int = HORIZON_SECONDS): + if len(ts_clean) < 2: + return ts_clean, ys_clean + + start_sec = int(ts_clean[0]) + end_sec = int(ts_clean[-1]) + + if end_sec <= start_sec: + return ts_clean, ys_clean + + # 统一为 1 秒网格,减少 query_range 缺点、抖动、缺失点对 FFT 的影响 + ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float) + ys_grid = np.interp(ts_grid, ts_clean, ys_clean) + + return ts_grid, ys_grid + + +def _sine_model(t: np.ndarray, A: float, T: float, phi: float, C: float) -> np.ndarray: + return A * np.sin(2.0 * np.pi / T * t + phi) + C + + +def estimate_period_by_fft(ys_arr: np.ndarray) -> float: """ - 用 FFT 检测主频,拟合正弦波,外推未来 horizon 秒。 - 返回 (future_timestamps, predicted_values),时间戳均在最后一个真实点之后。 - 降级策略:拟合失败时用最近 10 点线性外推。 + 使用 FFT 估算主周期。 + ys_arr 默认是 1 秒间隔。 """ - ys_arr = np.array(ys) n = len(ys_arr) - # ── FFT 找主频 ──────────────────────────────────────────────────────────── - fft_vals = np.fft.rfft(ys_arr - ys_arr.mean()) - freqs = np.fft.rfftfreq(n, d=1.0) # d=1 表示 1 秒采样间隔 - # 跳过直流分量(index 0) - dominant_idx = int(np.argmax(np.abs(fft_vals[1:]))) + 1 - dominant_freq = freqs[dominant_idx] - period = 1.0 / dominant_freq if dominant_freq > 0 else 60.0 - period = float(np.clip(period, 5.0, 3600.0)) # 限制在合理范围 + if n < 4: + return 60.0 + + centered = ys_arr - np.mean(ys_arr) + + if np.allclose(centered, 0): + return 60.0 + + fft_vals = np.fft.rfft(centered) + freqs = np.fft.rfftfreq(n, d=1.0) + + if len(freqs) <= 1: + return 60.0 + + # 跳过直流分量 index 0 + power = np.abs(fft_vals[1:]) + if len(power) == 0 or np.max(power) <= 0: + return 60.0 + + dominant_idx = int(np.argmax(power)) + 1 + dominant_freq = float(freqs[dominant_idx]) + + if dominant_freq <= 0: + return 60.0 + + period = 1.0 / dominant_freq + period = float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + return period - # ── 正弦拟合 ────────────────────────────────────────────────────────────── - t_rel = np.arange(n, dtype=float) - amplitude = (ys_arr.max() - ys_arr.min()) / 2.0 - offset = float(ys_arr.mean()) - # 最后一个真实数据点的 Unix 时间戳(秒) - last_ts = ts[-1] +def predict_next( + ts: List[float], + ys: List[float], + horizon: int = HORIZON_SECONDS, + start_from_now: bool = True, +) -> Tuple[List[float], List[float]]: + """ + 用 FFT 检测主频,拟合正弦波,外推未来 horizon 秒。 + 返回: + future_timestamps: 未来整秒时间戳 + predicted_values: 预测值 + """ + ts_grid, ys_grid = normalize_history(ts, ys) + + if len(ys_grid) < MIN_POINTS: + return [], [] + + n = len(ys_grid) + + y_min = float(np.min(ys_grid)) + y_max = float(np.max(ys_grid)) + y_mean = float(np.mean(ys_grid)) + y_range = y_max - y_min + + # 数据几乎不波动时,直接使用最后一个值保持 + if y_range <= 1e-9: + base_ts = int(time.time()) if start_from_now else int(ts_grid[-1]) + base_ts = max(base_ts, int(ts_grid[-1])) + + ts_future = [base_ts + i + 1 for i in range(horizon)] + y_pred = [float(ys_grid[-1])] * horizon + return ts_future, y_pred + + period = estimate_period_by_fft(ys_grid) + + # 用真实时间戳做相对时间,而不是 np.arange(n) + t_fit = ts_grid - ts_grid[0] + + amplitude = y_range / 2.0 + offset = y_mean + + # 预测起点统一对齐到整秒 + if start_from_now: + base_ts = int(time.time()) + else: + base_ts = int(ts_grid[-1]) + + # 避免因为 VM 查询延迟导致预测点落在最后一个真实点之前 + base_ts = max(base_ts, int(ts_grid[-1])) + + ts_future_arr = np.arange(base_ts + 1, base_ts + 1 + horizon, 1, dtype=float) + t_future = ts_future_arr - ts_grid[0] try: popt, _ = curve_fit( _sine_model, - t_rel, - ys_arr, + t_fit, + ys_grid, p0=[amplitude, period, 0.0, offset], bounds=( - [0, 5.0, -np.pi, ys_arr.min()], - [np.inf, 3600.0, np.pi, ys_arr.max()], + [0.0, MIN_PERIOD_SECONDS, -2.0 * np.pi, y_min - y_range], + [np.inf, MAX_PERIOD_SECONDS, 2.0 * np.pi, y_max + y_range], ), - maxfev=8000, + maxfev=12000, + ) + + y_pred_arr = _sine_model(t_future, *popt) + + # 裁剪到合理范围,避免拟合异常时飞出去 + margin = y_range * 0.2 + lower = y_min - margin + upper = y_max + margin + y_pred_arr = np.clip(y_pred_arr, lower, upper) + + if not np.all(np.isfinite(y_pred_arr)): + raise ValueError("预测结果包含 NaN/Inf") + + logger.debug( + "正弦拟合成功 period=%.2fs amplitude=%.4f offset=%.4f", + popt[1], + popt[0], + popt[3], ) - t_future = np.arange(n, n + horizon, dtype=float) - y_pred = _sine_model(t_future, *popt) - # 裁剪到历史数据值域,避免外推飞出合理范围 - y_pred = np.clip(y_pred, ys_arr.min() * 0.5, ys_arr.max() * 1.5) - # 未来时间戳:last_ts + 1s, +2s, ..., +horizon s - ts_future = [last_ts + i + 1 for i in range(horizon)] - logger.debug("正弦拟合成功 period=%.1fs amplitude=%.2f", popt[1], popt[0]) - return ts_future, y_pred.tolist() + return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist() except Exception as e: - logger.warning("正弦拟合失败,降级为线性外推: %s", e) + logger.warning("正弦拟合失败,降级为最近值平滑外推: %s", e) + + # 降级策略:用最近 10 个点的均值保持,避免线性外推越走越偏 tail = min(10, n) - slope = (ys_arr[-1] - ys_arr[-tail]) / tail - ts_future = [last_ts + i + 1 for i in range(horizon)] - y_pred = [float(ys_arr[-1] + slope * (i + 1)) for i in range(horizon)] + last_value = float(np.mean(ys_grid[-tail:])) + + ts_future = ts_future_arr.tolist() + y_pred = [last_value] * horizon + return ts_future, y_pred -def write_predictions(ts_future: list, y_pred: list, metric_name: str, extra_labels: dict = None): +def prom_escape_label_value(value: str) -> str: + """ + Prometheus exposition label value 转义。 + """ + return ( + str(value) + .replace("\\", "\\\\") + .replace("\n", "\\n") + .replace('"', '\\"') + ) + + +def build_selector(metric_name: str, labels: Dict[str, str]) -> str: + """ + 构造 PromQL selector,用于 delete_series。 + + 示例: + feed_rate_predicted{device_id="fanuc-cnc"} + """ + if not labels: + return metric_name + + parts = [] + for k in sorted(labels.keys()): + v = prom_escape_label_value(labels[k]) + parts.append(f'{k}="{v}"') + + return f'{metric_name}' + "{" + ",".join(parts) + "}" + + +def delete_old_predictions(metric_name: str, base_labels: Dict[str, str]) -> bool: + """ + 删除旧预测序列,避免多轮预测窗口重叠。 + + 注意: + 这里故意只用 base_labels,比如 device_id。 + 不带 forecast/source 标签,是为了兼容旧版本脚本写入的无 forecast 标签数据。 + """ + selector = build_selector(metric_name, base_labels) + + try: + resp = requests.post( + f"{VM_URL}/api/v1/admin/tsdb/delete_series", + params=[("match[]", selector)], + timeout=10, + ) + + if resp.status_code not in (200, 204): + logger.error( + "删除旧预测数据失败 metric=%s selector=%s status=%s body=%s", + metric_name, + selector, + resp.status_code, + resp.text[:500], + ) + return False + + logger.debug("已删除旧预测数据 selector=%s", selector) + return True + + except requests.RequestException as e: + logger.error("删除旧预测数据异常 metric=%s selector=%s: %s", metric_name, selector, e) + return False + + +def write_predictions( + ts_future: List[float], + y_pred: List[float], + metric_name: str, + labels: Dict[str, str] = None, +) -> bool: """ 将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。 - 时间戳为毫秒级 Unix 时间戳,对应未来时间点。 + 时间戳为毫秒级 Unix timestamp。 """ + if labels is None: + labels = {} + + if not ts_future or not y_pred or len(ts_future) != len(y_pred): + logger.warning("预测数据为空或长度不一致 metric=%s", metric_name) + return False + label_str = "" - if extra_labels: - parts = [f'{k}="{v}"' for k, v in extra_labels.items()] + if labels: + parts = [] + for k in sorted(labels.keys()): + v = prom_escape_label_value(labels[k]) + parts.append(f'{k}="{v}"') label_str = "{" + ",".join(parts) + "}" lines = [] + for t, y in zip(ts_future, y_pred): - ts_ms = int(t * 1000) - lines.append(f"{metric_name}{label_str} {y:.4f} {ts_ms}") + try: + ts_sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(ts_sec) or not math.isfinite(val): + continue + + ts_ms = ts_sec * 1000 + lines.append(f"{metric_name}{label_str} {val:.6f} {ts_ms}") + + if not lines: + logger.warning("没有可写入的预测点 metric=%s", metric_name) + return False + + payload = "\n".join(lines) + "\n" - payload = "\n".join(lines) try: resp = requests.post( f"{VM_URL}/api/v1/import/prometheus", - data=payload, + data=payload.encode("utf-8"), + headers={ + "Content-Type": "text/plain; version=0.0.4; charset=utf-8", + }, timeout=10, ) resp.raise_for_status() + return True + except requests.RequestException as e: logger.error("写入预测数据失败 metric=%s: %s", metric_name, e) + return False + + +_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*') -def _parse_labels(query: str) -> dict: - """从查询表达式中解析标签,如 feed_rate{device_id="fanuc-cnc"} → {"device_id": "fanuc-cnc"}""" +def _parse_labels(query: str) -> Dict[str, str]: + """ + 从查询表达式中解析标签。 + + 示例: + feed_rate{device_id="fanuc-cnc"} -> {"device_id": "fanuc-cnc"} + """ labels = {} - if "{" not in query: + + if "{" not in query or "}" not in query: return labels - label_part = query[query.index("{") + 1: query.index("}")] - for item in label_part.split(","): - if "=" in item: - k, v = item.split("=", 1) - labels[k.strip()] = v.strip().strip('"') + + try: + label_part = query[query.index("{") + 1: query.rindex("}")] + except Exception: + return labels + + for match in _LABEL_PATTERN.finditer(label_part): + key = match.group(1) + value = match.group(2) + value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\") + labels[key] = value + return labels +def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]: + result = {} + + for d in dicts: + if not d: + continue + result.update(d) + + return result + + def run_once(): now_str = datetime.now().strftime("%H:%M:%S") + for query, pred_metric in PREDICT_TARGETS: ts, ys = fetch_history(query) + if len(ys) < MIN_POINTS: logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) continue - ts_future, y_pred = predict_next(ts, ys, horizon=HORIZON_SECONDS) - if not ts_future: + ts_future, y_pred = predict_next( + ts, + ys, + horizon=HORIZON_SECONDS, + start_from_now=True, + ) + + if not ts_future or not y_pred: + logger.warning("[%s] %s 预测结果为空,跳过", now_str, query) + continue + + base_labels = _parse_labels(query) + + # 先删除旧预测,再写入新预测。 + # 删除条件只带 base_labels,兼容老版本无 forecast/source 标签的脏数据。 + if CLEAR_OLD_PREDICTIONS: + clear_ok = delete_old_predictions(pred_metric, base_labels) + + if not clear_ok and SKIP_WRITE_IF_CLEAR_FAILED: + logger.error( + "[%s] %s 删除旧预测失败,为避免继续制造重叠数据,本轮跳过写入", + now_str, + pred_metric, + ) + continue + + write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) + + ok = write_predictions( + ts_future=ts_future, + y_pred=y_pred, + metric_name=pred_metric, + labels=write_labels, + ) + + if not ok: continue - extra_labels = _parse_labels(query) - write_predictions(ts_future, y_pred, pred_metric, extra_labels) + future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S") + future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") - future_time = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") logger.info( - "[%s] %-40s → %-35s 写入 %d 点,预测至 %s", - now_str, query, pred_metric, len(y_pred), future_time, + "[%s] %-40s → %-35s 写入 %d 点,预测区间 %s ~ %s", + now_str, + query, + pred_metric, + len(y_pred), + future_start, + future_end, ) def main(): logger.info( - "预测服务启动 VM=%s 预测窗口=%ds 轮询间隔=%ds", - VM_URL, HORIZON_SECONDS, POLL_INTERVAL, + "预测服务启动 VM=%s 历史窗口=%dmin 预测窗口=%ds 轮询间隔=%ds 清理旧预测=%s", + VM_URL, + HISTORY_MINUTES, + HORIZON_SECONDS, + POLL_INTERVAL, + CLEAR_OLD_PREDICTIONS, ) + while True: run_once() time.sleep(POLL_INTERVAL) if __name__ == "__main__": - main() + main() \ No newline at end of file From 88aec295671ca112fd422a28acd4d76d43a82f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Wed, 20 May 2026 21:21:11 +0800 Subject: [PATCH 20/31] fix --- ai/predict_v2.py | 263 +++++++++++++++++++++++------------------------ 1 file changed, 128 insertions(+), 135 deletions(-) diff --git a/ai/predict_v2.py b/ai/predict_v2.py index bc425c8..f631e12 100755 --- a/ai/predict_v2.py +++ b/ai/predict_v2.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- """ -ProtoForge 预测服务 v3 +ProtoForge 预测服务 v4 修复点: -1. 解决 HORIZON_SECONDS > POLL_INTERVAL 时,多轮预测窗口重叠导致 Grafana 出现毛刺/竖线问题。 -2. 每轮写入新预测前,删除同一个预测 metric 的旧预测序列,只保留最新一轮预测。 -3. 预测时间戳按整秒写入,避免毫秒时间戳和 Grafana step 不对齐。 -4. 拟合使用真实 timestamp 相对时间,不再假设历史数据严格 1 秒等间隔。 -5. 对历史数据做排序、去重、NaN/Inf 清洗。 +1. 不再使用 VictoriaMetrics delete_series,避免预测历史被整条删除。 +2. 不再每 30 秒写未来 120 秒,避免多轮预测窗口重叠导致 Grafana 出现竖线/毛刺。 +3. 每轮只写未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒的数据。 +4. 使用 forecast="rolling_v2" 新标签,避免和上一版 forecast="latest" 的旧预测数据混在一起。 +5. 使用真实 timestamp 做拟合,不假设采样严格等间隔。 +6. 拟合失败时不再简单写平直线,而是尽量重复最近一个周期的波形。 """ import logging @@ -44,29 +45,35 @@ ] HISTORY_MINUTES = 30 + +# 理论预测窗口 HORIZON_SECONDS = 120 + +# 轮询间隔 POLL_INTERVAL = 30 + +# 实际写入窗口。 +# 关键点:实际写入窗口不要大于轮询间隔,否则不同批次预测会重叠。 +WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) + MIN_POINTS = 120 QUERY_STEP = "1s" -# 关键修复:每轮写入前删除旧预测,避免 120s 预测窗口和 30s 轮询周期重叠 -CLEAR_OLD_PREDICTIONS = True - -# 如果删除旧预测失败,是否跳过本轮写入。 -# 建议 True,避免继续叠加脏数据。 -SKIP_WRITE_IF_CLEAR_FAILED = True +# 不要再清理旧预测,否则历史预测会被整条删除。 +CLEAR_OLD_PREDICTIONS = False -# 给新预测数据加一个稳定标签,方便 Grafana 查询过滤。 -# Grafana 可以查询:feed_rate_predicted{device_id="fanuc-cnc",forecast="latest"} +# 使用新标签,避免和上一版 forecast="latest" 数据混在一起。 EXTRA_PREDICT_LABELS = { - "forecast": "latest", + "forecast": "rolling_v2", "source": "protoforge", } -# 正弦周期限制 MIN_PERIOD_SECONDS = 5.0 MAX_PERIOD_SECONDS = 3600.0 +# 进程内记录每条预测序列上次写到哪里,避免本进程运行期间重复写同一时间段 +LAST_WRITTEN_UNTIL: Dict[str, int] = {} + # ───────────────────────────────────────────────────────────────────────────── @@ -134,10 +141,10 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]: """ 清洗历史数据: - 1. 转换为整秒时间戳 + 1. 时间戳转为整秒 2. 排序 3. 同一秒多个值时保留最后一个 - 4. 插值补齐中间缺失秒 + 4. 插值补齐缺失秒 """ if not ts or not ys or len(ts) != len(ys): return np.array([]), np.array([]) @@ -173,7 +180,6 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np. if end_sec <= start_sec: return ts_clean, ys_clean - # 统一为 1 秒网格,减少 query_range 缺点、抖动、缺失点对 FFT 的影响 ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float) ys_grid = np.interp(ts_grid, ts_clean, ys_clean) @@ -187,7 +193,7 @@ def _sine_model(t: np.ndarray, A: float, T: float, phi: float, C: float) -> np.n def estimate_period_by_fft(ys_arr: np.ndarray) -> float: """ 使用 FFT 估算主周期。 - ys_arr 默认是 1 秒间隔。 + ys_arr 默认已经是 1 秒间隔。 """ n = len(ys_arr) @@ -205,8 +211,8 @@ def estimate_period_by_fft(ys_arr: np.ndarray) -> float: if len(freqs) <= 1: return 60.0 - # 跳过直流分量 index 0 power = np.abs(fft_vals[1:]) + if len(power) == 0 or np.max(power) <= 0: return 60.0 @@ -222,59 +228,84 @@ def estimate_period_by_fft(ys_arr: np.ndarray) -> float: return period +def repeat_last_period( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + ts_future_arr: np.ndarray, + period_seconds: float, +) -> np.ndarray: + """ + 拟合失败时的降级策略: + 不直接写平直线,而是把未来时间映射回最近一个周期的历史波形。 + """ + if len(ts_grid) < 2: + return np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float) + + period = max(int(round(period_seconds)), 1) + + y_pred = [] + + hist_start = float(ts_grid[0]) + hist_end = float(ts_grid[-1]) + + for future_ts in ts_future_arr: + mapped_ts = float(future_ts) + + while mapped_ts > hist_end: + mapped_ts -= period + + while mapped_ts < hist_start: + mapped_ts += period + + val = float(np.interp(mapped_ts, ts_grid, ys_grid)) + y_pred.append(val) + + return np.array(y_pred, dtype=float) + + def predict_next( ts: List[float], ys: List[float], - horizon: int = HORIZON_SECONDS, - start_from_now: bool = True, + horizon: int, + base_ts: int, ) -> Tuple[List[float], List[float]]: """ 用 FFT 检测主频,拟合正弦波,外推未来 horizon 秒。 - 返回: - future_timestamps: 未来整秒时间戳 - predicted_values: 预测值 + + base_ts: + 从 base_ts + 1 开始写预测。 """ ts_grid, ys_grid = normalize_history(ts, ys) if len(ys_grid) < MIN_POINTS: return [], [] - n = len(ys_grid) - y_min = float(np.min(ys_grid)) y_max = float(np.max(ys_grid)) y_mean = float(np.mean(ys_grid)) y_range = y_max - y_min - # 数据几乎不波动时,直接使用最后一个值保持 - if y_range <= 1e-9: - base_ts = int(time.time()) if start_from_now else int(ts_grid[-1]) - base_ts = max(base_ts, int(ts_grid[-1])) + base_ts = max(int(base_ts), int(ts_grid[-1])) + + ts_future_arr = np.arange( + base_ts + 1, + base_ts + 1 + horizon, + 1, + dtype=float, + ) - ts_future = [base_ts + i + 1 for i in range(horizon)] - y_pred = [float(ys_grid[-1])] * horizon - return ts_future, y_pred + if y_range <= 1e-9: + y_pred_arr = np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float) + return ts_future_arr.tolist(), y_pred_arr.tolist() period = estimate_period_by_fft(ys_grid) - # 用真实时间戳做相对时间,而不是 np.arange(n) t_fit = ts_grid - ts_grid[0] + t_future = ts_future_arr - ts_grid[0] amplitude = y_range / 2.0 offset = y_mean - # 预测起点统一对齐到整秒 - if start_from_now: - base_ts = int(time.time()) - else: - base_ts = int(ts_grid[-1]) - - # 避免因为 VM 查询延迟导致预测点落在最后一个真实点之前 - base_ts = max(base_ts, int(ts_grid[-1])) - - ts_future_arr = np.arange(base_ts + 1, base_ts + 1 + horizon, 1, dtype=float) - t_future = ts_future_arr - ts_grid[0] - try: popt, _ = curve_fit( _sine_model, @@ -290,7 +321,6 @@ def predict_next( y_pred_arr = _sine_model(t_future, *popt) - # 裁剪到合理范围,避免拟合异常时飞出去 margin = y_range * 0.2 lower = y_min - margin upper = y_max + margin @@ -309,16 +339,21 @@ def predict_next( return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist() except Exception as e: - logger.warning("正弦拟合失败,降级为最近值平滑外推: %s", e) + logger.warning("正弦拟合失败,降级为最近周期波形复制: %s", e) - # 降级策略:用最近 10 个点的均值保持,避免线性外推越走越偏 - tail = min(10, n) - last_value = float(np.mean(ys_grid[-tail:])) + y_pred_arr = repeat_last_period( + ts_grid=ts_grid, + ys_grid=ys_grid, + ts_future_arr=ts_future_arr, + period_seconds=period, + ) - ts_future = ts_future_arr.tolist() - y_pred = [last_value] * horizon + margin = y_range * 0.2 + lower = y_min - margin + upper = y_max + margin + y_pred_arr = np.clip(y_pred_arr, lower, upper) - return ts_future, y_pred + return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist() def prom_escape_label_value(value: str) -> str: @@ -333,83 +368,34 @@ def prom_escape_label_value(value: str) -> str: ) -def build_selector(metric_name: str, labels: Dict[str, str]) -> str: - """ - 构造 PromQL selector,用于 delete_series。 - - 示例: - feed_rate_predicted{device_id="fanuc-cnc"} - """ +def labels_to_str(labels: Dict[str, str]) -> str: if not labels: - return metric_name + return "" parts = [] + for k in sorted(labels.keys()): v = prom_escape_label_value(labels[k]) parts.append(f'{k}="{v}"') - return f'{metric_name}' + "{" + ",".join(parts) + "}" - - -def delete_old_predictions(metric_name: str, base_labels: Dict[str, str]) -> bool: - """ - 删除旧预测序列,避免多轮预测窗口重叠。 - - 注意: - 这里故意只用 base_labels,比如 device_id。 - 不带 forecast/source 标签,是为了兼容旧版本脚本写入的无 forecast 标签数据。 - """ - selector = build_selector(metric_name, base_labels) - - try: - resp = requests.post( - f"{VM_URL}/api/v1/admin/tsdb/delete_series", - params=[("match[]", selector)], - timeout=10, - ) - - if resp.status_code not in (200, 204): - logger.error( - "删除旧预测数据失败 metric=%s selector=%s status=%s body=%s", - metric_name, - selector, - resp.status_code, - resp.text[:500], - ) - return False - - logger.debug("已删除旧预测数据 selector=%s", selector) - return True - - except requests.RequestException as e: - logger.error("删除旧预测数据异常 metric=%s selector=%s: %s", metric_name, selector, e) - return False + return "{" + ",".join(parts) + "}" def write_predictions( ts_future: List[float], y_pred: List[float], metric_name: str, - labels: Dict[str, str] = None, + labels: Dict[str, str], ) -> bool: """ 将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。 时间戳为毫秒级 Unix timestamp。 """ - if labels is None: - labels = {} - if not ts_future or not y_pred or len(ts_future) != len(y_pred): logger.warning("预测数据为空或长度不一致 metric=%s", metric_name) return False - label_str = "" - if labels: - parts = [] - for k in sorted(labels.keys()): - v = prom_escape_label_value(labels[k]) - parts.append(f'{k}="{v}"') - label_str = "{" + ",".join(parts) + "}" + label_str = labels_to_str(labels) lines = [] @@ -449,7 +435,9 @@ def write_predictions( return False -_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*') +_LABEL_PATTERN = re.compile( + r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*' +) def _parse_labels(query: str) -> Dict[str, str]: @@ -489,6 +477,13 @@ def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]: return result +def series_key(metric_name: str, labels: Dict[str, str]) -> str: + """ + 构造进程内唯一 key,用于记录上次写到哪个时间点。 + """ + return metric_name + labels_to_str(labels) + + def run_once(): now_str = datetime.now().strftime("%H:%M:%S") @@ -499,34 +494,28 @@ def run_once(): logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) continue + base_labels = _parse_labels(query) + write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) + + key = series_key(pred_metric, write_labels) + + now_sec = int(time.time()) + last_until = LAST_WRITTEN_UNTIL.get(key, 0) + + # 防止同一进程内重复写入已经预测过的时间段 + base_ts = max(now_sec, last_until) + ts_future, y_pred = predict_next( - ts, - ys, - horizon=HORIZON_SECONDS, - start_from_now=True, + ts=ts, + ys=ys, + horizon=WRITE_HORIZON_SECONDS, + base_ts=base_ts, ) if not ts_future or not y_pred: logger.warning("[%s] %s 预测结果为空,跳过", now_str, query) continue - base_labels = _parse_labels(query) - - # 先删除旧预测,再写入新预测。 - # 删除条件只带 base_labels,兼容老版本无 forecast/source 标签的脏数据。 - if CLEAR_OLD_PREDICTIONS: - clear_ok = delete_old_predictions(pred_metric, base_labels) - - if not clear_ok and SKIP_WRITE_IF_CLEAR_FAILED: - logger.error( - "[%s] %s 删除旧预测失败,为避免继续制造重叠数据,本轮跳过写入", - now_str, - pred_metric, - ) - continue - - write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) - ok = write_predictions( ts_future=ts_future, y_pred=y_pred, @@ -537,26 +526,30 @@ def run_once(): if not ok: continue + LAST_WRITTEN_UNTIL[key] = int(max(ts_future)) + future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S") future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") logger.info( - "[%s] %-40s → %-35s 写入 %d 点,预测区间 %s ~ %s", + "[%s] %-40s → %-35s 写入 %d 点,预测区间 %s ~ %s,标签=%s", now_str, query, pred_metric, len(y_pred), future_start, future_end, + labels_to_str(write_labels), ) def main(): logger.info( - "预测服务启动 VM=%s 历史窗口=%dmin 预测窗口=%ds 轮询间隔=%ds 清理旧预测=%s", + "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds 清理旧预测=%s", VM_URL, HISTORY_MINUTES, HORIZON_SECONDS, + WRITE_HORIZON_SECONDS, POLL_INTERVAL, CLEAR_OLD_PREDICTIONS, ) From 72d5c092018c5caf59f1a9f6ae556e6eff24ecca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Wed, 20 May 2026 21:35:15 +0800 Subject: [PATCH 21/31] fix --- ai/predict_v2.py | 368 ++++++++++++++++++++++++----------------------- 1 file changed, 188 insertions(+), 180 deletions(-) diff --git a/ai/predict_v2.py b/ai/predict_v2.py index f631e12..933a34f 100755 --- a/ai/predict_v2.py +++ b/ai/predict_v2.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- """ -ProtoForge 预测服务 v4 +ProtoForge 预测服务 v5 修复点: -1. 不再使用 VictoriaMetrics delete_series,避免预测历史被整条删除。 -2. 不再每 30 秒写未来 120 秒,避免多轮预测窗口重叠导致 Grafana 出现竖线/毛刺。 -3. 每轮只写未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒的数据。 -4. 使用 forecast="rolling_v2" 新标签,避免和上一版 forecast="latest" 的旧预测数据混在一起。 -5. 使用真实 timestamp 做拟合,不假设采样严格等间隔。 -6. 拟合失败时不再简单写平直线,而是尽量重复最近一个周期的波形。 +1. 不再使用“单正弦拟合”作为主预测算法。 +2. 主算法改为:周期模板预测(同相位历史值加权平均)。 +3. 周期估计使用 FFT 粗估 + 自相关细化,比单纯 FFT 更稳。 +4. 若可用完整周期不足,则降级为多谐波回归(而不是单正弦)。 +5. 每轮只写入未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒,避免预测窗口重叠。 +6. 不删除旧预测历史,避免历史预测消失。 """ import logging @@ -20,17 +20,13 @@ import numpy as np import requests -from scipy.optimize import curve_fit - logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", ) - logger = logging.getLogger(__name__) - # ── 配置 ────────────────────────────────────────────────────────────────────── VM_URL = "http://localhost:8428" @@ -45,45 +41,36 @@ ] HISTORY_MINUTES = 30 - -# 理论预测窗口 HORIZON_SECONDS = 120 - -# 轮询间隔 POLL_INTERVAL = 30 - -# 实际写入窗口。 -# 关键点:实际写入窗口不要大于轮询间隔,否则不同批次预测会重叠。 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) - MIN_POINTS = 120 QUERY_STEP = "1s" -# 不要再清理旧预测,否则历史预测会被整条删除。 -CLEAR_OLD_PREDICTIONS = False +# 至少要有多少个完整周期,才使用“周期模板预测” +MIN_FULL_CYCLES_FOR_TEMPLATE = 3 +MAX_CYCLES_FOR_TEMPLATE = 6 + +# 周期范围 +MIN_PERIOD_SECONDS = 5 +MAX_PERIOD_SECONDS = 3600 + +# 多谐波回归最高阶数(降级模式) +MAX_HARMONICS = 4 -# 使用新标签,避免和上一版 forecast="latest" 数据混在一起。 EXTRA_PREDICT_LABELS = { - "forecast": "rolling_v2", + "forecast": "seasonal_v1", "source": "protoforge", } -MIN_PERIOD_SECONDS = 5.0 -MAX_PERIOD_SECONDS = 3600.0 - -# 进程内记录每条预测序列上次写到哪里,避免本进程运行期间重复写同一时间段 +# 进程内记录每条预测序列上次写到哪里,避免本进程运行时重复写 LAST_WRITTEN_UNTIL: Dict[str, int] = {} # ───────────────────────────────────────────────────────────────────────────── def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]: - """ - 从 VictoriaMetrics 拉取历史时序数据。 - 返回: - timestamps: Unix 秒级时间戳 - values: float 数值 - """ + """从 VictoriaMetrics 拉取历史时序数据。""" now = datetime.now() start = now - timedelta(minutes=minutes) @@ -118,20 +105,16 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa ts = [] ys = [] - for item in values: if len(item) < 2: continue - try: t = float(item[0]) y = float(item[1]) except Exception: continue - if not math.isfinite(t) or not math.isfinite(y): continue - ts.append(t) ys.append(y) @@ -141,33 +124,29 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]: """ 清洗历史数据: - 1. 时间戳转为整秒 + 1. 时间戳整秒化 2. 排序 - 3. 同一秒多个值时保留最后一个 - 4. 插值补齐缺失秒 + 3. 同一秒多个点保留最后一个 + 4. 按 1 秒插值补齐 """ if not ts or not ys or len(ts) != len(ys): return np.array([]), np.array([]) data = {} - for t, y in zip(ts, ys): try: sec = int(round(float(t))) val = float(y) except Exception: continue - if not math.isfinite(sec) or not math.isfinite(val): continue - data[sec] = val if not data: return np.array([]), np.array([]) sorted_items = sorted(data.items(), key=lambda x: x[0]) - ts_clean = np.array([x[0] for x in sorted_items], dtype=float) ys_clean = np.array([x[1] for x in sorted_items], dtype=float) @@ -186,22 +165,13 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np. return ts_grid, ys_grid -def _sine_model(t: np.ndarray, A: float, T: float, phi: float, C: float) -> np.ndarray: - return A * np.sin(2.0 * np.pi / T * t + phi) + C - - def estimate_period_by_fft(ys_arr: np.ndarray) -> float: - """ - 使用 FFT 估算主周期。 - ys_arr 默认已经是 1 秒间隔。 - """ + """FFT 粗估周期。""" n = len(ys_arr) - - if n < 4: + if n < 8: return 60.0 centered = ys_arr - np.mean(ys_arr) - if np.allclose(centered, 0): return 60.0 @@ -212,55 +182,139 @@ def estimate_period_by_fft(ys_arr: np.ndarray) -> float: return 60.0 power = np.abs(fft_vals[1:]) - if len(power) == 0 or np.max(power) <= 0: return 60.0 dominant_idx = int(np.argmax(power)) + 1 dominant_freq = float(freqs[dominant_idx]) - if dominant_freq <= 0: return 60.0 period = 1.0 / dominant_freq - period = float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float: + """ + 用自相关在 init_period 附近细化周期估计。 + """ + n = len(ys_arr) + if n < 20: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + centered = ys_arr - np.mean(ys_arr) + if np.allclose(centered, 0): + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) - return period + corr = np.correlate(centered, centered, mode="full")[n - 1:] + p0 = int(round(init_period)) + left = max(MIN_PERIOD_SECONDS, int(max(2, p0 * 0.7))) + right = min(n // 2, int(max(left + 1, p0 * 1.3))) -def repeat_last_period( - ts_grid: np.ndarray, - ys_grid: np.ndarray, - ts_future_arr: np.ndarray, - period_seconds: float, -) -> np.ndarray: + if right <= left: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + search = corr[left:right + 1] + if len(search) == 0: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + best_lag = left + int(np.argmax(search)) + return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def estimate_period(ys_arr: np.ndarray) -> float: + """FFT + 自相关 的组合周期估计。""" + p_fft = estimate_period_by_fft(ys_arr) + p_refined = refine_period_by_autocorr(ys_arr, p_fft) + return p_refined + + +def seasonal_template_predict( + ys_arr: np.ndarray, + horizon: int, + period: int, + gap: int = 0, + max_cycles: int = MAX_CYCLES_FOR_TEMPLATE, +) -> List[float]: """ - 拟合失败时的降级策略: - 不直接写平直线,而是把未来时间映射回最近一个周期的历史波形。 + 同相位历史值加权平均预测。 + 对未来第 k 个点,取过去多个周期同相位点做加权平均: + y[n-1+gap+k] ≈ avg(y[n-1+gap+k-p], y[n-1+gap+k-2p], ...) """ - if len(ts_grid) < 2: - return np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float) + n = len(ys_arr) + preds = [] - period = max(int(round(period_seconds)), 1) + for k in range(1, horizon + 1): + target_idx = (n - 1) + gap + k - y_pred = [] + values = [] + weights = [] - hist_start = float(ts_grid[0]) - hist_end = float(ts_grid[-1]) + # m=1 表示最近一个周期;m 越大越久远 + for m in range(1, max_cycles + 1): + hist_idx = target_idx - m * period + if 0 <= hist_idx < n: + # 越近权重越大 + w = 1.0 / m + values.append(float(ys_arr[hist_idx])) + weights.append(w) - for future_ts in ts_future_arr: - mapped_ts = float(future_ts) + if not values: + # 万一拿不到,退化为最后一个值 + preds.append(float(ys_arr[-1])) + else: + preds.append(float(np.average(values, weights=weights))) - while mapped_ts > hist_end: - mapped_ts -= period + return preds - while mapped_ts < hist_start: - mapped_ts += period - val = float(np.interp(mapped_ts, ts_grid, ys_grid)) - y_pred.append(val) +def harmonic_regression_predict( + ys_arr: np.ndarray, + horizon: int, + period: int, + gap: int = 0, + max_harmonics: int = MAX_HARMONICS, +) -> List[float]: + """ + 多谐波回归(降级模式): + y = c + Σ [a_k sin(2πkt/P) + b_k cos(2πkt/P)] + 相比单正弦,更能表达非标准正弦波形。 + """ + n = len(ys_arr) + if n < 10 or period <= 1: + return [float(ys_arr[-1])] * horizon + + # 周期太短时,谐波数不能太大 + K = min(max_harmonics, max(1, period // 4)) + + t = np.arange(n, dtype=float) + cols = [np.ones(n, dtype=float)] + + for k in range(1, K + 1): + angle = 2.0 * np.pi * k * t / period + cols.append(np.sin(angle)) + cols.append(np.cos(angle)) + + X = np.column_stack(cols) + + try: + coef, _, _, _ = np.linalg.lstsq(X, ys_arr, rcond=None) + except Exception: + return [float(ys_arr[-1])] * horizon + + t_future = np.arange(n + gap, n + gap + horizon, dtype=float) + cols_future = [np.ones(horizon, dtype=float)] - return np.array(y_pred, dtype=float) + for k in range(1, K + 1): + angle = 2.0 * np.pi * k * t_future / period + cols_future.append(np.sin(angle)) + cols_future.append(np.cos(angle)) + + X_future = np.column_stack(cols_future) + y_pred = X_future @ coef + + return y_pred.astype(float).tolist() def predict_next( @@ -270,96 +324,74 @@ def predict_next( base_ts: int, ) -> Tuple[List[float], List[float]]: """ - 用 FFT 检测主频,拟合正弦波,外推未来 horizon 秒。 - - base_ts: - 从 base_ts + 1 开始写预测。 + 主预测函数: + 1. 周期估计 + 2. 优先使用周期模板预测 + 3. 周期不够时降级为多谐波回归 """ ts_grid, ys_grid = normalize_history(ts, ys) - if len(ys_grid) < MIN_POINTS: return [], [] y_min = float(np.min(ys_grid)) y_max = float(np.max(ys_grid)) - y_mean = float(np.mean(ys_grid)) y_range = y_max - y_min - base_ts = max(int(base_ts), int(ts_grid[-1])) - - ts_future_arr = np.arange( - base_ts + 1, - base_ts + 1 + horizon, - 1, - dtype=float, - ) - if y_range <= 1e-9: - y_pred_arr = np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float) - return ts_future_arr.tolist(), y_pred_arr.tolist() + base_ts = max(int(base_ts), int(ts_grid[-1])) + ts_future = [base_ts + i + 1 for i in range(horizon)] + y_pred = [float(ys_grid[-1])] * horizon + return ts_future, y_pred - period = estimate_period_by_fft(ys_grid) + period_est = estimate_period(ys_grid) + period = int(round(period_est)) + period = max(MIN_PERIOD_SECONDS, min(MAX_PERIOD_SECONDS, period)) - t_fit = ts_grid - ts_grid[0] - t_future = ts_future_arr - ts_grid[0] + last_real_ts = int(ts_grid[-1]) + base_ts = max(int(base_ts), last_real_ts) - amplitude = y_range / 2.0 - offset = y_mean + # 如果当前时间已经超过最后一个真实点,gap 表示中间“空过去”的秒数 + gap = max(0, base_ts - last_real_ts) - try: - popt, _ = curve_fit( - _sine_model, - t_fit, - ys_grid, - p0=[amplitude, period, 0.0, offset], - bounds=( - [0.0, MIN_PERIOD_SECONDS, -2.0 * np.pi, y_min - y_range], - [np.inf, MAX_PERIOD_SECONDS, 2.0 * np.pi, y_max + y_range], - ), - maxfev=12000, - ) + ts_future = [base_ts + i + 1 for i in range(horizon)] - y_pred_arr = _sine_model(t_future, *popt) + full_cycles = len(ys_grid) // period if period > 0 else 0 - margin = y_range * 0.2 - lower = y_min - margin - upper = y_max + margin - y_pred_arr = np.clip(y_pred_arr, lower, upper) - - if not np.all(np.isfinite(y_pred_arr)): - raise ValueError("预测结果包含 NaN/Inf") - - logger.debug( - "正弦拟合成功 period=%.2fs amplitude=%.4f offset=%.4f", - popt[1], - popt[0], - popt[3], + if full_cycles >= MIN_FULL_CYCLES_FOR_TEMPLATE: + y_pred = seasonal_template_predict( + ys_arr=ys_grid, + horizon=horizon, + period=period, + gap=gap, + max_cycles=min(MAX_CYCLES_FOR_TEMPLATE, full_cycles), ) - - return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist() - - except Exception as e: - logger.warning("正弦拟合失败,降级为最近周期波形复制: %s", e) - - y_pred_arr = repeat_last_period( - ts_grid=ts_grid, - ys_grid=ys_grid, - ts_future_arr=ts_future_arr, - period_seconds=period, + model_name = "seasonal_template" + else: + y_pred = harmonic_regression_predict( + ys_arr=ys_grid, + horizon=horizon, + period=period, + gap=gap, + max_harmonics=MAX_HARMONICS, ) + model_name = "harmonic_regression" + + # 合理裁剪,避免偶然外推过大 + margin = y_range * 0.15 + lower = y_min - margin + upper = y_max + margin + y_pred = np.clip(np.array(y_pred, dtype=float), lower, upper).astype(float).tolist() - margin = y_range * 0.2 - lower = y_min - margin - upper = y_max + margin - y_pred_arr = np.clip(y_pred_arr, lower, upper) + logger.debug( + "predict_next model=%s period=%ss full_cycles=%s gap=%s", + model_name, period, full_cycles, gap + ) - return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist() + return ts_future, y_pred def prom_escape_label_value(value: str) -> str: - """ - Prometheus exposition label value 转义。 - """ + """Prometheus label value 转义。""" return ( str(value) .replace("\\", "\\\\") @@ -371,13 +403,10 @@ def prom_escape_label_value(value: str) -> str: def labels_to_str(labels: Dict[str, str]) -> str: if not labels: return "" - parts = [] - for k in sorted(labels.keys()): v = prom_escape_label_value(labels[k]) parts.append(f'{k}="{v}"') - return "{" + ",".join(parts) + "}" @@ -387,16 +416,12 @@ def write_predictions( metric_name: str, labels: Dict[str, str], ) -> bool: - """ - 将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。 - 时间戳为毫秒级 Unix timestamp。 - """ + """将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。""" if not ts_future or not y_pred or len(ts_future) != len(y_pred): logger.warning("预测数据为空或长度不一致 metric=%s", metric_name) return False label_str = labels_to_str(labels) - lines = [] for t, y in zip(ts_future, y_pred): @@ -422,14 +447,11 @@ def write_predictions( resp = requests.post( f"{VM_URL}/api/v1/import/prometheus", data=payload.encode("utf-8"), - headers={ - "Content-Type": "text/plain; version=0.0.4; charset=utf-8", - }, + headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, timeout=10, ) resp.raise_for_status() return True - except requests.RequestException as e: logger.error("写入预测数据失败 metric=%s: %s", metric_name, e) return False @@ -441,12 +463,7 @@ def write_predictions( def _parse_labels(query: str) -> Dict[str, str]: - """ - 从查询表达式中解析标签。 - - 示例: - feed_rate{device_id="fanuc-cnc"} -> {"device_id": "fanuc-cnc"} - """ + """从查询表达式中解析标签。""" labels = {} if "{" not in query or "}" not in query: @@ -468,19 +485,13 @@ def _parse_labels(query: str) -> Dict[str, str]: def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]: result = {} - for d in dicts: - if not d: - continue - result.update(d) - + if d: + result.update(d) return result def series_key(metric_name: str, labels: Dict[str, str]) -> str: - """ - 构造进程内唯一 key,用于记录上次写到哪个时间点。 - """ return metric_name + labels_to_str(labels) @@ -489,7 +500,6 @@ def run_once(): for query, pred_metric in PREDICT_TARGETS: ts, ys = fetch_history(query) - if len(ys) < MIN_POINTS: logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) continue @@ -502,7 +512,7 @@ def run_once(): now_sec = int(time.time()) last_until = LAST_WRITTEN_UNTIL.get(key, 0) - # 防止同一进程内重复写入已经预测过的时间段 + # 避免同一进程内写重叠时间段 base_ts = max(now_sec, last_until) ts_future, y_pred = predict_next( @@ -522,7 +532,6 @@ def run_once(): metric_name=pred_metric, labels=write_labels, ) - if not ok: continue @@ -545,13 +554,12 @@ def run_once(): def main(): logger.info( - "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds 清理旧预测=%s", + "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds", VM_URL, HISTORY_MINUTES, HORIZON_SECONDS, WRITE_HORIZON_SECONDS, POLL_INTERVAL, - CLEAR_OLD_PREDICTIONS, ) while True: From c26b9991d4cfac374d56829b47023b043d270aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Thu, 21 May 2026 09:03:00 +0800 Subject: [PATCH 22/31] feat(ai): support single scene predict --- ai/predict_v3_single_scene.py | 1058 +++++++++++++++++++++++++++++++++ 1 file changed, 1058 insertions(+) create mode 100644 ai/predict_v3_single_scene.py diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py new file mode 100644 index 0000000..23af8c5 --- /dev/null +++ b/ai/predict_v3_single_scene.py @@ -0,0 +1,1058 @@ +# -*- coding: utf-8 -*- +""" +ProtoForge 预测服务 v6 + +核心能力: +1. 周期模板预测:适合 CNC 这类强周期、非标准正弦波形。 +2. 健康基线冻结:检测到异常后,不再用故障数据更新预测模板。 +3. 恢复冷却机制:故障恢复后,需要连续稳定多个周期,才恢复学习。 +4. 预测上下界:写入 predicted_upper / predicted_lower,方便 Grafana 展示预测带。 +5. 异常标记:写入 xxx_anomaly,1 表示异常,0 表示正常。 +6. 不删除历史预测,不使用 delete_series。 +""" + +""" +场景:不考虑物料、不考虑跨程序场景算法预测 +""" + +import json +import logging +import math +import os +import re +import time +from dataclasses import asdict, dataclass +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple + +import numpy as np +import requests + + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) + +logger = logging.getLogger(__name__) + + +# ── 基础配置 ────────────────────────────────────────────────────────────────── + +VM_URL = "http://localhost:8428" + +STATE_FILE = "/tmp/protoforge_predictor_state.json" + +HISTORY_MINUTES = 30 +HORIZON_SECONDS = 120 +POLL_INTERVAL = 30 + +# 实际每轮写入未来多少秒。 +# 不要大于 POLL_INTERVAL,否则多轮预测会重叠。 +WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) + +QUERY_STEP = "1s" +MIN_POINTS = 120 + +MIN_PERIOD_SECONDS = 5 +MAX_PERIOD_SECONDS = 3600 + +# 至少多少个完整周期才允许构建健康模板 +MIN_FULL_CYCLES_FOR_TEMPLATE = 3 + +# 构建模板最多使用最近多少个周期 +MAX_CYCLES_FOR_TEMPLATE = 6 + +# 检测异常使用最近多少秒实际数据 +DETECT_WINDOW_SECONDS = 15 + +# 恢复后,至少连续正常多少秒才考虑恢复学习 +RECOVERY_MIN_SECONDS = 60 + +# 健康状态下模板更新速度,越小越保守 +HEALTHY_EMA_ALPHA = 0.15 + +# 故障恢复后第一次重新学习时的更新速度 +RECOVERY_EMA_ALPHA = 0.35 + +# 最近窗口里有多少比例的点超过阈值,才认为异常 +OUTSIDE_RATIO_THRESHOLD = 0.60 + +# 最近窗口里有多少比例的点回到阈值内,才认为恢复正常 +RECOVERY_INSIDE_RATIO_THRESHOLD = 0.80 + + +# ── 指标配置 ────────────────────────────────────────────────────────────────── +# abs_threshold / rel_threshold 需要按指标单位调。 +# feed_rate 单位 mm/min,这里先给 400 和 25%。 + +PREDICT_TARGETS = [ + { + "query": 'feed_rate{device_id="fanuc-cnc"}', + "pred_metric": "feed_rate_predicted", + "anomaly_metric": "feed_rate_anomaly", + "abs_threshold": 400.0, + "rel_threshold": 0.25, + }, + { + "query": 'spindle_speed{device_id="fanuc-cnc"}', + "pred_metric": "spindle_speed_predicted", + "anomaly_metric": "spindle_speed_anomaly", + "abs_threshold": 500.0, + "rel_threshold": 0.25, + }, + { + "query": 'spindle_current{device_id="fanuc-cnc"}', + "pred_metric": "spindle_current_predicted", + "anomaly_metric": "spindle_current_anomaly", + "abs_threshold": 5.0, + "rel_threshold": 0.25, + }, + { + "query": 'vibration_x{device_id="fanuc-cnc"}', + "pred_metric": "vibration_x_predicted", + "anomaly_metric": "vibration_x_anomaly", + "abs_threshold": 1.0, + "rel_threshold": 0.30, + }, + { + "query": 'vibration_y{device_id="fanuc-cnc"}', + "pred_metric": "vibration_y_predicted", + "anomaly_metric": "vibration_y_anomaly", + "abs_threshold": 1.0, + "rel_threshold": 0.30, + }, + { + "query": 'vibration_z{device_id="fanuc-cnc"}', + "pred_metric": "vibration_z_predicted", + "anomaly_metric": "vibration_z_anomaly", + "abs_threshold": 1.0, + "rel_threshold": 0.30, + }, +] + +EXTRA_PREDICT_LABELS = { + "forecast": "health_gated_v1", + "source": "protoforge", +} + +BASELINE_STATUS_HEALTHY = "healthy" +BASELINE_STATUS_ANOMALY = "anomaly" +BASELINE_STATUS_RECOVERING = "recovering" +BASELINE_STATUS_LEARNING = "learning" + + +# ── 状态结构 ────────────────────────────────────────────────────────────────── + +@dataclass +class BaselineState: + period: int + template: List[float] + status: str + clean_seconds: int + last_update_ts: int + last_seen_ts: int + y_min: float + y_max: float + + +BASELINE_STATES: Dict[str, BaselineState] = {} +LAST_WRITTEN_UNTIL: Dict[str, int] = {} + + +# ── VM 读取 ─────────────────────────────────────────────────────────────────── + +def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]: + now = datetime.now() + start = now - timedelta(minutes=minutes) + + try: + resp = requests.get( + f"{VM_URL}/api/v1/query_range", + params={ + "query": query, + "start": start.timestamp(), + "end": now.timestamp(), + "step": QUERY_STEP, + }, + timeout=10, + ) + resp.raise_for_status() + except requests.RequestException as e: + logger.error("拉取数据失败 query=%s: %s", query, e) + return [], [] + + try: + result = resp.json().get("data", {}).get("result", []) + except Exception as e: + logger.error("解析 VM 返回失败 query=%s: %s", query, e) + return [], [] + + if not result: + return [], [] + + values = result[0].get("values", []) + if not values: + return [], [] + + ts = [] + ys = [] + + for item in values: + if len(item) < 2: + continue + + try: + t = float(item[0]) + y = float(item[1]) + except Exception: + continue + + if not math.isfinite(t) or not math.isfinite(y): + continue + + ts.append(t) + ys.append(y) + + return ts, ys + + +def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]: + if not ts or not ys or len(ts) != len(ys): + return np.array([]), np.array([]) + + data = {} + + for t, y in zip(ts, ys): + try: + sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(sec) or not math.isfinite(val): + continue + + data[sec] = val + + if not data: + return np.array([]), np.array([]) + + sorted_items = sorted(data.items(), key=lambda x: x[0]) + + ts_clean = np.array([x[0] for x in sorted_items], dtype=float) + ys_clean = np.array([x[1] for x in sorted_items], dtype=float) + + if len(ts_clean) < 2: + return ts_clean, ys_clean + + start_sec = int(ts_clean[0]) + end_sec = int(ts_clean[-1]) + + if end_sec <= start_sec: + return ts_clean, ys_clean + + ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float) + ys_grid = np.interp(ts_grid, ts_clean, ys_clean) + + return ts_grid, ys_grid + + +# ── 周期估计 ────────────────────────────────────────────────────────────────── + +def estimate_period_by_fft(ys_arr: np.ndarray) -> float: + n = len(ys_arr) + + if n < 8: + return 60.0 + + centered = ys_arr - np.mean(ys_arr) + + if np.allclose(centered, 0): + return 60.0 + + fft_vals = np.fft.rfft(centered) + freqs = np.fft.rfftfreq(n, d=1.0) + + if len(freqs) <= 1: + return 60.0 + + power = np.abs(fft_vals[1:]) + + if len(power) == 0 or np.max(power) <= 0: + return 60.0 + + dominant_idx = int(np.argmax(power)) + 1 + dominant_freq = float(freqs[dominant_idx]) + + if dominant_freq <= 0: + return 60.0 + + period = 1.0 / dominant_freq + + return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float: + n = len(ys_arr) + + if n < 20: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + centered = ys_arr - np.mean(ys_arr) + + if np.allclose(centered, 0): + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + corr = np.correlate(centered, centered, mode="full")[n - 1:] + + p0 = int(round(init_period)) + left = max(MIN_PERIOD_SECONDS, int(max(2, p0 * 0.7))) + right = min(n // 2, int(max(left + 1, p0 * 1.3))) + + if right <= left: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + search = corr[left:right + 1] + + if len(search) == 0: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + best_lag = left + int(np.argmax(search)) + + return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def estimate_period(ys_arr: np.ndarray) -> int: + p_fft = estimate_period_by_fft(ys_arr) + p_refined = refine_period_by_autocorr(ys_arr, p_fft) + + period = int(round(p_refined)) + period = max(MIN_PERIOD_SECONDS, min(MAX_PERIOD_SECONDS, period)) + + return int(period) + + +# ── 模板构建与预测 ───────────────────────────────────────────────────────────── + +def fill_template_nan(template: np.ndarray) -> np.ndarray: + period = len(template) + + if period == 0: + return template + + idx = np.arange(period) + valid = np.isfinite(template) + + if not np.any(valid): + return np.zeros(period, dtype=float) + + if np.all(valid): + return template + + x_valid = idx[valid] + y_valid = template[valid] + + # 环形插值,处理 phase 0 附近缺口 + x_ext = np.concatenate([x_valid - period, x_valid, x_valid + period]) + y_ext = np.concatenate([y_valid, y_valid, y_valid]) + + filled = np.interp(idx, x_ext, y_ext) + + return filled.astype(float) + + +def build_phase_template( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + period: int, + max_cycles: int = MAX_CYCLES_FOR_TEMPLATE, + tail_seconds: Optional[int] = None, +) -> Optional[np.ndarray]: + if period <= 1 or len(ys_grid) < period * MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + max_seconds = period * max_cycles + + if tail_seconds is not None: + max_seconds = min(max_seconds, int(tail_seconds)) + + max_seconds = max(period * MIN_FULL_CYCLES_FOR_TEMPLATE, max_seconds) + + if len(ys_grid) < max_seconds: + start_idx = 0 + else: + start_idx = len(ys_grid) - max_seconds + + ts_tail = ts_grid[start_idx:] + ys_tail = ys_grid[start_idx:] + + if len(ys_tail) < period * MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + sums = np.zeros(period, dtype=float) + weights = np.zeros(period, dtype=float) + + total = len(ys_tail) + + for i, (t, y) in enumerate(zip(ts_tail, ys_tail)): + phase = int(t) % period + + # 越近的数据权重越高 + recency = (i + 1) / total + weight = 0.3 + 0.7 * recency + + sums[phase] += float(y) * weight + weights[phase] += weight + + template = np.full(period, np.nan, dtype=float) + + valid = weights > 0 + template[valid] = sums[valid] / weights[valid] + + template = fill_template_nan(template) + + return template + + +def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: + old_period = len(old_template) + + if old_period == new_period: + return old_template.astype(float) + + if old_period <= 1 or new_period <= 1: + return np.full(new_period, float(np.mean(old_template)), dtype=float) + + old_x = np.linspace(0.0, 1.0, old_period, endpoint=False) + new_x = np.linspace(0.0, 1.0, new_period, endpoint=False) + + old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0]) + old_y_ext = np.concatenate([old_template, old_template, old_template]) + + return np.interp(new_x, old_x_ext, old_y_ext).astype(float) + + +def merge_template( + old_template: np.ndarray, + new_template: np.ndarray, + alpha: float, +) -> np.ndarray: + alpha = float(np.clip(alpha, 0.0, 1.0)) + + if len(old_template) != len(new_template): + old_template = resample_template(old_template, len(new_template)) + + return ((1.0 - alpha) * old_template + alpha * new_template).astype(float) + + +def predict_by_state(state: BaselineState, ts_list: List[int]) -> np.ndarray: + template = np.array(state.template, dtype=float) + period = int(state.period) + + if period <= 1 or len(template) != period: + return np.zeros(len(ts_list), dtype=float) + + values = [] + + for ts in ts_list: + phase = int(ts) % period + values.append(float(template[phase])) + + return np.array(values, dtype=float) + + +def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray: + return np.maximum(abs_threshold, np.abs(pred) * rel_threshold) + + +def calc_bounds(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> Tuple[np.ndarray, np.ndarray]: + threshold = calc_threshold(pred, abs_threshold, rel_threshold) + lower = pred - threshold + upper = pred + threshold + return lower, upper + + +# ── 异常检测与状态更新 ──────────────────────────────────────────────────────── + +def detect_anomaly( + state: BaselineState, + ts_grid: np.ndarray, + ys_grid: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> Tuple[bool, float, float, float]: + if len(ys_grid) < DETECT_WINDOW_SECONDS: + return False, 0.0, 0.0, 0.0 + + ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() + actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float) + + pred = predict_by_state(state, ts_recent) + threshold = calc_threshold(pred, abs_threshold, rel_threshold) + + abs_err = np.abs(actual - pred) + outside = abs_err > threshold + + outside_ratio = float(np.mean(outside)) + mean_abs_err = float(np.mean(abs_err)) + mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1.0))) + + is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD + + return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err + + +def is_recovered( + state: BaselineState, + ts_grid: np.ndarray, + ys_grid: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> Tuple[bool, float]: + if len(ys_grid) < DETECT_WINDOW_SECONDS: + return False, 0.0 + + ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() + actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float) + + pred = predict_by_state(state, ts_recent) + threshold = calc_threshold(pred, abs_threshold, rel_threshold) + + abs_err = np.abs(actual - pred) + inside = abs_err <= threshold + + inside_ratio = float(np.mean(inside)) + + return inside_ratio >= RECOVERY_INSIDE_RATIO_THRESHOLD, inside_ratio + + +def create_initial_state( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + now_sec: int, +) -> Optional[BaselineState]: + if len(ys_grid) < MIN_POINTS: + return None + + period = estimate_period(ys_grid) + + template = build_phase_template( + ts_grid=ts_grid, + ys_grid=ys_grid, + period=period, + max_cycles=MAX_CYCLES_FOR_TEMPLATE, + tail_seconds=period * MAX_CYCLES_FOR_TEMPLATE, + ) + + if template is None: + return None + + return BaselineState( + period=int(period), + template=template.astype(float).tolist(), + status=BASELINE_STATUS_HEALTHY, + clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE), + last_update_ts=now_sec, + last_seen_ts=now_sec, + y_min=float(np.min(ys_grid)), + y_max=float(np.max(ys_grid)), + ) + + +def maybe_update_state( + key: str, + ts_grid: np.ndarray, + ys_grid: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> Tuple[Optional[BaselineState], bool, float, float, float]: + now_sec = int(time.time()) + + state = BASELINE_STATES.get(key) + + if state is None: + state = create_initial_state(ts_grid, ys_grid, now_sec) + + if state is None: + return None, False, 0.0, 0.0, 0.0 + + BASELINE_STATES[key] = state + logger.info( + "初始化健康模板 key=%s period=%ss clean_seconds=%ss", + key, + state.period, + state.clean_seconds, + ) + return state, False, 0.0, 0.0, 0.0 + + elapsed = max(1, now_sec - int(state.last_seen_ts)) + elapsed = min(elapsed, POLL_INTERVAL * 2) + state.last_seen_ts = now_sec + + is_anom, outside_ratio, mean_abs_err, mean_rel_err = detect_anomaly( + state=state, + ts_grid=ts_grid, + ys_grid=ys_grid, + abs_threshold=abs_threshold, + rel_threshold=rel_threshold, + ) + + if is_anom: + state.status = BASELINE_STATUS_ANOMALY + state.clean_seconds = 0 + + logger.warning( + "检测到异常,冻结模板 key=%s outside_ratio=%.2f mean_abs_err=%.2f mean_rel_err=%.2f", + key, + outside_ratio, + mean_abs_err, + mean_rel_err, + ) + + BASELINE_STATES[key] = state + return state, True, outside_ratio, mean_abs_err, mean_rel_err + + recovered, inside_ratio = is_recovered( + state=state, + ts_grid=ts_grid, + ys_grid=ys_grid, + abs_threshold=abs_threshold, + rel_threshold=rel_threshold, + ) + + if state.status == BASELINE_STATUS_ANOMALY: + if recovered: + state.status = BASELINE_STATUS_RECOVERING + state.clean_seconds = elapsed + logger.info( + "异常开始恢复 key=%s inside_ratio=%.2f clean_seconds=%ss", + key, + inside_ratio, + state.clean_seconds, + ) + else: + state.clean_seconds = 0 + BASELINE_STATES[key] = state + return state, True, outside_ratio, mean_abs_err, mean_rel_err + + elif state.status == BASELINE_STATUS_RECOVERING: + if recovered: + state.clean_seconds += elapsed + else: + state.status = BASELINE_STATUS_ANOMALY + state.clean_seconds = 0 + BASELINE_STATES[key] = state + return state, True, outside_ratio, mean_abs_err, mean_rel_err + + else: + state.status = BASELINE_STATUS_HEALTHY + state.clean_seconds += elapsed + + # 故障恢复后,不要立刻学习。 + # 必须至少连续正常:max(RECOVERY_MIN_SECONDS, 3 个周期) + min_clean_for_update = max( + RECOVERY_MIN_SECONDS, + int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE, + ) + + if state.clean_seconds < min_clean_for_update: + BASELINE_STATES[key] = state + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + # 只使用最近 clean_seconds 这段连续正常数据来更新模板,避免历史故障污染。 + new_period = estimate_period(ys_grid) + tail_seconds = min( + int(state.clean_seconds), + int(new_period) * MAX_CYCLES_FOR_TEMPLATE, + ) + + new_template = build_phase_template( + ts_grid=ts_grid, + ys_grid=ys_grid, + period=new_period, + max_cycles=MAX_CYCLES_FOR_TEMPLATE, + tail_seconds=tail_seconds, + ) + + if new_template is None: + BASELINE_STATES[key] = state + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + old_template = np.array(state.template, dtype=float) + + if state.status == BASELINE_STATUS_RECOVERING: + alpha = RECOVERY_EMA_ALPHA + state.status = BASELINE_STATUS_HEALTHY + else: + alpha = HEALTHY_EMA_ALPHA + + merged = merge_template( + old_template=old_template, + new_template=new_template, + alpha=alpha, + ) + + state.period = int(new_period) + state.template = merged.astype(float).tolist() + state.last_update_ts = now_sec + state.y_min = float(np.min(ys_grid[-tail_seconds:])) + state.y_max = float(np.max(ys_grid[-tail_seconds:])) + + BASELINE_STATES[key] = state + + logger.info( + "更新健康模板 key=%s period=%ss status=%s clean_seconds=%ss alpha=%.2f", + key, + state.period, + state.status, + state.clean_seconds, + alpha, + ) + + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + +# ── Prometheus 格式写入 ─────────────────────────────────────────────────────── + +def prom_escape_label_value(value: str) -> str: + return ( + str(value) + .replace("\\", "\\\\") + .replace("\n", "\\n") + .replace('"', '\\"') + ) + + +def labels_to_str(labels: Dict[str, str]) -> str: + if not labels: + return "" + + parts = [] + + for k in sorted(labels.keys()): + v = prom_escape_label_value(labels[k]) + parts.append(f'{k}="{v}"') + + return "{" + ",".join(parts) + "}" + + +def write_series( + metric_name: str, + labels: Dict[str, str], + ts_list: List[int], + values: List[float], +) -> bool: + if not ts_list or not values or len(ts_list) != len(values): + return False + + label_str = labels_to_str(labels) + lines = [] + + for t, y in zip(ts_list, values): + try: + ts_sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(ts_sec) or not math.isfinite(val): + continue + + ts_ms = ts_sec * 1000 + lines.append(f"{metric_name}{label_str} {val:.6f} {ts_ms}") + + if not lines: + return False + + payload = "\n".join(lines) + "\n" + + try: + resp = requests.post( + f"{VM_URL}/api/v1/import/prometheus", + data=payload.encode("utf-8"), + headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, + timeout=10, + ) + resp.raise_for_status() + return True + except requests.RequestException as e: + logger.error("写入数据失败 metric=%s: %s", metric_name, e) + return False + + +def write_prediction_bundle( + pred_metric: str, + anomaly_metric: str, + labels: Dict[str, str], + ts_future: List[int], + pred_values: np.ndarray, + lower_values: np.ndarray, + upper_values: np.ndarray, + is_anomaly: bool, + outside_ratio: float, + mean_abs_err: float, + mean_rel_err: float, +) -> bool: + ok1 = write_series( + metric_name=pred_metric, + labels=labels, + ts_list=ts_future, + values=pred_values.astype(float).tolist(), + ) + + ok2 = write_series( + metric_name=f"{pred_metric}_lower", + labels=labels, + ts_list=ts_future, + values=lower_values.astype(float).tolist(), + ) + + ok3 = write_series( + metric_name=f"{pred_metric}_upper", + labels=labels, + ts_list=ts_future, + values=upper_values.astype(float).tolist(), + ) + + now_sec = int(time.time()) + + anomaly_labels = dict(labels) + anomaly_labels["type"] = "prediction_deviation" + + ok4 = write_series( + metric_name=anomaly_metric, + labels=anomaly_labels, + ts_list=[now_sec], + values=[1.0 if is_anomaly else 0.0], + ) + + ok5 = write_series( + metric_name=f"{anomaly_metric}_outside_ratio", + labels=anomaly_labels, + ts_list=[now_sec], + values=[outside_ratio], + ) + + ok6 = write_series( + metric_name=f"{anomaly_metric}_mean_abs_error", + labels=anomaly_labels, + ts_list=[now_sec], + values=[mean_abs_err], + ) + + ok7 = write_series( + metric_name=f"{anomaly_metric}_mean_rel_error", + labels=anomaly_labels, + ts_list=[now_sec], + values=[mean_rel_err], + ) + + return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7 + + +# ── 标签解析 ────────────────────────────────────────────────────────────────── + +_LABEL_PATTERN = re.compile( + r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*' +) + + +def _parse_labels(query: str) -> Dict[str, str]: + labels = {} + + if "{" not in query or "}" not in query: + return labels + + try: + label_part = query[query.index("{") + 1: query.rindex("}")] + except Exception: + return labels + + for match in _LABEL_PATTERN.finditer(label_part): + key = match.group(1) + value = match.group(2) + value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\") + labels[key] = value + + return labels + + +def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]: + result = {} + + for d in dicts: + if d: + result.update(d) + + return result + + +def series_key(metric_name: str, labels: Dict[str, str]) -> str: + return metric_name + labels_to_str(labels) + + +# ── 状态持久化 ──────────────────────────────────────────────────────────────── + +def load_state(): + global BASELINE_STATES + + if not os.path.exists(STATE_FILE): + return + + try: + with open(STATE_FILE, "r", encoding="utf-8") as f: + raw = json.load(f) + + states = {} + + for key, value in raw.get("baseline_states", {}).items(): + states[key] = BaselineState(**value) + + BASELINE_STATES = states + + logger.info("已加载预测状态文件 %s,状态数量=%d", STATE_FILE, len(BASELINE_STATES)) + + except Exception as e: + logger.warning("加载预测状态文件失败,将重新学习: %s", e) + + +def save_state(): + try: + raw = { + "baseline_states": { + key: asdict(value) + for key, value in BASELINE_STATES.items() + } + } + + tmp_file = STATE_FILE + ".tmp" + + with open(tmp_file, "w", encoding="utf-8") as f: + json.dump(raw, f, ensure_ascii=False, indent=2) + + os.replace(tmp_file, STATE_FILE) + + except Exception as e: + logger.warning("保存预测状态文件失败: %s", e) + + +# ── 主逻辑 ──────────────────────────────────────────────────────────────────── + +def run_once(): + now_str = datetime.now().strftime("%H:%M:%S") + + for target in PREDICT_TARGETS: + query = target["query"] + pred_metric = target["pred_metric"] + anomaly_metric = target["anomaly_metric"] + abs_threshold = float(target["abs_threshold"]) + rel_threshold = float(target["rel_threshold"]) + + ts, ys = fetch_history(query) + + if len(ys) < MIN_POINTS: + logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) + continue + + ts_grid, ys_grid = normalize_history(ts, ys) + + if len(ys_grid) < MIN_POINTS: + logger.info("[%s] %s 清洗后数据不足(%d 点),跳过", now_str, query, len(ys_grid)) + continue + + base_labels = _parse_labels(query) + write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) + + key = series_key(pred_metric, write_labels) + + state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state( + key=key, + ts_grid=ts_grid, + ys_grid=ys_grid, + abs_threshold=abs_threshold, + rel_threshold=rel_threshold, + ) + + if state is None: + logger.info("[%s] %s 暂无可用健康模板,等待学习", now_str, query) + continue + + now_sec = int(time.time()) + last_until = LAST_WRITTEN_UNTIL.get(key, 0) + last_real_ts = int(ts_grid[-1]) + + base_ts = max(now_sec, last_until, last_real_ts) + + ts_future = [ + base_ts + i + 1 + for i in range(WRITE_HORIZON_SECONDS) + ] + + pred_values = predict_by_state(state, ts_future) + + lower_values, upper_values = calc_bounds( + pred=pred_values, + abs_threshold=abs_threshold, + rel_threshold=rel_threshold, + ) + + ok = write_prediction_bundle( + pred_metric=pred_metric, + anomaly_metric=anomaly_metric, + labels=write_labels, + ts_future=ts_future, + pred_values=pred_values, + lower_values=lower_values, + upper_values=upper_values, + is_anomaly=is_anomaly, + outside_ratio=outside_ratio, + mean_abs_err=mean_abs_err, + mean_rel_err=mean_rel_err, + ) + + if not ok: + continue + + LAST_WRITTEN_UNTIL[key] = int(max(ts_future)) + + future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S") + future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") + + logger.info( + "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss clean=%ss 写入 %d 点,预测区间 %s ~ %s", + now_str, + query, + pred_metric, + state.status, + is_anomaly, + state.period, + state.clean_seconds, + len(ts_future), + future_start, + future_end, + ) + + save_state() + + +def main(): + load_state() + + logger.info( + "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s", + VM_URL, + HISTORY_MINUTES, + HORIZON_SECONDS, + WRITE_HORIZON_SECONDS, + POLL_INTERVAL, + STATE_FILE, + ) + + while True: + run_once() + time.sleep(POLL_INTERVAL) + + +if __name__ == "__main__": + main() \ No newline at end of file From 54c4b851a004567078cfec337933aafcbd676b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Thu, 21 May 2026 13:39:33 +0800 Subject: [PATCH 23/31] fix --- ai/predict_v3_single_scene.py | 701 +++++++++++++++++++++++----------- 1 file changed, 488 insertions(+), 213 deletions(-) mode change 100644 => 100755 ai/predict_v3_single_scene.py diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py old mode 100644 new mode 100755 index 23af8c5..fc07f4f --- a/ai/predict_v3_single_scene.py +++ b/ai/predict_v3_single_scene.py @@ -1,18 +1,22 @@ # -*- coding: utf-8 -*- """ -ProtoForge 预测服务 v6 - -核心能力: -1. 周期模板预测:适合 CNC 这类强周期、非标准正弦波形。 -2. 健康基线冻结:检测到异常后,不再用故障数据更新预测模板。 -3. 恢复冷却机制:故障恢复后,需要连续稳定多个周期,才恢复学习。 -4. 预测上下界:写入 predicted_upper / predicted_lower,方便 Grafana 展示预测带。 -5. 异常标记:写入 xxx_anomaly,1 表示异常,0 表示正常。 -6. 不删除历史预测,不使用 delete_series。 -""" - -""" -场景:不考虑物料、不考虑跨程序场景算法预测 +ProtoForge Predictor v8 + +功能: +1. 从 VictoriaMetrics 拉取历史数据。 +2. 对 CNC 周期型指标进行相位对齐预测。 +3. 使用“谷底锚点”对齐周期,减少上升沿/下降沿相位偏差。 +4. 每轮只写入未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒,避免预测窗口重叠。 +5. 检测异常后冻结健康模板,不把故障数据学进去。 +6. 故障恢复后等待稳定一段时间,再恢复模板更新。 +7. 写入: + - xxx_predicted + - xxx_predicted_upper + - xxx_predicted_lower + - xxx_anomaly + - xxx_anomaly_outside_ratio + - xxx_anomaly_mean_abs_error + - xxx_anomaly_mean_rel_error """ import json @@ -29,6 +33,10 @@ import requests +# ============================================================================= +# 日志配置 +# ============================================================================= + logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", @@ -37,18 +45,19 @@ logger = logging.getLogger(__name__) -# ── 基础配置 ────────────────────────────────────────────────────────────────── +# ============================================================================= +# 基础配置 +# ============================================================================= VM_URL = "http://localhost:8428" -STATE_FILE = "/tmp/protoforge_predictor_state.json" +STATE_FILE = "/tmp/protoforge_predictor_state_v8.json" HISTORY_MINUTES = 30 HORIZON_SECONDS = 120 POLL_INTERVAL = 30 -# 实际每轮写入未来多少秒。 -# 不要大于 POLL_INTERVAL,否则多轮预测会重叠。 +# 实际写入窗口不要大于轮询间隔,否则多轮预测会重叠。 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) QUERY_STEP = "1s" @@ -57,34 +66,25 @@ MIN_PERIOD_SECONDS = 5 MAX_PERIOD_SECONDS = 3600 -# 至少多少个完整周期才允许构建健康模板 MIN_FULL_CYCLES_FOR_TEMPLATE = 3 - -# 构建模板最多使用最近多少个周期 MAX_CYCLES_FOR_TEMPLATE = 6 -# 检测异常使用最近多少秒实际数据 DETECT_WINDOW_SECONDS = 15 - -# 恢复后,至少连续正常多少秒才考虑恢复学习 RECOVERY_MIN_SECONDS = 60 -# 健康状态下模板更新速度,越小越保守 -HEALTHY_EMA_ALPHA = 0.15 - -# 故障恢复后第一次重新学习时的更新速度 -RECOVERY_EMA_ALPHA = 0.35 +HEALTHY_EMA_ALPHA = 0.12 +RECOVERY_EMA_ALPHA = 0.30 -# 最近窗口里有多少比例的点超过阈值,才认为异常 OUTSIDE_RATIO_THRESHOLD = 0.60 - -# 最近窗口里有多少比例的点回到阈值内,才认为恢复正常 RECOVERY_INSIDE_RATIO_THRESHOLD = 0.80 +PHASE_SEARCH_RATIO = 0.15 +VALLEY_QUANTILE = 45 + -# ── 指标配置 ────────────────────────────────────────────────────────────────── -# abs_threshold / rel_threshold 需要按指标单位调。 -# feed_rate 单位 mm/min,这里先给 400 和 25%。 +# ============================================================================= +# 预测指标配置 +# ============================================================================= PREDICT_TARGETS = [ { @@ -132,21 +132,23 @@ ] EXTRA_PREDICT_LABELS = { - "forecast": "health_gated_v1", + "forecast": "phase_aligned_health_v8", "source": "protoforge", } BASELINE_STATUS_HEALTHY = "healthy" BASELINE_STATUS_ANOMALY = "anomaly" BASELINE_STATUS_RECOVERING = "recovering" -BASELINE_STATUS_LEARNING = "learning" -# ── 状态结构 ────────────────────────────────────────────────────────────────── +# ============================================================================= +# 状态结构 +# ============================================================================= @dataclass class BaselineState: period: int + phase_origin_ts: int template: List[float] status: str clean_seconds: int @@ -160,7 +162,9 @@ class BaselineState: LAST_WRITTEN_UNTIL: Dict[str, int] = {} -# ── VM 读取 ─────────────────────────────────────────────────────────────────── +# ============================================================================= +# VictoriaMetrics 读取 +# ============================================================================= def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]: now = datetime.now() @@ -258,7 +262,25 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np. return ts_grid, ys_grid -# ── 周期估计 ────────────────────────────────────────────────────────────────── +# ============================================================================= +# 周期估计 +# ============================================================================= + +def moving_average(arr: np.ndarray, window: int) -> np.ndarray: + if window <= 1 or len(arr) < window: + return arr.astype(float) + + window = int(window) + + if window % 2 == 0: + window += 1 + + kernel = np.ones(window, dtype=float) / window + pad = window // 2 + padded = np.pad(arr.astype(float), (pad, pad), mode="edge") + + return np.convolve(padded, kernel, mode="valid") + def estimate_period_by_fft(ys_arr: np.ndarray) -> float: n = len(ys_arr) @@ -307,7 +329,7 @@ def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float: corr = np.correlate(centered, centered, mode="full")[n - 1:] p0 = int(round(init_period)) - left = max(MIN_PERIOD_SECONDS, int(max(2, p0 * 0.7))) + left = max(int(MIN_PERIOD_SECONDS), int(max(2, p0 * 0.7))) right = min(n // 2, int(max(left + 1, p0 * 1.3))) if right <= left: @@ -323,96 +345,252 @@ def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float: return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) -def estimate_period(ys_arr: np.ndarray) -> int: +def estimate_period_rough(ys_arr: np.ndarray) -> int: p_fft = estimate_period_by_fft(ys_arr) p_refined = refine_period_by_autocorr(ys_arr, p_fft) period = int(round(p_refined)) - period = max(MIN_PERIOD_SECONDS, min(MAX_PERIOD_SECONDS, period)) + period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period)) return int(period) -# ── 模板构建与预测 ───────────────────────────────────────────────────────────── +# ============================================================================= +# 谷底锚点检测 +# ============================================================================= -def fill_template_nan(template: np.ndarray) -> np.ndarray: - period = len(template) +def find_valley_indices( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + expected_period: int, +) -> List[int]: + n = len(ys_grid) - if period == 0: - return template + if n < max(10, expected_period * 2): + return [] + + period = max(3, int(expected_period)) + + smooth_window = max(3, int(round(period * 0.08))) + smooth_window = min(smooth_window, 21) + + ys_smooth = moving_average(ys_grid, smooth_window) + threshold = float(np.percentile(ys_smooth, VALLEY_QUANTILE)) + + candidates = [] + + for i in range(1, n - 1): + if ( + ys_smooth[i] <= ys_smooth[i - 1] + and ys_smooth[i] < ys_smooth[i + 1] + and ys_smooth[i] <= threshold + ): + candidates.append(i) + + if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE: + candidates = [] + + for i in range(1, n - 1): + if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]: + candidates.append(i) + + if not candidates: + return [] - idx = np.arange(period) - valid = np.isfinite(template) + min_distance = max(2, int(round(period * 0.55))) + selected = [] - if not np.any(valid): - return np.zeros(period, dtype=float) + for idx in candidates: + if not selected: + selected.append(idx) + continue + + if idx - selected[-1] >= min_distance: + selected.append(idx) + continue + + if ys_smooth[idx] < ys_smooth[selected[-1]]: + selected[-1] = idx - if np.all(valid): - return template + if len(selected) < 2: + return selected - x_valid = idx[valid] - y_valid = template[valid] + cleaned = [selected[0]] + + for idx in selected[1:]: + diff = int(ts_grid[idx] - ts_grid[cleaned[-1]]) + + if int(period * 0.55) <= diff <= int(period * 1.60): + cleaned.append(idx) + continue - # 环形插值,处理 phase 0 附近缺口 - x_ext = np.concatenate([x_valid - period, x_valid, x_valid + period]) - y_ext = np.concatenate([y_valid, y_valid, y_valid]) + if diff < int(period * 0.55): + if ys_smooth[idx] < ys_smooth[cleaned[-1]]: + cleaned[-1] = idx + continue - filled = np.interp(idx, x_ext, y_ext) + cleaned.append(idx) - return filled.astype(float) + return cleaned -def build_phase_template( +def detect_period_and_valleys( + ts_grid: np.ndarray, + ys_grid: np.ndarray, +) -> Tuple[int, List[int]]: + rough = estimate_period_rough(ys_grid) + valleys = find_valley_indices(ts_grid, ys_grid, rough) + + if len(valleys) >= 3: + diffs = np.diff(ts_grid[valleys]) + good = diffs[(diffs >= rough * 0.55) & (diffs <= rough * 1.60)] + + if len(good) > 0: + period = int(round(float(np.median(good)))) + else: + period = rough + else: + period = rough + + period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period)) + + return int(period), valleys + + +# ============================================================================= +# 相位对齐模板构建 +# ============================================================================= + +def build_template_from_valleys( ts_grid: np.ndarray, ys_grid: np.ndarray, period: int, + valleys: List[int], max_cycles: int = MAX_CYCLES_FOR_TEMPLATE, - tail_seconds: Optional[int] = None, ) -> Optional[np.ndarray]: - if period <= 1 or len(ys_grid) < period * MIN_FULL_CYCLES_FOR_TEMPLATE: + if period <= 1 or len(valleys) < MIN_FULL_CYCLES_FOR_TEMPLATE + 1: return None - max_seconds = period * max_cycles + pairs = [] + + for a, b in zip(valleys[:-1], valleys[1:]): + cycle_len = float(ts_grid[b] - ts_grid[a]) - if tail_seconds is not None: - max_seconds = min(max_seconds, int(tail_seconds)) + if period * 0.55 <= cycle_len <= period * 1.60: + pairs.append((a, b, cycle_len)) - max_seconds = max(period * MIN_FULL_CYCLES_FOR_TEMPLATE, max_seconds) + if len(pairs) < MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + pairs = pairs[-max_cycles:] + + phase_grid = np.arange(period, dtype=float) + segments = [] + weights = [] + + for idx, (a, b, cycle_len) in enumerate(pairs): + seg_ts = ts_grid[a:b + 1] + seg_y = ys_grid[a:b + 1] + + if len(seg_y) < 3: + continue - if len(ys_grid) < max_seconds: - start_idx = 0 + x_old = (seg_ts - seg_ts[0]) / cycle_len * period + seg = np.interp(phase_grid, x_old, seg_y) + + segments.append(seg.astype(float)) + + weight = 0.5 + 0.5 * ((idx + 1) / len(pairs)) + weights.append(weight) + + if len(segments) < MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + arr = np.vstack(segments) + w_arr = np.array(weights, dtype=float) + + template = np.average(arr, axis=0, weights=w_arr) + + return template.astype(float) + + +def build_current_baseline( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + tail_seconds: Optional[int] = None, +) -> Optional[Tuple[int, int, np.ndarray]]: + if len(ys_grid) < MIN_POINTS: + return None + + if tail_seconds is not None and tail_seconds > 0: + cutoff = ts_grid[-1] - int(tail_seconds) + mask = ts_grid >= cutoff + ts_use = ts_grid[mask] + ys_use = ys_grid[mask] else: - start_idx = len(ys_grid) - max_seconds + ts_use = ts_grid + ys_use = ys_grid + + if len(ys_use) < MIN_POINTS: + return None - ts_tail = ts_grid[start_idx:] - ys_tail = ys_grid[start_idx:] + period, valleys = detect_period_and_valleys(ts_use, ys_use) - if len(ys_tail) < period * MIN_FULL_CYCLES_FOR_TEMPLATE: + template = build_template_from_valleys( + ts_grid=ts_use, + ys_grid=ys_use, + period=period, + valleys=valleys, + ) + + if template is None or len(valleys) == 0: return None - sums = np.zeros(period, dtype=float) - weights = np.zeros(period, dtype=float) + phase_origin_ts = int(round(float(ts_use[valleys[-1]]))) - total = len(ys_tail) + return int(period), phase_origin_ts, template - for i, (t, y) in enumerate(zip(ts_tail, ys_tail)): - phase = int(t) % period - # 越近的数据权重越高 - recency = (i + 1) / total - weight = 0.3 + 0.7 * recency +# ============================================================================= +# 模板预测 +# ============================================================================= - sums[phase] += float(y) * weight - weights[phase] += weight +def circular_template_value(template: np.ndarray, phase: float) -> float: + period = len(template) - template = np.full(period, np.nan, dtype=float) + if period == 0: + return 0.0 + + phase = float(phase) % period - valid = weights > 0 - template[valid] = sums[valid] / weights[valid] + i0 = int(math.floor(phase)) % period + i1 = (i0 + 1) % period - template = fill_template_nan(template) + frac = phase - math.floor(phase) - return template + return float((1.0 - frac) * template[i0] + frac * template[i1]) + + +def predict_with_origin( + state: BaselineState, + ts_list: List[int], + phase_origin_ts: Optional[int] = None, +) -> np.ndarray: + template = np.array(state.template, dtype=float) + period = int(state.period) + + if period <= 1 or len(template) != period: + return np.zeros(len(ts_list), dtype=float) + + origin = int(state.phase_origin_ts if phase_origin_ts is None else phase_origin_ts) + + values = [] + + for ts in ts_list: + phase = (int(ts) - origin) % period + values.append(circular_template_value(template, phase)) + + return np.array(values, dtype=float) def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: @@ -433,6 +611,38 @@ def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: return np.interp(new_x, old_x_ext, old_y_ext).astype(float) +def align_new_template_to_old( + old_template: np.ndarray, + new_template: np.ndarray, +) -> np.ndarray: + if len(old_template) != len(new_template): + old_template = resample_template(old_template, len(new_template)) + + period = len(new_template) + + if period <= 2: + return new_template.astype(float) + + max_shift = max(1, int(round(period * 0.10))) + + old_norm = old_template - np.mean(old_template) + + best_score = None + best_template = new_template + + for shift in range(-max_shift, max_shift + 1): + shifted = np.roll(new_template, shift) + shifted_norm = shifted - np.mean(shifted) + + score = float(np.dot(old_norm, shifted_norm)) + + if best_score is None or score > best_score: + best_score = score + best_template = shifted + + return best_template.astype(float) + + def merge_template( old_template: np.ndarray, new_template: np.ndarray, @@ -443,37 +653,64 @@ def merge_template( if len(old_template) != len(new_template): old_template = resample_template(old_template, len(new_template)) - return ((1.0 - alpha) * old_template + alpha * new_template).astype(float) - - -def predict_by_state(state: BaselineState, ts_list: List[int]) -> np.ndarray: - template = np.array(state.template, dtype=float) - period = int(state.period) - - if period <= 1 or len(template) != period: - return np.zeros(len(ts_list), dtype=float) + new_template = align_new_template_to_old(old_template, new_template) - values = [] + merged = (1.0 - alpha) * old_template + alpha * new_template - for ts in ts_list: - phase = int(ts) % period - values.append(float(template[phase])) + return merged.astype(float) - return np.array(values, dtype=float) +# ============================================================================= +# 异常检测 +# ============================================================================= -def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray: +def calc_threshold( + pred: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> np.ndarray: return np.maximum(abs_threshold, np.abs(pred) * rel_threshold) -def calc_bounds(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> Tuple[np.ndarray, np.ndarray]: +def calc_bounds( + pred: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> Tuple[np.ndarray, np.ndarray]: threshold = calc_threshold(pred, abs_threshold, rel_threshold) + lower = pred - threshold upper = pred + threshold + return lower, upper -# ── 异常检测与状态更新 ──────────────────────────────────────────────────────── +def find_best_phase_origin_for_recent( + state: BaselineState, + ts_recent: List[int], + actual: np.ndarray, +) -> Tuple[int, np.ndarray, float]: + period = int(state.period) + base_origin = int(state.phase_origin_ts) + + max_shift = max(1, int(round(period * PHASE_SEARCH_RATIO))) + + best_origin = base_origin + best_pred = predict_with_origin(state, ts_recent, base_origin) + best_mae = float(np.mean(np.abs(actual - best_pred))) + + for shift in range(-max_shift, max_shift + 1): + origin = base_origin + shift + pred = predict_with_origin(state, ts_recent, origin) + mae = float(np.mean(np.abs(actual - pred))) + + if mae < best_mae: + best_mae = mae + best_origin = origin + best_pred = pred + + return best_origin, best_pred, best_mae + def detect_anomaly( state: BaselineState, @@ -481,14 +718,19 @@ def detect_anomaly( ys_grid: np.ndarray, abs_threshold: float, rel_threshold: float, -) -> Tuple[bool, float, float, float]: +) -> Tuple[bool, float, float, float, int]: if len(ys_grid) < DETECT_WINDOW_SECONDS: - return False, 0.0, 0.0, 0.0 + return False, 0.0, 0.0, 0.0, int(state.phase_origin_ts) ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float) - pred = predict_by_state(state, ts_recent) + best_origin, pred, _ = find_best_phase_origin_for_recent( + state=state, + ts_recent=ts_recent, + actual=actual, + ) + threshold = calc_threshold(pred, abs_threshold, rel_threshold) abs_err = np.abs(actual - pred) @@ -500,56 +742,28 @@ def detect_anomaly( is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD - return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err - - -def is_recovered( - state: BaselineState, - ts_grid: np.ndarray, - ys_grid: np.ndarray, - abs_threshold: float, - rel_threshold: float, -) -> Tuple[bool, float]: - if len(ys_grid) < DETECT_WINDOW_SECONDS: - return False, 0.0 - - ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() - actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float) - - pred = predict_by_state(state, ts_recent) - threshold = calc_threshold(pred, abs_threshold, rel_threshold) - - abs_err = np.abs(actual - pred) - inside = abs_err <= threshold - - inside_ratio = float(np.mean(inside)) + return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err, int(best_origin) - return inside_ratio >= RECOVERY_INSIDE_RATIO_THRESHOLD, inside_ratio +# ============================================================================= +# 健康基线状态管理 +# ============================================================================= def create_initial_state( ts_grid: np.ndarray, ys_grid: np.ndarray, now_sec: int, ) -> Optional[BaselineState]: - if len(ys_grid) < MIN_POINTS: - return None - - period = estimate_period(ys_grid) - - template = build_phase_template( - ts_grid=ts_grid, - ys_grid=ys_grid, - period=period, - max_cycles=MAX_CYCLES_FOR_TEMPLATE, - tail_seconds=period * MAX_CYCLES_FOR_TEMPLATE, - ) + baseline = build_current_baseline(ts_grid, ys_grid) - if template is None: + if baseline is None: return None + period, phase_origin_ts, template = baseline + return BaselineState( period=int(period), + phase_origin_ts=int(phase_origin_ts), template=template.astype(float).tolist(), status=BASELINE_STATUS_HEALTHY, clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE), @@ -578,19 +792,23 @@ def maybe_update_state( return None, False, 0.0, 0.0, 0.0 BASELINE_STATES[key] = state + logger.info( - "初始化健康模板 key=%s period=%ss clean_seconds=%ss", + "初始化健康模板 key=%s period=%ss origin=%s clean=%ss", key, state.period, + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), state.clean_seconds, ) + return state, False, 0.0, 0.0, 0.0 elapsed = max(1, now_sec - int(state.last_seen_ts)) elapsed = min(elapsed, POLL_INTERVAL * 2) + state.last_seen_ts = now_sec - is_anom, outside_ratio, mean_abs_err, mean_rel_err = detect_anomaly( + is_anom, outside_ratio, mean_abs_err, mean_rel_err, best_origin = detect_anomaly( state=state, ts_grid=ts_grid, ys_grid=ys_grid, @@ -602,6 +820,8 @@ def maybe_update_state( state.status = BASELINE_STATUS_ANOMALY state.clean_seconds = 0 + BASELINE_STATES[key] = state + logger.warning( "检测到异常,冻结模板 key=%s outside_ratio=%.2f mean_abs_err=%.2f mean_rel_err=%.2f", key, @@ -610,47 +830,39 @@ def maybe_update_state( mean_rel_err, ) - BASELINE_STATES[key] = state return state, True, outside_ratio, mean_abs_err, mean_rel_err - recovered, inside_ratio = is_recovered( - state=state, - ts_grid=ts_grid, - ys_grid=ys_grid, - abs_threshold=abs_threshold, - rel_threshold=rel_threshold, - ) + old_origin = int(state.phase_origin_ts) + state.phase_origin_ts = int(best_origin) + + if abs(state.phase_origin_ts - old_origin) >= 1: + logger.debug( + "相位校正 key=%s origin %s -> %s", + key, + datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"), + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), + ) if state.status == BASELINE_STATUS_ANOMALY: - if recovered: - state.status = BASELINE_STATUS_RECOVERING - state.clean_seconds = elapsed - logger.info( - "异常开始恢复 key=%s inside_ratio=%.2f clean_seconds=%ss", - key, - inside_ratio, - state.clean_seconds, - ) - else: - state.clean_seconds = 0 - BASELINE_STATES[key] = state - return state, True, outside_ratio, mean_abs_err, mean_rel_err + state.status = BASELINE_STATUS_RECOVERING + state.clean_seconds = elapsed - elif state.status == BASELINE_STATUS_RECOVERING: - if recovered: - state.clean_seconds += elapsed - else: - state.status = BASELINE_STATUS_ANOMALY - state.clean_seconds = 0 - BASELINE_STATES[key] = state - return state, True, outside_ratio, mean_abs_err, mean_rel_err + BASELINE_STATES[key] = state + logger.info( + "异常开始恢复 key=%s clean_seconds=%ss", + key, + state.clean_seconds, + ) + + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + if state.status == BASELINE_STATUS_RECOVERING: + state.clean_seconds += elapsed else: state.status = BASELINE_STATUS_HEALTHY state.clean_seconds += elapsed - # 故障恢复后,不要立刻学习。 - # 必须至少连续正常:max(RECOVERY_MIN_SECONDS, 3 个周期) min_clean_for_update = max( RECOVERY_MIN_SECONDS, int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE, @@ -660,30 +872,26 @@ def maybe_update_state( BASELINE_STATES[key] = state return state, False, outside_ratio, mean_abs_err, mean_rel_err - # 只使用最近 clean_seconds 这段连续正常数据来更新模板,避免历史故障污染。 - new_period = estimate_period(ys_grid) tail_seconds = min( int(state.clean_seconds), - int(new_period) * MAX_CYCLES_FOR_TEMPLATE, + int(state.period) * MAX_CYCLES_FOR_TEMPLATE, ) - new_template = build_phase_template( + baseline = build_current_baseline( ts_grid=ts_grid, ys_grid=ys_grid, - period=new_period, - max_cycles=MAX_CYCLES_FOR_TEMPLATE, tail_seconds=tail_seconds, ) - if new_template is None: + if baseline is None: BASELINE_STATES[key] = state return state, False, outside_ratio, mean_abs_err, mean_rel_err + new_period, new_origin, new_template = baseline old_template = np.array(state.template, dtype=float) if state.status == BASELINE_STATUS_RECOVERING: alpha = RECOVERY_EMA_ALPHA - state.status = BASELINE_STATUS_HEALTHY else: alpha = HEALTHY_EMA_ALPHA @@ -694,18 +902,25 @@ def maybe_update_state( ) state.period = int(new_period) + state.phase_origin_ts = int(new_origin) state.template = merged.astype(float).tolist() + state.status = BASELINE_STATUS_HEALTHY state.last_update_ts = now_sec - state.y_min = float(np.min(ys_grid[-tail_seconds:])) - state.y_max = float(np.max(ys_grid[-tail_seconds:])) + + if tail_seconds > 0 and len(ys_grid) >= tail_seconds: + state.y_min = float(np.min(ys_grid[-tail_seconds:])) + state.y_max = float(np.max(ys_grid[-tail_seconds:])) + else: + state.y_min = float(np.min(ys_grid)) + state.y_max = float(np.max(ys_grid)) BASELINE_STATES[key] = state logger.info( - "更新健康模板 key=%s period=%ss status=%s clean_seconds=%ss alpha=%.2f", + "更新健康模板 key=%s period=%ss origin=%s clean=%ss alpha=%.2f", key, state.period, - state.status, + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), state.clean_seconds, alpha, ) @@ -713,7 +928,9 @@ def maybe_update_state( return state, False, outside_ratio, mean_abs_err, mean_rel_err -# ── Prometheus 格式写入 ─────────────────────────────────────────────────────── +# ============================================================================= +# Prometheus Exposition 写入 +# ============================================================================= def prom_escape_label_value(value: str) -> str: return ( @@ -731,8 +948,7 @@ def labels_to_str(labels: Dict[str, str]) -> str: parts = [] for k in sorted(labels.keys()): - v = prom_escape_label_value(labels[k]) - parts.append(f'{k}="{v}"') + parts.append(f'{k}="{prom_escape_label_value(labels[k])}"') return "{" + ",".join(parts) + "}" @@ -771,11 +987,14 @@ def write_series( resp = requests.post( f"{VM_URL}/api/v1/import/prometheus", data=payload.encode("utf-8"), - headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, + headers={ + "Content-Type": "text/plain; version=0.0.4; charset=utf-8", + }, timeout=10, ) resp.raise_for_status() return True + except requests.RequestException as e: logger.error("写入数据失败 metric=%s: %s", metric_name, e) return False @@ -851,28 +1070,37 @@ def write_prediction_bundle( return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7 -# ── 标签解析 ────────────────────────────────────────────────────────────────── +# ============================================================================= +# 标签解析 +# ============================================================================= _LABEL_PATTERN = re.compile( r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*' ) -def _parse_labels(query: str) -> Dict[str, str]: +def parse_labels_from_query(query: str) -> Dict[str, str]: labels = {} if "{" not in query or "}" not in query: return labels try: - label_part = query[query.index("{") + 1: query.rindex("}")] + label_part = query[query.index("{") + 1:query.rindex("}")] except Exception: return labels for match in _LABEL_PATTERN.finditer(label_part): key = match.group(1) value = match.group(2) - value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\") + + value = ( + value + .replace('\\"', '"') + .replace("\\n", "\n") + .replace("\\\\", "\\") + ) + labels[key] = value return labels @@ -892,9 +1120,11 @@ def series_key(metric_name: str, labels: Dict[str, str]) -> str: return metric_name + labels_to_str(labels) -# ── 状态持久化 ──────────────────────────────────────────────────────────────── +# ============================================================================= +# 状态持久化 +# ============================================================================= -def load_state(): +def load_state() -> None: global BASELINE_STATES if not os.path.exists(STATE_FILE): @@ -907,17 +1137,36 @@ def load_state(): states = {} for key, value in raw.get("baseline_states", {}).items(): + required_fields = { + "period", + "phase_origin_ts", + "template", + "status", + "clean_seconds", + "last_update_ts", + "last_seen_ts", + "y_min", + "y_max", + } + + if not required_fields.issubset(set(value.keys())): + continue + states[key] = BaselineState(**value) BASELINE_STATES = states - logger.info("已加载预测状态文件 %s,状态数量=%d", STATE_FILE, len(BASELINE_STATES)) + logger.info( + "已加载预测状态文件 %s,状态数量=%d", + STATE_FILE, + len(BASELINE_STATES), + ) except Exception as e: logger.warning("加载预测状态文件失败,将重新学习: %s", e) -def save_state(): +def save_state() -> None: try: raw = { "baseline_states": { @@ -937,9 +1186,11 @@ def save_state(): logger.warning("保存预测状态文件失败: %s", e) -# ── 主逻辑 ──────────────────────────────────────────────────────────────────── +# ============================================================================= +# 主流程 +# ============================================================================= -def run_once(): +def run_once() -> None: now_str = datetime.now().strftime("%H:%M:%S") for target in PREDICT_TARGETS: @@ -952,16 +1203,26 @@ def run_once(): ts, ys = fetch_history(query) if len(ys) < MIN_POINTS: - logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) + logger.info( + "[%s] %s 数据不足(%d 点),跳过", + now_str, + query, + len(ys), + ) continue ts_grid, ys_grid = normalize_history(ts, ys) if len(ys_grid) < MIN_POINTS: - logger.info("[%s] %s 清洗后数据不足(%d 点),跳过", now_str, query, len(ys_grid)) + logger.info( + "[%s] %s 清洗后数据不足(%d 点),跳过", + now_str, + query, + len(ys_grid), + ) continue - base_labels = _parse_labels(query) + base_labels = parse_labels_from_query(query) write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) key = series_key(pred_metric, write_labels) @@ -975,7 +1236,11 @@ def run_once(): ) if state is None: - logger.info("[%s] %s 暂无可用健康模板,等待学习", now_str, query) + logger.info( + "[%s] %s 暂无可用健康模板,等待学习", + now_str, + query, + ) continue now_sec = int(time.time()) @@ -989,7 +1254,7 @@ def run_once(): for i in range(WRITE_HORIZON_SECONDS) ] - pred_values = predict_by_state(state, ts_future) + pred_values = predict_with_origin(state, ts_future) lower_values, upper_values = calc_bounds( pred=pred_values, @@ -1012,21 +1277,28 @@ def run_once(): ) if not ok: + logger.error( + "[%s] %s 写入预测数据失败", + now_str, + query, + ) continue LAST_WRITTEN_UNTIL[key] = int(max(ts_future)) future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S") future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") + origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S") logger.info( - "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss clean=%ss 写入 %d 点,预测区间 %s ~ %s", + "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss origin=%s clean=%ss 写入 %d 点,预测区间 %s ~ %s", now_str, query, pred_metric, state.status, is_anomaly, state.period, + origin_str, state.clean_seconds, len(ts_future), future_start, @@ -1036,17 +1308,18 @@ def run_once(): save_state() -def main(): +def main() -> None: load_state() logger.info( - "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s", + "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s", VM_URL, HISTORY_MINUTES, HORIZON_SECONDS, WRITE_HORIZON_SECONDS, POLL_INTERVAL, STATE_FILE, + EXTRA_PREDICT_LABELS["forecast"], ) while True: @@ -1055,4 +1328,6 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() + + \ No newline at end of file From 76e536eff9beb9e3f9db7453bbeb6f1c7844c198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Thu, 21 May 2026 13:55:08 +0800 Subject: [PATCH 24/31] fix --- ai/predict_v3_single_scene.py | 316 ++++++++++++---------------------- 1 file changed, 110 insertions(+), 206 deletions(-) diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py index fc07f4f..2cde8b8 100755 --- a/ai/predict_v3_single_scene.py +++ b/ai/predict_v3_single_scene.py @@ -1,22 +1,12 @@ # -*- coding: utf-8 -*- """ -ProtoForge Predictor v8 - -功能: -1. 从 VictoriaMetrics 拉取历史数据。 -2. 对 CNC 周期型指标进行相位对齐预测。 -3. 使用“谷底锚点”对齐周期,减少上升沿/下降沿相位偏差。 -4. 每轮只写入未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒,避免预测窗口重叠。 -5. 检测异常后冻结健康模板,不把故障数据学进去。 -6. 故障恢复后等待稳定一段时间,再恢复模板更新。 -7. 写入: - - xxx_predicted - - xxx_predicted_upper - - xxx_predicted_lower - - xxx_anomaly - - xxx_anomaly_outside_ratio - - xxx_anomaly_mean_abs_error - - xxx_anomaly_mean_rel_error +ProtoForge Predictor v9 + +修复重点: +1. 预测时间轴改为锚定最后一个真实数据点 last_real_ts,而不是锚定 time.time()。 +2. 不再使用 LAST_WRITTEN_UNTIL 把预测不断推向更远未来,避免 Grafana 里预测线相对真实线出现延迟/错位。 +3. 如果真实数据时间戳没有推进,则跳过本轮预测写入,避免重复写同一段未来时间造成毛刺。 +4. 保留:相位对齐、健康模板冻结、故障期不学习、恢复后再学习、预测上下界、异常指标。 """ import json @@ -50,14 +40,13 @@ # ============================================================================= VM_URL = "http://localhost:8428" - -STATE_FILE = "/tmp/protoforge_predictor_state_v8.json" +STATE_FILE = "/tmp/protoforge_predictor_state_v9.json" HISTORY_MINUTES = 30 HORIZON_SECONDS = 120 POLL_INTERVAL = 30 -# 实际写入窗口不要大于轮询间隔,否则多轮预测会重叠。 +# 实际每轮写入的预测长度。不要大于 POLL_INTERVAL,否则容易出现预测窗口重叠。 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) QUERY_STEP = "1s" @@ -76,14 +65,23 @@ RECOVERY_EMA_ALPHA = 0.30 OUTSIDE_RATIO_THRESHOLD = 0.60 -RECOVERY_INSIDE_RATIO_THRESHOLD = 0.80 - PHASE_SEARCH_RATIO = 0.15 VALLEY_QUANTILE = 45 +# 关键修复:预测时间轴锚定真实数据最后一个点。 +# True:预测从 last_real_ts + 1 开始,适合 Grafana 与真实曲线对齐展示。 +# False:预测从当前系统时间 + 1 开始,适合只看纯未来预测,但容易与有采集延迟的真实数据错位。 +ALIGN_PREDICTION_TO_LAST_REAL_TS = True + +# 如果 last_real_ts 距离当前系统时间太久,说明采集链路可能断了,跳过预测,避免用陈旧数据继续画未来线。 +MAX_DATA_LAG_SECONDS = 180 + +# 真实数据至少推进多少秒,才写入新预测,避免同一段未来时间被反复写入。 +MIN_REAL_ADVANCE_SECONDS = 1 + # ============================================================================= -# 预测指标配置 +# 指标配置 # ============================================================================= PREDICT_TARGETS = [ @@ -132,7 +130,7 @@ ] EXTRA_PREDICT_LABELS = { - "forecast": "phase_aligned_health_v8", + "forecast": "phase_aligned_health_v9", "source": "protoforge", } @@ -159,7 +157,10 @@ class BaselineState: BASELINE_STATES: Dict[str, BaselineState] = {} -LAST_WRITTEN_UNTIL: Dict[str, int] = {} + +# 记录每条序列最后一次使用的真实数据时间戳,而不是预测写到哪里。 +# 这样不会把预测不断推向更远的未来。 +LAST_REAL_TS_WRITTEN: Dict[str, int] = {} # ============================================================================= @@ -243,7 +244,6 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np. return np.array([]), np.array([]) sorted_items = sorted(data.items(), key=lambda x: x[0]) - ts_clean = np.array([x[0] for x in sorted_items], dtype=float) ys_clean = np.array([x[1] for x in sorted_items], dtype=float) @@ -263,7 +263,7 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np. # ============================================================================= -# 周期估计 +# 周期估计与谷底检测 # ============================================================================= def moving_average(arr: np.ndarray, window: int) -> np.ndarray: @@ -355,10 +355,6 @@ def estimate_period_rough(ys_arr: np.ndarray) -> int: return int(period) -# ============================================================================= -# 谷底锚点检测 -# ============================================================================= - def find_valley_indices( ts_grid: np.ndarray, ys_grid: np.ndarray, @@ -370,7 +366,6 @@ def find_valley_indices( return [] period = max(3, int(expected_period)) - smooth_window = max(3, int(round(period * 0.08))) smooth_window = min(smooth_window, 21) @@ -389,7 +384,6 @@ def find_valley_indices( if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE: candidates = [] - for i in range(1, n - 1): if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]: candidates.append(i) @@ -458,7 +452,7 @@ def detect_period_and_valleys( # ============================================================================= -# 相位对齐模板构建 +# 相位对齐模板 # ============================================================================= def build_template_from_valleys( @@ -552,7 +546,7 @@ def build_current_baseline( # ============================================================================= -# 模板预测 +# 预测与模板合并 # ============================================================================= def circular_template_value(template: np.ndarray, phase: float) -> float: @@ -562,10 +556,8 @@ def circular_template_value(template: np.ndarray, phase: float) -> float: return 0.0 phase = float(phase) % period - i0 = int(math.floor(phase)) % period i1 = (i0 + 1) % period - frac = phase - math.floor(phase) return float((1.0 - frac) * template[i0] + frac * template[i1]) @@ -583,7 +575,6 @@ def predict_with_origin( return np.zeros(len(ts_list), dtype=float) origin = int(state.phase_origin_ts if phase_origin_ts is None else phase_origin_ts) - values = [] for ts in ts_list: @@ -611,10 +602,7 @@ def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: return np.interp(new_x, old_x_ext, old_y_ext).astype(float) -def align_new_template_to_old( - old_template: np.ndarray, - new_template: np.ndarray, -) -> np.ndarray: +def align_new_template_to_old(old_template: np.ndarray, new_template: np.ndarray) -> np.ndarray: if len(old_template) != len(new_template): old_template = resample_template(old_template, len(new_template)) @@ -624,7 +612,6 @@ def align_new_template_to_old( return new_template.astype(float) max_shift = max(1, int(round(period * 0.10))) - old_norm = old_template - np.mean(old_template) best_score = None @@ -633,7 +620,6 @@ def align_new_template_to_old( for shift in range(-max_shift, max_shift + 1): shifted = np.roll(new_template, shift) shifted_norm = shifted - np.mean(shifted) - score = float(np.dot(old_norm, shifted_norm)) if best_score is None or score > best_score: @@ -643,18 +629,13 @@ def align_new_template_to_old( return best_template.astype(float) -def merge_template( - old_template: np.ndarray, - new_template: np.ndarray, - alpha: float, -) -> np.ndarray: +def merge_template(old_template: np.ndarray, new_template: np.ndarray, alpha: float) -> np.ndarray: alpha = float(np.clip(alpha, 0.0, 1.0)) if len(old_template) != len(new_template): old_template = resample_template(old_template, len(new_template)) new_template = align_new_template_to_old(old_template, new_template) - merged = (1.0 - alpha) * old_template + alpha * new_template return merged.astype(float) @@ -664,11 +645,7 @@ def merge_template( # 异常检测 # ============================================================================= -def calc_threshold( - pred: np.ndarray, - abs_threshold: float, - rel_threshold: float, -) -> np.ndarray: +def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray: return np.maximum(abs_threshold, np.abs(pred) * rel_threshold) @@ -678,11 +655,7 @@ def calc_bounds( rel_threshold: float, ) -> Tuple[np.ndarray, np.ndarray]: threshold = calc_threshold(pred, abs_threshold, rel_threshold) - - lower = pred - threshold - upper = pred + threshold - - return lower, upper + return pred - threshold, pred + threshold def find_best_phase_origin_for_recent( @@ -692,7 +665,6 @@ def find_best_phase_origin_for_recent( ) -> Tuple[int, np.ndarray, float]: period = int(state.period) base_origin = int(state.phase_origin_ts) - max_shift = max(1, int(round(period * PHASE_SEARCH_RATIO))) best_origin = base_origin @@ -732,14 +704,12 @@ def detect_anomaly( ) threshold = calc_threshold(pred, abs_threshold, rel_threshold) - abs_err = np.abs(actual - pred) outside = abs_err > threshold outside_ratio = float(np.mean(outside)) mean_abs_err = float(np.mean(abs_err)) mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1.0))) - is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err, int(best_origin) @@ -749,11 +719,7 @@ def detect_anomaly( # 健康基线状态管理 # ============================================================================= -def create_initial_state( - ts_grid: np.ndarray, - ys_grid: np.ndarray, - now_sec: int, -) -> Optional[BaselineState]: +def create_initial_state(ts_grid: np.ndarray, ys_grid: np.ndarray, now_sec: int) -> Optional[BaselineState]: baseline = build_current_baseline(ts_grid, ys_grid) if baseline is None: @@ -782,7 +748,6 @@ def maybe_update_state( rel_threshold: float, ) -> Tuple[Optional[BaselineState], bool, float, float, float]: now_sec = int(time.time()) - state = BASELINE_STATES.get(key) if state is None: @@ -805,7 +770,6 @@ def maybe_update_state( elapsed = max(1, now_sec - int(state.last_seen_ts)) elapsed = min(elapsed, POLL_INTERVAL * 2) - state.last_seen_ts = now_sec is_anom, outside_ratio, mean_abs_err, mean_rel_err, best_origin = detect_anomaly( @@ -819,7 +783,6 @@ def maybe_update_state( if is_anom: state.status = BASELINE_STATUS_ANOMALY state.clean_seconds = 0 - BASELINE_STATES[key] = state logger.warning( @@ -846,15 +809,9 @@ def maybe_update_state( if state.status == BASELINE_STATUS_ANOMALY: state.status = BASELINE_STATUS_RECOVERING state.clean_seconds = elapsed - BASELINE_STATES[key] = state - logger.info( - "异常开始恢复 key=%s clean_seconds=%ss", - key, - state.clean_seconds, - ) - + logger.info("异常开始恢复 key=%s clean_seconds=%ss", key, state.clean_seconds) return state, False, outside_ratio, mean_abs_err, mean_rel_err if state.status == BASELINE_STATUS_RECOVERING: @@ -877,11 +834,7 @@ def maybe_update_state( int(state.period) * MAX_CYCLES_FOR_TEMPLATE, ) - baseline = build_current_baseline( - ts_grid=ts_grid, - ys_grid=ys_grid, - tail_seconds=tail_seconds, - ) + baseline = build_current_baseline(ts_grid=ts_grid, ys_grid=ys_grid, tail_seconds=tail_seconds) if baseline is None: BASELINE_STATES[key] = state @@ -889,17 +842,9 @@ def maybe_update_state( new_period, new_origin, new_template = baseline old_template = np.array(state.template, dtype=float) + alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA - if state.status == BASELINE_STATUS_RECOVERING: - alpha = RECOVERY_EMA_ALPHA - else: - alpha = HEALTHY_EMA_ALPHA - - merged = merge_template( - old_template=old_template, - new_template=new_template, - alpha=alpha, - ) + merged = merge_template(old_template=old_template, new_template=new_template, alpha=alpha) state.period = int(new_period) state.phase_origin_ts = int(new_origin) @@ -933,12 +878,7 @@ def maybe_update_state( # ============================================================================= def prom_escape_label_value(value: str) -> str: - return ( - str(value) - .replace("\\", "\\\\") - .replace("\n", "\\n") - .replace('"', '\\"') - ) + return str(value).replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"') def labels_to_str(labels: Dict[str, str]) -> str: @@ -975,8 +915,7 @@ def write_series( if not math.isfinite(ts_sec) or not math.isfinite(val): continue - ts_ms = ts_sec * 1000 - lines.append(f"{metric_name}{label_str} {val:.6f} {ts_ms}") + lines.append(f"{metric_name}{label_str} {val:.6f} {ts_sec * 1000}") if not lines: return False @@ -987,9 +926,7 @@ def write_series( resp = requests.post( f"{VM_URL}/api/v1/import/prometheus", data=payload.encode("utf-8"), - headers={ - "Content-Type": "text/plain; version=0.0.4; charset=utf-8", - }, + headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, timeout=10, ) resp.raise_for_status() @@ -1012,60 +949,19 @@ def write_prediction_bundle( outside_ratio: float, mean_abs_err: float, mean_rel_err: float, + event_ts: int, ) -> bool: - ok1 = write_series( - metric_name=pred_metric, - labels=labels, - ts_list=ts_future, - values=pred_values.astype(float).tolist(), - ) - - ok2 = write_series( - metric_name=f"{pred_metric}_lower", - labels=labels, - ts_list=ts_future, - values=lower_values.astype(float).tolist(), - ) - - ok3 = write_series( - metric_name=f"{pred_metric}_upper", - labels=labels, - ts_list=ts_future, - values=upper_values.astype(float).tolist(), - ) - - now_sec = int(time.time()) + ok1 = write_series(pred_metric, labels, ts_future, pred_values.astype(float).tolist()) + ok2 = write_series(f"{pred_metric}_lower", labels, ts_future, lower_values.astype(float).tolist()) + ok3 = write_series(f"{pred_metric}_upper", labels, ts_future, upper_values.astype(float).tolist()) anomaly_labels = dict(labels) anomaly_labels["type"] = "prediction_deviation" - ok4 = write_series( - metric_name=anomaly_metric, - labels=anomaly_labels, - ts_list=[now_sec], - values=[1.0 if is_anomaly else 0.0], - ) - - ok5 = write_series( - metric_name=f"{anomaly_metric}_outside_ratio", - labels=anomaly_labels, - ts_list=[now_sec], - values=[outside_ratio], - ) - - ok6 = write_series( - metric_name=f"{anomaly_metric}_mean_abs_error", - labels=anomaly_labels, - ts_list=[now_sec], - values=[mean_abs_err], - ) - - ok7 = write_series( - metric_name=f"{anomaly_metric}_mean_rel_error", - labels=anomaly_labels, - ts_list=[now_sec], - values=[mean_rel_err], - ) + ok4 = write_series(anomaly_metric, anomaly_labels, [event_ts], [1.0 if is_anomaly else 0.0]) + ok5 = write_series(f"{anomaly_metric}_outside_ratio", anomaly_labels, [event_ts], [outside_ratio]) + ok6 = write_series(f"{anomaly_metric}_mean_abs_error", anomaly_labels, [event_ts], [mean_abs_err]) + ok7 = write_series(f"{anomaly_metric}_mean_rel_error", anomaly_labels, [event_ts], [mean_rel_err]) return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7 @@ -1074,9 +970,7 @@ def write_prediction_bundle( # 标签解析 # ============================================================================= -_LABEL_PATTERN = re.compile( - r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*' -) +_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*') def parse_labels_from_query(query: str) -> Dict[str, str]: @@ -1093,14 +987,7 @@ def parse_labels_from_query(query: str) -> Dict[str, str]: for match in _LABEL_PATTERN.finditer(label_part): key = match.group(1) value = match.group(2) - - value = ( - value - .replace('\\"', '"') - .replace("\\n", "\n") - .replace("\\\\", "\\") - ) - + value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\") labels[key] = value return labels @@ -1155,12 +1042,7 @@ def load_state() -> None: states[key] = BaselineState(**value) BASELINE_STATES = states - - logger.info( - "已加载预测状态文件 %s,状态数量=%d", - STATE_FILE, - len(BASELINE_STATES), - ) + logger.info("已加载预测状态文件 %s,状态数量=%d", STATE_FILE, len(BASELINE_STATES)) except Exception as e: logger.warning("加载预测状态文件失败,将重新学习: %s", e) @@ -1186,6 +1068,44 @@ def save_state() -> None: logger.warning("保存预测状态文件失败: %s", e) +# ============================================================================= +# 时间轴选择 +# ============================================================================= + +def build_prediction_timestamps(key: str, last_real_ts: int, now_sec: int) -> Optional[List[int]]: + data_lag = now_sec - last_real_ts + + if data_lag > MAX_DATA_LAG_SECONDS: + logger.warning( + "真实数据延迟过大,跳过预测 key=%s data_lag=%ss max=%ss", + key, + data_lag, + MAX_DATA_LAG_SECONDS, + ) + return None + + last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key) + + if last_written_real_ts is not None: + advance = last_real_ts - int(last_written_real_ts) + + if advance < MIN_REAL_ADVANCE_SECONDS: + logger.info( + "真实数据时间戳未推进,跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s", + key, + last_real_ts, + last_written_real_ts, + ) + return None + + if ALIGN_PREDICTION_TO_LAST_REAL_TS: + base_ts = last_real_ts + else: + base_ts = now_sec + + return [base_ts + i + 1 for i in range(WRITE_HORIZON_SECONDS)] + + # ============================================================================= # 主流程 # ============================================================================= @@ -1203,28 +1123,17 @@ def run_once() -> None: ts, ys = fetch_history(query) if len(ys) < MIN_POINTS: - logger.info( - "[%s] %s 数据不足(%d 点),跳过", - now_str, - query, - len(ys), - ) + logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) continue ts_grid, ys_grid = normalize_history(ts, ys) if len(ys_grid) < MIN_POINTS: - logger.info( - "[%s] %s 清洗后数据不足(%d 点),跳过", - now_str, - query, - len(ys_grid), - ) + logger.info("[%s] %s 清洗后数据不足(%d 点),跳过", now_str, query, len(ys_grid)) continue base_labels = parse_labels_from_query(query) write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) - key = series_key(pred_metric, write_labels) state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state( @@ -1236,26 +1145,23 @@ def run_once() -> None: ) if state is None: - logger.info( - "[%s] %s 暂无可用健康模板,等待学习", - now_str, - query, - ) + logger.info("[%s] %s 暂无可用健康模板,等待学习", now_str, query) continue now_sec = int(time.time()) - last_until = LAST_WRITTEN_UNTIL.get(key, 0) last_real_ts = int(ts_grid[-1]) + data_lag = now_sec - last_real_ts - base_ts = max(now_sec, last_until, last_real_ts) + ts_future = build_prediction_timestamps( + key=key, + last_real_ts=last_real_ts, + now_sec=now_sec, + ) - ts_future = [ - base_ts + i + 1 - for i in range(WRITE_HORIZON_SECONDS) - ] + if not ts_future: + continue pred_values = predict_with_origin(state, ts_future) - lower_values, upper_values = calc_bounds( pred=pred_values, abs_threshold=abs_threshold, @@ -1274,24 +1180,22 @@ def run_once() -> None: outside_ratio=outside_ratio, mean_abs_err=mean_abs_err, mean_rel_err=mean_rel_err, + event_ts=last_real_ts, ) if not ok: - logger.error( - "[%s] %s 写入预测数据失败", - now_str, - query, - ) + logger.error("[%s] %s 写入预测数据失败", now_str, query) continue - LAST_WRITTEN_UNTIL[key] = int(max(ts_future)) + LAST_REAL_TS_WRITTEN[key] = last_real_ts future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S") future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") + last_real_str = datetime.fromtimestamp(last_real_ts).strftime("%H:%M:%S") origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S") logger.info( - "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss origin=%s clean=%ss 写入 %d 点,预测区间 %s ~ %s", + "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss origin=%s last_real=%s lag=%ss 写入 %d 点,预测区间 %s ~ %s", now_str, query, pred_metric, @@ -1299,7 +1203,8 @@ def run_once() -> None: is_anomaly, state.period, origin_str, - state.clean_seconds, + last_real_str, + data_lag, len(ts_future), future_start, future_end, @@ -1312,7 +1217,7 @@ def main() -> None: load_state() logger.info( - "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s", + "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s align_to_last_real=%s", VM_URL, HISTORY_MINUTES, HORIZON_SECONDS, @@ -1320,6 +1225,7 @@ def main() -> None: POLL_INTERVAL, STATE_FILE, EXTRA_PREDICT_LABELS["forecast"], + ALIGN_PREDICTION_TO_LAST_REAL_TS, ) while True: @@ -1329,5 +1235,3 @@ def main() -> None: if __name__ == "__main__": main() - - \ No newline at end of file From f5e7b2a27320066b5eb9157f297d6651adcd36c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Thu, 21 May 2026 14:05:54 +0800 Subject: [PATCH 25/31] fix --- ai/predict_v3_single_scene.py | 520 +++++++++++++++++++++++++--------- 1 file changed, 385 insertions(+), 135 deletions(-) diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py index 2cde8b8..d212d2d 100755 --- a/ai/predict_v3_single_scene.py +++ b/ai/predict_v3_single_scene.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- """ -ProtoForge Predictor v9 +ProtoForge Predictor v10 修复重点: -1. 预测时间轴改为锚定最后一个真实数据点 last_real_ts,而不是锚定 time.time()。 -2. 不再使用 LAST_WRITTEN_UNTIL 把预测不断推向更远未来,避免 Grafana 里预测线相对真实线出现延迟/错位。 -3. 如果真实数据时间戳没有推进,则跳过本轮预测写入,避免重复写同一段未来时间造成毛刺。 -4. 保留:相位对齐、健康模板冻结、故障期不学习、恢复后再学习、预测上下界、异常指标。 +1. 修复 lag=0 但预测线仍然相位漂移的问题。 +2. 在谷底相位对齐基础上,增加 phase-lock 相位锁定。 +3. 每轮使用最近 1~2 个周期真实数据,搜索最佳 period + phase_origin。 +4. 预测起点仍然锚定最后一个真实点 last_real_ts,避免写入延迟。 +5. 保留健康模板冻结逻辑:异常期间不学习故障数据。 +6. 保留预测上下界和异常指标。 """ import json @@ -40,13 +42,12 @@ # ============================================================================= VM_URL = "http://localhost:8428" -STATE_FILE = "/tmp/protoforge_predictor_state_v9.json" +STATE_FILE = "/tmp/protoforge_predictor_state_v10.json" HISTORY_MINUTES = 30 HORIZON_SECONDS = 120 POLL_INTERVAL = 30 -# 实际每轮写入的预测长度。不要大于 POLL_INTERVAL,否则容易出现预测窗口重叠。 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) QUERY_STEP = "1s" @@ -58,26 +59,29 @@ MIN_FULL_CYCLES_FOR_TEMPLATE = 3 MAX_CYCLES_FOR_TEMPLATE = 6 -DETECT_WINDOW_SECONDS = 15 +DETECT_WINDOW_SECONDS = 20 RECOVERY_MIN_SECONDS = 60 -HEALTHY_EMA_ALPHA = 0.12 -RECOVERY_EMA_ALPHA = 0.30 +HEALTHY_EMA_ALPHA = 0.10 +RECOVERY_EMA_ALPHA = 0.25 OUTSIDE_RATIO_THRESHOLD = 0.60 -PHASE_SEARCH_RATIO = 0.15 + VALLEY_QUANTILE = 45 -# 关键修复:预测时间轴锚定真实数据最后一个点。 -# True:预测从 last_real_ts + 1 开始,适合 Grafana 与真实曲线对齐展示。 -# False:预测从当前系统时间 + 1 开始,适合只看纯未来预测,但容易与有采集延迟的真实数据错位。 -ALIGN_PREDICTION_TO_LAST_REAL_TS = True +# phase-lock 配置 +PHASE_LOCK_MIN_WINDOW_SECONDS = 45 +PHASE_LOCK_MAX_WINDOW_SECONDS = 180 +PHASE_LOCK_PERIOD_SEARCH_RATIO = 0.12 +PHASE_LOCK_ORIGIN_SEARCH_RATIO = 0.35 +PHASE_LOCK_PERIOD_STEP = 1 +PHASE_LOCK_ORIGIN_STEP = 1 -# 如果 last_real_ts 距离当前系统时间太久,说明采集链路可能断了,跳过预测,避免用陈旧数据继续画未来线。 +# 真实数据延迟超过这个值,就不继续预测 MAX_DATA_LAG_SECONDS = 180 -# 真实数据至少推进多少秒,才写入新预测,避免同一段未来时间被反复写入。 -MIN_REAL_ADVANCE_SECONDS = 1 +# 预测锚定最后一个真实点 +ALIGN_PREDICTION_TO_LAST_REAL_TS = True # ============================================================================= @@ -130,7 +134,7 @@ ] EXTRA_PREDICT_LABELS = { - "forecast": "phase_aligned_health_v9", + "forecast": "phase_locked_health_v10", "source": "protoforge", } @@ -157,9 +161,6 @@ class BaselineState: BASELINE_STATES: Dict[str, BaselineState] = {} - -# 记录每条序列最后一次使用的真实数据时间戳,而不是预测写到哪里。 -# 这样不会把预测不断推向更远的未来。 LAST_REAL_TS_WRITTEN: Dict[str, int] = {} @@ -197,8 +198,6 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa return [], [] values = result[0].get("values", []) - if not values: - return [], [] ts = [] ys = [] @@ -244,6 +243,7 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np. return np.array([]), np.array([]) sorted_items = sorted(data.items(), key=lambda x: x[0]) + ts_clean = np.array([x[0] for x in sorted_items], dtype=float) ys_clean = np.array([x[1] for x in sorted_items], dtype=float) @@ -263,7 +263,7 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np. # ============================================================================= -# 周期估计与谷底检测 +# 周期估计 # ============================================================================= def moving_average(arr: np.ndarray, window: int) -> np.ndarray: @@ -355,6 +355,10 @@ def estimate_period_rough(ys_arr: np.ndarray) -> int: return int(period) +# ============================================================================= +# 谷底检测与模板构建 +# ============================================================================= + def find_valley_indices( ts_grid: np.ndarray, ys_grid: np.ndarray, @@ -366,6 +370,7 @@ def find_valley_indices( return [] period = max(3, int(expected_period)) + smooth_window = max(3, int(round(period * 0.08))) smooth_window = min(smooth_window, 21) @@ -384,6 +389,7 @@ def find_valley_indices( if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE: candidates = [] + for i in range(1, n - 1): if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]: candidates.append(i) @@ -451,10 +457,6 @@ def detect_period_and_valleys( return int(period), valleys -# ============================================================================= -# 相位对齐模板 -# ============================================================================= - def build_template_from_valleys( ts_grid: np.ndarray, ys_grid: np.ndarray, @@ -546,7 +548,7 @@ def build_current_baseline( # ============================================================================= -# 预测与模板合并 +# 模板预测与重采样 # ============================================================================= def circular_template_value(template: np.ndarray, phase: float) -> float: @@ -556,6 +558,7 @@ def circular_template_value(template: np.ndarray, phase: float) -> float: return 0.0 phase = float(phase) % period + i0 = int(math.floor(phase)) % period i1 = (i0 + 1) % period frac = phase - math.floor(phase) @@ -563,46 +566,77 @@ def circular_template_value(template: np.ndarray, phase: float) -> float: return float((1.0 - frac) * template[i0] + frac * template[i1]) -def predict_with_origin( - state: BaselineState, +def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: + old_period = len(old_template) + + if old_period == new_period: + return old_template.astype(float) + + if old_period <= 1 or new_period <= 1: + return np.full(new_period, float(np.mean(old_template)), dtype=float) + + old_x = np.linspace(0.0, 1.0, old_period, endpoint=False) + new_x = np.linspace(0.0, 1.0, new_period, endpoint=False) + + old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0]) + old_y_ext = np.concatenate([old_template, old_template, old_template]) + + return np.interp(new_x, old_x_ext, old_y_ext).astype(float) + + +def predict_template_values( + template: np.ndarray, + period: int, + phase_origin_ts: int, ts_list: List[int], - phase_origin_ts: Optional[int] = None, ) -> np.ndarray: - template = np.array(state.template, dtype=float) - period = int(state.period) - - if period <= 1 or len(template) != period: + if period <= 1: return np.zeros(len(ts_list), dtype=float) - origin = int(state.phase_origin_ts if phase_origin_ts is None else phase_origin_ts) + if len(template) != period: + template = resample_template(template, period) + values = [] for ts in ts_list: - phase = (int(ts) - origin) % period + phase = (int(ts) - int(phase_origin_ts)) % period values.append(circular_template_value(template, phase)) return np.array(values, dtype=float) -def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: - old_period = len(old_template) +def predict_with_state(state: BaselineState, ts_list: List[int]) -> np.ndarray: + template = np.array(state.template, dtype=float) - if old_period == new_period: - return old_template.astype(float) + return predict_template_values( + template=template, + period=int(state.period), + phase_origin_ts=int(state.phase_origin_ts), + ts_list=ts_list, + ) - if old_period <= 1 or new_period <= 1: - return np.full(new_period, float(np.mean(old_template)), dtype=float) - old_x = np.linspace(0.0, 1.0, old_period, endpoint=False) - new_x = np.linspace(0.0, 1.0, new_period, endpoint=False) +def normalize_origin_near(origin: int, period: int, near_ts: int) -> int: + if period <= 1: + return origin - old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0]) - old_y_ext = np.concatenate([old_template, old_template, old_template]) + origin = int(origin) + period = int(period) + near_ts = int(near_ts) - return np.interp(new_x, old_x_ext, old_y_ext).astype(float) + while origin + period <= near_ts: + origin += period + while origin > near_ts: + origin -= period -def align_new_template_to_old(old_template: np.ndarray, new_template: np.ndarray) -> np.ndarray: + return origin + + +def align_new_template_to_old( + old_template: np.ndarray, + new_template: np.ndarray, +) -> np.ndarray: if len(old_template) != len(new_template): old_template = resample_template(old_template, len(new_template)) @@ -629,23 +663,117 @@ def align_new_template_to_old(old_template: np.ndarray, new_template: np.ndarray return best_template.astype(float) -def merge_template(old_template: np.ndarray, new_template: np.ndarray, alpha: float) -> np.ndarray: +def merge_template( + old_template: np.ndarray, + new_template: np.ndarray, + alpha: float, +) -> np.ndarray: alpha = float(np.clip(alpha, 0.0, 1.0)) if len(old_template) != len(new_template): old_template = resample_template(old_template, len(new_template)) new_template = align_new_template_to_old(old_template, new_template) + merged = (1.0 - alpha) * old_template + alpha * new_template return merged.astype(float) +# ============================================================================= +# Phase Lock +# ============================================================================= + +def phase_lock_recent( + state: BaselineState, + ts_grid: np.ndarray, + ys_grid: np.ndarray, +) -> Tuple[int, int, np.ndarray, float]: + base_period = int(state.period) + base_origin = int(state.phase_origin_ts) + base_template = np.array(state.template, dtype=float) + + if base_period <= 1 or len(base_template) <= 1: + ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() + pred = predict_with_state(state, ts_recent) + actual = ys_grid[-len(ts_recent):].astype(float) + mae = float(np.mean(np.abs(actual - pred))) if len(actual) else 0.0 + return base_period, base_origin, pred, mae + + window_seconds = max( + PHASE_LOCK_MIN_WINDOW_SECONDS, + min(PHASE_LOCK_MAX_WINDOW_SECONDS, int(base_period * 2)), + ) + + cutoff = ts_grid[-1] - window_seconds + mask = ts_grid >= cutoff + + ts_recent_arr = ts_grid[mask].astype(int) + actual = ys_grid[mask].astype(float) + + if len(ts_recent_arr) < max(10, DETECT_WINDOW_SECONDS): + ts_recent_arr = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int) + actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float) + + ts_recent = ts_recent_arr.tolist() + last_ts = int(ts_recent[-1]) + + p_min = max(int(MIN_PERIOD_SECONDS), int(round(base_period * (1.0 - PHASE_LOCK_PERIOD_SEARCH_RATIO)))) + p_max = min(int(MAX_PERIOD_SECONDS), int(round(base_period * (1.0 + PHASE_LOCK_PERIOD_SEARCH_RATIO)))) + + if p_max < p_min: + p_min = p_max = base_period + + best_period = base_period + best_origin = normalize_origin_near(base_origin, base_period, last_ts) + best_template = resample_template(base_template, best_period) + best_pred = predict_template_values(best_template, best_period, best_origin, ts_recent) + best_mae = float(np.mean(np.abs(actual - best_pred))) + + for period in range(p_min, p_max + 1, PHASE_LOCK_PERIOD_STEP): + template = resample_template(base_template, period) + center_origin = normalize_origin_near(base_origin, period, last_ts) + + origin_shift = max(2, int(round(period * PHASE_LOCK_ORIGIN_SEARCH_RATIO))) + + for shift in range(-origin_shift, origin_shift + 1, PHASE_LOCK_ORIGIN_STEP): + origin = center_origin + shift + + pred = predict_template_values( + template=template, + period=period, + phase_origin_ts=origin, + ts_list=ts_recent, + ) + + mae = float(np.mean(np.abs(actual - pred))) + + # 轻微惩罚周期变化,避免过拟合抖动 + penalty = abs(period - base_period) * 0.5 + score = mae + penalty + + best_score = best_mae + abs(best_period - base_period) * 0.5 + + if score < best_score: + best_period = period + best_origin = origin + best_pred = pred + best_mae = mae + + best_origin = normalize_origin_near(best_origin, best_period, last_ts) + + return int(best_period), int(best_origin), best_pred, float(best_mae) + + # ============================================================================= # 异常检测 # ============================================================================= -def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray: +def calc_threshold( + pred: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> np.ndarray: return np.maximum(abs_threshold, np.abs(pred) * rel_threshold) @@ -655,33 +783,8 @@ def calc_bounds( rel_threshold: float, ) -> Tuple[np.ndarray, np.ndarray]: threshold = calc_threshold(pred, abs_threshold, rel_threshold) - return pred - threshold, pred + threshold - - -def find_best_phase_origin_for_recent( - state: BaselineState, - ts_recent: List[int], - actual: np.ndarray, -) -> Tuple[int, np.ndarray, float]: - period = int(state.period) - base_origin = int(state.phase_origin_ts) - max_shift = max(1, int(round(period * PHASE_SEARCH_RATIO))) - - best_origin = base_origin - best_pred = predict_with_origin(state, ts_recent, base_origin) - best_mae = float(np.mean(np.abs(actual - best_pred))) - - for shift in range(-max_shift, max_shift + 1): - origin = base_origin + shift - pred = predict_with_origin(state, ts_recent, origin) - mae = float(np.mean(np.abs(actual - pred))) - if mae < best_mae: - best_mae = mae - best_origin = origin - best_pred = pred - - return best_origin, best_pred, best_mae + return pred - threshold, pred + threshold def detect_anomaly( @@ -690,36 +793,50 @@ def detect_anomaly( ys_grid: np.ndarray, abs_threshold: float, rel_threshold: float, -) -> Tuple[bool, float, float, float, int]: - if len(ys_grid) < DETECT_WINDOW_SECONDS: - return False, 0.0, 0.0, 0.0, int(state.phase_origin_ts) - - ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() - actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float) - - best_origin, pred, _ = find_best_phase_origin_for_recent( +) -> Tuple[bool, float, float, float, int, int]: + best_period, best_origin, pred_recent, _ = phase_lock_recent( state=state, - ts_recent=ts_recent, - actual=actual, + ts_grid=ts_grid, + ys_grid=ys_grid, ) - threshold = calc_threshold(pred, abs_threshold, rel_threshold) - abs_err = np.abs(actual - pred) + recent_len = len(pred_recent) + + if recent_len <= 0: + return False, 0.0, 0.0, 0.0, best_period, best_origin + + actual = ys_grid[-recent_len:].astype(float) + + threshold = calc_threshold(pred_recent, abs_threshold, rel_threshold) + + abs_err = np.abs(actual - pred_recent) outside = abs_err > threshold outside_ratio = float(np.mean(outside)) mean_abs_err = float(np.mean(abs_err)) - mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1.0))) + mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred_recent), 1.0))) + is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD - return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err, int(best_origin) + return ( + is_anomaly, + outside_ratio, + mean_abs_err, + mean_rel_err, + int(best_period), + int(best_origin), + ) # ============================================================================= # 健康基线状态管理 # ============================================================================= -def create_initial_state(ts_grid: np.ndarray, ys_grid: np.ndarray, now_sec: int) -> Optional[BaselineState]: +def create_initial_state( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + now_sec: int, +) -> Optional[BaselineState]: baseline = build_current_baseline(ts_grid, ys_grid) if baseline is None: @@ -740,6 +857,26 @@ def create_initial_state(ts_grid: np.ndarray, ys_grid: np.ndarray, now_sec: int) ) +def apply_phase_lock_to_state( + state: BaselineState, + best_period: int, + best_origin: int, +) -> None: + best_period = int(best_period) + + if best_period <= 1: + return + + template = np.array(state.template, dtype=float) + + if len(template) != best_period: + template = resample_template(template, best_period) + + state.period = best_period + state.phase_origin_ts = int(best_origin) + state.template = template.astype(float).tolist() + + def maybe_update_state( key: str, ts_grid: np.ndarray, @@ -772,7 +909,14 @@ def maybe_update_state( elapsed = min(elapsed, POLL_INTERVAL * 2) state.last_seen_ts = now_sec - is_anom, outside_ratio, mean_abs_err, mean_rel_err, best_origin = detect_anomaly( + ( + is_anomaly, + outside_ratio, + mean_abs_err, + mean_rel_err, + best_period, + best_origin, + ) = detect_anomaly( state=state, ts_grid=ts_grid, ys_grid=ys_grid, @@ -780,9 +924,10 @@ def maybe_update_state( rel_threshold=rel_threshold, ) - if is_anom: + if is_anomaly: state.status = BASELINE_STATUS_ANOMALY state.clean_seconds = 0 + BASELINE_STATES[key] = state logger.warning( @@ -795,13 +940,17 @@ def maybe_update_state( return state, True, outside_ratio, mean_abs_err, mean_rel_err + old_period = int(state.period) old_origin = int(state.phase_origin_ts) - state.phase_origin_ts = int(best_origin) - if abs(state.phase_origin_ts - old_origin) >= 1: - logger.debug( - "相位校正 key=%s origin %s -> %s", + apply_phase_lock_to_state(state, best_period, best_origin) + + if old_period != state.period or old_origin != state.phase_origin_ts: + logger.info( + "phase-lock key=%s period %s -> %s origin %s -> %s", key, + old_period, + state.period, datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"), datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), ) @@ -809,9 +958,15 @@ def maybe_update_state( if state.status == BASELINE_STATUS_ANOMALY: state.status = BASELINE_STATUS_RECOVERING state.clean_seconds = elapsed + BASELINE_STATES[key] = state - logger.info("异常开始恢复 key=%s clean_seconds=%ss", key, state.clean_seconds) + logger.info( + "异常开始恢复 key=%s clean_seconds=%ss", + key, + state.clean_seconds, + ) + return state, False, outside_ratio, mean_abs_err, mean_rel_err if state.status == BASELINE_STATUS_RECOVERING: @@ -834,17 +989,27 @@ def maybe_update_state( int(state.period) * MAX_CYCLES_FOR_TEMPLATE, ) - baseline = build_current_baseline(ts_grid=ts_grid, ys_grid=ys_grid, tail_seconds=tail_seconds) + baseline = build_current_baseline( + ts_grid=ts_grid, + ys_grid=ys_grid, + tail_seconds=tail_seconds, + ) if baseline is None: BASELINE_STATES[key] = state return state, False, outside_ratio, mean_abs_err, mean_rel_err new_period, new_origin, new_template = baseline + old_template = np.array(state.template, dtype=float) + alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA - merged = merge_template(old_template=old_template, new_template=new_template, alpha=alpha) + merged = merge_template( + old_template=old_template, + new_template=new_template, + alpha=alpha, + ) state.period = int(new_period) state.phase_origin_ts = int(new_origin) @@ -878,7 +1043,12 @@ def maybe_update_state( # ============================================================================= def prom_escape_label_value(value: str) -> str: - return str(value).replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"') + return ( + str(value) + .replace("\\", "\\\\") + .replace("\n", "\\n") + .replace('"', '\\"') + ) def labels_to_str(labels: Dict[str, str]) -> str: @@ -926,7 +1096,9 @@ def write_series( resp = requests.post( f"{VM_URL}/api/v1/import/prometheus", data=payload.encode("utf-8"), - headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, + headers={ + "Content-Type": "text/plain; version=0.0.4; charset=utf-8", + }, timeout=10, ) resp.raise_for_status() @@ -951,17 +1123,57 @@ def write_prediction_bundle( mean_rel_err: float, event_ts: int, ) -> bool: - ok1 = write_series(pred_metric, labels, ts_future, pred_values.astype(float).tolist()) - ok2 = write_series(f"{pred_metric}_lower", labels, ts_future, lower_values.astype(float).tolist()) - ok3 = write_series(f"{pred_metric}_upper", labels, ts_future, upper_values.astype(float).tolist()) + ok1 = write_series( + metric_name=pred_metric, + labels=labels, + ts_list=ts_future, + values=pred_values.astype(float).tolist(), + ) + + ok2 = write_series( + metric_name=f"{pred_metric}_lower", + labels=labels, + ts_list=ts_future, + values=lower_values.astype(float).tolist(), + ) + + ok3 = write_series( + metric_name=f"{pred_metric}_upper", + labels=labels, + ts_list=ts_future, + values=upper_values.astype(float).tolist(), + ) anomaly_labels = dict(labels) anomaly_labels["type"] = "prediction_deviation" - ok4 = write_series(anomaly_metric, anomaly_labels, [event_ts], [1.0 if is_anomaly else 0.0]) - ok5 = write_series(f"{anomaly_metric}_outside_ratio", anomaly_labels, [event_ts], [outside_ratio]) - ok6 = write_series(f"{anomaly_metric}_mean_abs_error", anomaly_labels, [event_ts], [mean_abs_err]) - ok7 = write_series(f"{anomaly_metric}_mean_rel_error", anomaly_labels, [event_ts], [mean_rel_err]) + ok4 = write_series( + metric_name=anomaly_metric, + labels=anomaly_labels, + ts_list=[event_ts], + values=[1.0 if is_anomaly else 0.0], + ) + + ok5 = write_series( + metric_name=f"{anomaly_metric}_outside_ratio", + labels=anomaly_labels, + ts_list=[event_ts], + values=[outside_ratio], + ) + + ok6 = write_series( + metric_name=f"{anomaly_metric}_mean_abs_error", + labels=anomaly_labels, + ts_list=[event_ts], + values=[mean_abs_err], + ) + + ok7 = write_series( + metric_name=f"{anomaly_metric}_mean_rel_error", + labels=anomaly_labels, + ts_list=[event_ts], + values=[mean_rel_err], + ) return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7 @@ -970,7 +1182,9 @@ def write_prediction_bundle( # 标签解析 # ============================================================================= -_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*') +_LABEL_PATTERN = re.compile( + r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*' +) def parse_labels_from_query(query: str) -> Dict[str, str]: @@ -987,7 +1201,14 @@ def parse_labels_from_query(query: str) -> Dict[str, str]: for match in _LABEL_PATTERN.finditer(label_part): key = match.group(1) value = match.group(2) - value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\") + + value = ( + value + .replace('\\"', '"') + .replace("\\n", "\n") + .replace("\\\\", "\\") + ) + labels[key] = value return labels @@ -1042,7 +1263,12 @@ def load_state() -> None: states[key] = BaselineState(**value) BASELINE_STATES = states - logger.info("已加载预测状态文件 %s,状态数量=%d", STATE_FILE, len(BASELINE_STATES)) + + logger.info( + "已加载预测状态文件 %s,状态数量=%d", + STATE_FILE, + len(BASELINE_STATES), + ) except Exception as e: logger.warning("加载预测状态文件失败,将重新学习: %s", e) @@ -1069,10 +1295,14 @@ def save_state() -> None: # ============================================================================= -# 时间轴选择 +# 时间轴 # ============================================================================= -def build_prediction_timestamps(key: str, last_real_ts: int, now_sec: int) -> Optional[List[int]]: +def build_prediction_timestamps( + key: str, + last_real_ts: int, + now_sec: int, +) -> Optional[List[int]]: data_lag = now_sec - last_real_ts if data_lag > MAX_DATA_LAG_SECONDS: @@ -1086,24 +1316,24 @@ def build_prediction_timestamps(key: str, last_real_ts: int, now_sec: int) -> Op last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key) - if last_written_real_ts is not None: - advance = last_real_ts - int(last_written_real_ts) - - if advance < MIN_REAL_ADVANCE_SECONDS: - logger.info( - "真实数据时间戳未推进,跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s", - key, - last_real_ts, - last_written_real_ts, - ) - return None + if last_written_real_ts is not None and last_real_ts <= int(last_written_real_ts): + logger.info( + "真实数据时间戳未推进,跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s", + key, + last_real_ts, + last_written_real_ts, + ) + return None if ALIGN_PREDICTION_TO_LAST_REAL_TS: base_ts = last_real_ts else: base_ts = now_sec - return [base_ts + i + 1 for i in range(WRITE_HORIZON_SECONDS)] + return [ + base_ts + i + 1 + for i in range(WRITE_HORIZON_SECONDS) + ] # ============================================================================= @@ -1123,17 +1353,28 @@ def run_once() -> None: ts, ys = fetch_history(query) if len(ys) < MIN_POINTS: - logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) + logger.info( + "[%s] %s 数据不足(%d 点),跳过", + now_str, + query, + len(ys), + ) continue ts_grid, ys_grid = normalize_history(ts, ys) if len(ys_grid) < MIN_POINTS: - logger.info("[%s] %s 清洗后数据不足(%d 点),跳过", now_str, query, len(ys_grid)) + logger.info( + "[%s] %s 清洗后数据不足(%d 点),跳过", + now_str, + query, + len(ys_grid), + ) continue base_labels = parse_labels_from_query(query) write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) + key = series_key(pred_metric, write_labels) state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state( @@ -1145,7 +1386,11 @@ def run_once() -> None: ) if state is None: - logger.info("[%s] %s 暂无可用健康模板,等待学习", now_str, query) + logger.info( + "[%s] %s 暂无可用健康模板,等待学习", + now_str, + query, + ) continue now_sec = int(time.time()) @@ -1161,7 +1406,8 @@ def run_once() -> None: if not ts_future: continue - pred_values = predict_with_origin(state, ts_future) + pred_values = predict_with_state(state, ts_future) + lower_values, upper_values = calc_bounds( pred=pred_values, abs_threshold=abs_threshold, @@ -1184,7 +1430,11 @@ def run_once() -> None: ) if not ok: - logger.error("[%s] %s 写入预测数据失败", now_str, query) + logger.error( + "[%s] %s 写入预测数据失败", + now_str, + query, + ) continue LAST_REAL_TS_WRITTEN[key] = last_real_ts @@ -1234,4 +1484,4 @@ def main() -> None: if __name__ == "__main__": - main() + main() \ No newline at end of file From 79e9f9b080e3f5fc4a284ab1422d3a54aacaff0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Thu, 21 May 2026 21:23:40 +0800 Subject: [PATCH 26/31] feat(pridict_v4): update pridict v4 version --- ai/pridict_v4.py | 1604 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1604 insertions(+) create mode 100644 ai/pridict_v4.py diff --git a/ai/pridict_v4.py b/ai/pridict_v4.py new file mode 100644 index 0000000..774ad3a --- /dev/null +++ b/ai/pridict_v4.py @@ -0,0 +1,1604 @@ +# -*- coding: utf-8 -*- +""" +ProtoForge Predictor v11 + +核心能力: +1. feed_rate / spindle_speed / spindle_current 使用 phase-lock 点预测。 +2. vibration_x / vibration_y / vibration_z 使用 phase-band 预测带。 +3. vibration 类指标不再追求单点完全贴合,而是输出: + - xxx_predicted 中位数预测线 + - xxx_predicted_upper 正常上边界 + - xxx_predicted_lower 正常下边界 +4. 预测起点锚定最后一个真实点 last_real_ts,避免时间错位。 +5. 异常期间冻结健康模板,不学习故障数据。 +6. 故障恢复后等待稳定,再恢复模板学习。 +""" + +import json +import logging +import math +import os +import re +import time +from dataclasses import asdict, dataclass +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple + +import numpy as np +import requests + + +# ============================================================================= +# 日志配置 +# ============================================================================= + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# 基础配置 +# ============================================================================= + +VM_URL = "http://localhost:8428" +STATE_FILE = "/tmp/protoforge_predictor_state_v11.json" + +HISTORY_MINUTES = 30 +HORIZON_SECONDS = 120 +POLL_INTERVAL = 30 + +WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) + +QUERY_STEP = "1s" +MIN_POINTS = 120 + +MIN_PERIOD_SECONDS = 5 +MAX_PERIOD_SECONDS = 3600 + +MIN_FULL_CYCLES_FOR_TEMPLATE = 3 +MAX_CYCLES_FOR_TEMPLATE = 8 + +DETECT_WINDOW_SECONDS = 20 +RECOVERY_MIN_SECONDS = 60 + +HEALTHY_EMA_ALPHA = 0.10 +RECOVERY_EMA_ALPHA = 0.25 + +OUTSIDE_RATIO_THRESHOLD = 0.60 + +VALLEY_QUANTILE = 45 + +MAX_DATA_LAG_SECONDS = 180 + +PHASE_LOCK_MIN_WINDOW_SECONDS = 45 +PHASE_LOCK_MAX_WINDOW_SECONDS = 180 +PHASE_LOCK_PERIOD_SEARCH_RATIO = 0.12 +PHASE_LOCK_ORIGIN_SEARCH_RATIO = 0.35 +PHASE_LOCK_PERIOD_STEP = 1 +PHASE_LOCK_ORIGIN_STEP = 1 + + +# ============================================================================= +# 指标配置 +# ============================================================================= + +PREDICT_TARGETS = [ + { + "query": 'feed_rate{device_id="fanuc-cnc"}', + "pred_metric": "feed_rate_predicted", + "anomaly_metric": "feed_rate_anomaly", + "strategy": "phase_point", + "abs_threshold": 400.0, + "rel_threshold": 0.25, + "smooth_window": 1, + }, + { + "query": 'spindle_speed{device_id="fanuc-cnc"}', + "pred_metric": "spindle_speed_predicted", + "anomaly_metric": "spindle_speed_anomaly", + "strategy": "phase_point", + "abs_threshold": 500.0, + "rel_threshold": 0.25, + "smooth_window": 1, + }, + { + "query": 'spindle_current{device_id="fanuc-cnc"}', + "pred_metric": "spindle_current_predicted", + "anomaly_metric": "spindle_current_anomaly", + "strategy": "phase_point", + "abs_threshold": 5.0, + "rel_threshold": 0.25, + "smooth_window": 1, + }, + { + "query": 'vibration_x{device_id="fanuc-cnc"}', + "pred_metric": "vibration_x_predicted", + "anomaly_metric": "vibration_x_anomaly", + "strategy": "phase_band", + "abs_threshold": 0.12, + "rel_threshold": 0.35, + "smooth_window": 5, + "band_low_q": 10, + "band_high_q": 90, + "band_pad_abs": 0.06, + }, + { + "query": 'vibration_y{device_id="fanuc-cnc"}', + "pred_metric": "vibration_y_predicted", + "anomaly_metric": "vibration_y_anomaly", + "strategy": "phase_band", + "abs_threshold": 0.12, + "rel_threshold": 0.35, + "smooth_window": 5, + "band_low_q": 10, + "band_high_q": 90, + "band_pad_abs": 0.06, + }, + { + "query": 'vibration_z{device_id="fanuc-cnc"}', + "pred_metric": "vibration_z_predicted", + "anomaly_metric": "vibration_z_anomaly", + "strategy": "phase_band", + "abs_threshold": 0.12, + "rel_threshold": 0.35, + "smooth_window": 5, + "band_low_q": 10, + "band_high_q": 90, + "band_pad_abs": 0.06, + }, +] + +EXTRA_PREDICT_LABELS = { + "forecast": "phase_band_health_v11", + "source": "protoforge", +} + +BASELINE_STATUS_HEALTHY = "healthy" +BASELINE_STATUS_ANOMALY = "anomaly" +BASELINE_STATUS_RECOVERING = "recovering" + + +# ============================================================================= +# 状态结构 +# ============================================================================= + +@dataclass +class BaselineState: + period: int + phase_origin_ts: int + template: List[float] + lower_template: List[float] + upper_template: List[float] + strategy: str + status: str + clean_seconds: int + last_update_ts: int + last_seen_ts: int + y_min: float + y_max: float + + +BASELINE_STATES: Dict[str, BaselineState] = {} +LAST_REAL_TS_WRITTEN: Dict[str, int] = {} + + +# ============================================================================= +# VictoriaMetrics 读取 +# ============================================================================= + +def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]: + now = datetime.now() + start = now - timedelta(minutes=minutes) + + try: + resp = requests.get( + f"{VM_URL}/api/v1/query_range", + params={ + "query": query, + "start": start.timestamp(), + "end": now.timestamp(), + "step": QUERY_STEP, + }, + timeout=10, + ) + resp.raise_for_status() + except requests.RequestException as e: + logger.error("拉取数据失败 query=%s: %s", query, e) + return [], [] + + try: + result = resp.json().get("data", {}).get("result", []) + except Exception as e: + logger.error("解析 VM 返回失败 query=%s: %s", query, e) + return [], [] + + if not result: + return [], [] + + values = result[0].get("values", []) + + ts = [] + ys = [] + + for item in values: + if len(item) < 2: + continue + + try: + t = float(item[0]) + y = float(item[1]) + except Exception: + continue + + if not math.isfinite(t) or not math.isfinite(y): + continue + + ts.append(t) + ys.append(y) + + return ts, ys + + +def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]: + if not ts or not ys or len(ts) != len(ys): + return np.array([]), np.array([]) + + data = {} + + for t, y in zip(ts, ys): + try: + sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(sec) or not math.isfinite(val): + continue + + data[sec] = val + + if not data: + return np.array([]), np.array([]) + + sorted_items = sorted(data.items(), key=lambda x: x[0]) + + ts_clean = np.array([x[0] for x in sorted_items], dtype=float) + ys_clean = np.array([x[1] for x in sorted_items], dtype=float) + + if len(ts_clean) < 2: + return ts_clean, ys_clean + + start_sec = int(ts_clean[0]) + end_sec = int(ts_clean[-1]) + + if end_sec <= start_sec: + return ts_clean, ys_clean + + ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float) + ys_grid = np.interp(ts_grid, ts_clean, ys_clean) + + return ts_grid, ys_grid + + +# ============================================================================= +# 平滑与预处理 +# ============================================================================= + +def rolling_median(arr: np.ndarray, window: int) -> np.ndarray: + if window <= 1 or len(arr) < window: + return arr.astype(float) + + if window % 2 == 0: + window += 1 + + pad = window // 2 + padded = np.pad(arr.astype(float), (pad, pad), mode="edge") + + result = [] + + for i in range(len(arr)): + result.append(float(np.median(padded[i:i + window]))) + + return np.array(result, dtype=float) + + +def moving_average(arr: np.ndarray, window: int) -> np.ndarray: + if window <= 1 or len(arr) < window: + return arr.astype(float) + + if window % 2 == 0: + window += 1 + + kernel = np.ones(window, dtype=float) / window + pad = window // 2 + padded = np.pad(arr.astype(float), (pad, pad), mode="edge") + + return np.convolve(padded, kernel, mode="valid") + + +def preprocess_values(ys_grid: np.ndarray, target: Dict) -> np.ndarray: + strategy = target.get("strategy", "phase_point") + smooth_window = int(target.get("smooth_window", 1)) + + if strategy == "phase_band": + return rolling_median(ys_grid, smooth_window) + + if smooth_window > 1: + return moving_average(ys_grid, smooth_window) + + return ys_grid.astype(float) + + +# ============================================================================= +# 周期估计 +# ============================================================================= + +def estimate_period_by_fft(ys_arr: np.ndarray) -> float: + n = len(ys_arr) + + if n < 8: + return 60.0 + + centered = ys_arr - np.mean(ys_arr) + + if np.allclose(centered, 0): + return 60.0 + + fft_vals = np.fft.rfft(centered) + freqs = np.fft.rfftfreq(n, d=1.0) + + if len(freqs) <= 1: + return 60.0 + + power = np.abs(fft_vals[1:]) + + if len(power) == 0 or np.max(power) <= 0: + return 60.0 + + dominant_idx = int(np.argmax(power)) + 1 + dominant_freq = float(freqs[dominant_idx]) + + if dominant_freq <= 0: + return 60.0 + + period = 1.0 / dominant_freq + + return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float: + n = len(ys_arr) + + if n < 20: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + centered = ys_arr - np.mean(ys_arr) + + if np.allclose(centered, 0): + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + corr = np.correlate(centered, centered, mode="full")[n - 1:] + + p0 = int(round(init_period)) + left = max(int(MIN_PERIOD_SECONDS), int(max(2, p0 * 0.7))) + right = min(n // 2, int(max(left + 1, p0 * 1.3))) + + if right <= left: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + search = corr[left:right + 1] + + if len(search) == 0: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + best_lag = left + int(np.argmax(search)) + + return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def estimate_period_rough(ys_arr: np.ndarray) -> int: + p_fft = estimate_period_by_fft(ys_arr) + p_refined = refine_period_by_autocorr(ys_arr, p_fft) + + period = int(round(p_refined)) + period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period)) + + return int(period) + + +# ============================================================================= +# 谷底检测 +# ============================================================================= + +def find_valley_indices( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + expected_period: int, +) -> List[int]: + n = len(ys_grid) + + if n < max(10, expected_period * 2): + return [] + + period = max(3, int(expected_period)) + smooth_window = max(3, int(round(period * 0.08))) + smooth_window = min(smooth_window, 21) + + ys_smooth = moving_average(ys_grid, smooth_window) + threshold = float(np.percentile(ys_smooth, VALLEY_QUANTILE)) + + candidates = [] + + for i in range(1, n - 1): + if ( + ys_smooth[i] <= ys_smooth[i - 1] + and ys_smooth[i] < ys_smooth[i + 1] + and ys_smooth[i] <= threshold + ): + candidates.append(i) + + if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE: + candidates = [] + + for i in range(1, n - 1): + if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]: + candidates.append(i) + + if not candidates: + return [] + + min_distance = max(2, int(round(period * 0.55))) + selected = [] + + for idx in candidates: + if not selected: + selected.append(idx) + continue + + if idx - selected[-1] >= min_distance: + selected.append(idx) + continue + + if ys_smooth[idx] < ys_smooth[selected[-1]]: + selected[-1] = idx + + if len(selected) < 2: + return selected + + cleaned = [selected[0]] + + for idx in selected[1:]: + diff = int(ts_grid[idx] - ts_grid[cleaned[-1]]) + + if int(period * 0.55) <= diff <= int(period * 1.60): + cleaned.append(idx) + continue + + if diff < int(period * 0.55): + if ys_smooth[idx] < ys_smooth[cleaned[-1]]: + cleaned[-1] = idx + continue + + cleaned.append(idx) + + return cleaned + + +def detect_period_and_valleys( + ts_grid: np.ndarray, + ys_grid: np.ndarray, +) -> Tuple[int, List[int]]: + rough = estimate_period_rough(ys_grid) + valleys = find_valley_indices(ts_grid, ys_grid, rough) + + if len(valleys) >= 3: + diffs = np.diff(ts_grid[valleys]) + good = diffs[(diffs >= rough * 0.55) & (diffs <= rough * 1.60)] + + if len(good) > 0: + period = int(round(float(np.median(good)))) + else: + period = rough + else: + period = rough + + period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period)) + + return int(period), valleys + + +# ============================================================================= +# 模板构建 +# ============================================================================= + +def build_templates_from_valleys( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + period: int, + valleys: List[int], + target: Dict, +) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]: + if period <= 1 or len(valleys) < MIN_FULL_CYCLES_FOR_TEMPLATE + 1: + return None + + strategy = target.get("strategy", "phase_point") + low_q = float(target.get("band_low_q", 10)) + high_q = float(target.get("band_high_q", 90)) + + pairs = [] + + for a, b in zip(valleys[:-1], valleys[1:]): + cycle_len = float(ts_grid[b] - ts_grid[a]) + + if period * 0.55 <= cycle_len <= period * 1.60: + pairs.append((a, b, cycle_len)) + + if len(pairs) < MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + pairs = pairs[-MAX_CYCLES_FOR_TEMPLATE:] + + phase_grid = np.arange(period, dtype=float) + segments = [] + weights = [] + + for idx, (a, b, cycle_len) in enumerate(pairs): + seg_ts = ts_grid[a:b + 1] + seg_y = ys_grid[a:b + 1] + + if len(seg_y) < 3: + continue + + x_old = (seg_ts - seg_ts[0]) / cycle_len * period + seg = np.interp(phase_grid, x_old, seg_y) + + segments.append(seg.astype(float)) + weights.append(0.5 + 0.5 * ((idx + 1) / len(pairs))) + + if len(segments) < MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + arr = np.vstack(segments) + w_arr = np.array(weights, dtype=float) + + if strategy == "phase_band": + mid_template = np.percentile(arr, 50, axis=0) + lower_template = np.percentile(arr, low_q, axis=0) + upper_template = np.percentile(arr, high_q, axis=0) + else: + mid_template = np.average(arr, axis=0, weights=w_arr) + lower_template = mid_template.copy() + upper_template = mid_template.copy() + + return ( + mid_template.astype(float), + lower_template.astype(float), + upper_template.astype(float), + ) + + +def build_current_baseline( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + target: Dict, + tail_seconds: Optional[int] = None, +) -> Optional[Tuple[int, int, np.ndarray, np.ndarray, np.ndarray]]: + if len(ys_grid) < MIN_POINTS: + return None + + if tail_seconds is not None and tail_seconds > 0: + cutoff = ts_grid[-1] - int(tail_seconds) + mask = ts_grid >= cutoff + ts_use = ts_grid[mask] + ys_use = ys_grid[mask] + else: + ts_use = ts_grid + ys_use = ys_grid + + if len(ys_use) < MIN_POINTS: + return None + + period, valleys = detect_period_and_valleys(ts_use, ys_use) + + templates = build_templates_from_valleys( + ts_grid=ts_use, + ys_grid=ys_use, + period=period, + valleys=valleys, + target=target, + ) + + if templates is None or len(valleys) == 0: + return None + + template, lower_template, upper_template = templates + phase_origin_ts = int(round(float(ts_use[valleys[-1]]))) + + return int(period), phase_origin_ts, template, lower_template, upper_template + + +# ============================================================================= +# 模板预测 +# ============================================================================= + +def circular_template_value(template: np.ndarray, phase: float) -> float: + period = len(template) + + if period == 0: + return 0.0 + + phase = float(phase) % period + i0 = int(math.floor(phase)) % period + i1 = (i0 + 1) % period + frac = phase - math.floor(phase) + + return float((1.0 - frac) * template[i0] + frac * template[i1]) + + +def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: + old_period = len(old_template) + + if old_period == new_period: + return old_template.astype(float) + + if old_period <= 1 or new_period <= 1: + return np.full(new_period, float(np.mean(old_template)), dtype=float) + + old_x = np.linspace(0.0, 1.0, old_period, endpoint=False) + new_x = np.linspace(0.0, 1.0, new_period, endpoint=False) + + old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0]) + old_y_ext = np.concatenate([old_template, old_template, old_template]) + + return np.interp(new_x, old_x_ext, old_y_ext).astype(float) + + +def predict_template_values( + template: np.ndarray, + period: int, + phase_origin_ts: int, + ts_list: List[int], +) -> np.ndarray: + if period <= 1: + return np.zeros(len(ts_list), dtype=float) + + if len(template) != period: + template = resample_template(template, period) + + values = [] + + for ts in ts_list: + phase = (int(ts) - int(phase_origin_ts)) % period + values.append(circular_template_value(template, phase)) + + return np.array(values, dtype=float) + + +def predict_state_bundle( + state: BaselineState, + ts_list: List[int], +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + period = int(state.period) + origin = int(state.phase_origin_ts) + + mid = predict_template_values( + template=np.array(state.template, dtype=float), + period=period, + phase_origin_ts=origin, + ts_list=ts_list, + ) + + lower = predict_template_values( + template=np.array(state.lower_template, dtype=float), + period=period, + phase_origin_ts=origin, + ts_list=ts_list, + ) + + upper = predict_template_values( + template=np.array(state.upper_template, dtype=float), + period=period, + phase_origin_ts=origin, + ts_list=ts_list, + ) + + return mid, lower, upper + + +def normalize_origin_near(origin: int, period: int, near_ts: int) -> int: + if period <= 1: + return origin + + origin = int(origin) + period = int(period) + near_ts = int(near_ts) + + while origin + period <= near_ts: + origin += period + + while origin > near_ts: + origin -= period + + return origin + + +def merge_template( + old_template: np.ndarray, + new_template: np.ndarray, + alpha: float, +) -> np.ndarray: + alpha = float(np.clip(alpha, 0.0, 1.0)) + + if len(old_template) != len(new_template): + old_template = resample_template(old_template, len(new_template)) + + merged = (1.0 - alpha) * old_template + alpha * new_template + + return merged.astype(float) + + +# ============================================================================= +# Phase Lock +# ============================================================================= + +def phase_lock_recent( + state: BaselineState, + ts_grid: np.ndarray, + ys_model: np.ndarray, +) -> Tuple[int, int, np.ndarray, float]: + base_period = int(state.period) + base_origin = int(state.phase_origin_ts) + base_template = np.array(state.template, dtype=float) + + if base_period <= 1 or len(base_template) <= 1: + ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() + pred = predict_template_values(base_template, base_period, base_origin, ts_recent) + actual = ys_model[-len(ts_recent):].astype(float) + mae = float(np.mean(np.abs(actual - pred))) if len(actual) else 0.0 + return base_period, base_origin, pred, mae + + window_seconds = max( + PHASE_LOCK_MIN_WINDOW_SECONDS, + min(PHASE_LOCK_MAX_WINDOW_SECONDS, int(base_period * 2)), + ) + + cutoff = ts_grid[-1] - window_seconds + mask = ts_grid >= cutoff + + ts_recent_arr = ts_grid[mask].astype(int) + actual = ys_model[mask].astype(float) + + if len(ts_recent_arr) < max(10, DETECT_WINDOW_SECONDS): + ts_recent_arr = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int) + actual = ys_model[-DETECT_WINDOW_SECONDS:].astype(float) + + ts_recent = ts_recent_arr.tolist() + last_ts = int(ts_recent[-1]) + + p_min = max( + int(MIN_PERIOD_SECONDS), + int(round(base_period * (1.0 - PHASE_LOCK_PERIOD_SEARCH_RATIO))), + ) + p_max = min( + int(MAX_PERIOD_SECONDS), + int(round(base_period * (1.0 + PHASE_LOCK_PERIOD_SEARCH_RATIO))), + ) + + best_period = base_period + best_origin = normalize_origin_near(base_origin, base_period, last_ts) + best_template = resample_template(base_template, best_period) + + best_pred = predict_template_values( + template=best_template, + period=best_period, + phase_origin_ts=best_origin, + ts_list=ts_recent, + ) + + best_mae = float(np.mean(np.abs(actual - best_pred))) + + for period in range(p_min, p_max + 1, PHASE_LOCK_PERIOD_STEP): + template = resample_template(base_template, period) + center_origin = normalize_origin_near(base_origin, period, last_ts) + origin_shift = max(2, int(round(period * PHASE_LOCK_ORIGIN_SEARCH_RATIO))) + + for shift in range(-origin_shift, origin_shift + 1, PHASE_LOCK_ORIGIN_STEP): + origin = center_origin + shift + + pred = predict_template_values( + template=template, + period=period, + phase_origin_ts=origin, + ts_list=ts_recent, + ) + + mae = float(np.mean(np.abs(actual - pred))) + penalty = abs(period - base_period) * 0.5 + score = mae + penalty + + best_score = best_mae + abs(best_period - base_period) * 0.5 + + if score < best_score: + best_period = period + best_origin = origin + best_pred = pred + best_mae = mae + + best_origin = normalize_origin_near(best_origin, best_period, last_ts) + + return int(best_period), int(best_origin), best_pred, float(best_mae) + + +# ============================================================================= +# 异常检测 +# ============================================================================= + +def calc_point_bounds( + pred: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> Tuple[np.ndarray, np.ndarray]: + threshold = np.maximum(abs_threshold, np.abs(pred) * rel_threshold) + return pred - threshold, pred + threshold + + +def calc_final_bounds( + state: BaselineState, + pred: np.ndarray, + lower_raw: np.ndarray, + upper_raw: np.ndarray, + target: Dict, +) -> Tuple[np.ndarray, np.ndarray]: + strategy = target.get("strategy", "phase_point") + abs_threshold = float(target.get("abs_threshold", 1.0)) + rel_threshold = float(target.get("rel_threshold", 0.25)) + + if strategy == "phase_band": + pad_abs = float(target.get("band_pad_abs", abs_threshold)) + dynamic_pad = np.maximum(pad_abs, np.abs(pred) * rel_threshold * 0.20) + lower = lower_raw - dynamic_pad + upper = upper_raw + dynamic_pad + return lower, upper + + return calc_point_bounds(pred, abs_threshold, rel_threshold) + + +def detect_anomaly( + state: BaselineState, + ts_grid: np.ndarray, + ys_model: np.ndarray, + target: Dict, +) -> Tuple[bool, float, float, float, int, int]: + best_period, best_origin, pred_recent, _ = phase_lock_recent( + state=state, + ts_grid=ts_grid, + ys_model=ys_model, + ) + + recent_len = len(pred_recent) + + if recent_len <= 0: + return False, 0.0, 0.0, 0.0, best_period, best_origin + + actual = ys_model[-recent_len:].astype(float) + + tmp_state = BaselineState( + period=best_period, + phase_origin_ts=best_origin, + template=state.template, + lower_template=state.lower_template, + upper_template=state.upper_template, + strategy=state.strategy, + status=state.status, + clean_seconds=state.clean_seconds, + last_update_ts=state.last_update_ts, + last_seen_ts=state.last_seen_ts, + y_min=state.y_min, + y_max=state.y_max, + ) + + recent_ts = ts_grid[-recent_len:].astype(int).tolist() + pred, lower_raw, upper_raw = predict_state_bundle(tmp_state, recent_ts) + + lower, upper = calc_final_bounds( + state=tmp_state, + pred=pred, + lower_raw=lower_raw, + upper_raw=upper_raw, + target=target, + ) + + outside = (actual < lower) | (actual > upper) + abs_err = np.abs(actual - pred) + + outside_ratio = float(np.mean(outside)) + mean_abs_err = float(np.mean(abs_err)) + mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1e-6))) + + is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD + + return ( + is_anomaly, + outside_ratio, + mean_abs_err, + mean_rel_err, + int(best_period), + int(best_origin), + ) + + +# ============================================================================= +# 状态管理 +# ============================================================================= + +def create_initial_state( + ts_grid: np.ndarray, + ys_model: np.ndarray, + target: Dict, + now_sec: int, +) -> Optional[BaselineState]: + baseline = build_current_baseline( + ts_grid=ts_grid, + ys_grid=ys_model, + target=target, + ) + + if baseline is None: + return None + + period, phase_origin_ts, template, lower_template, upper_template = baseline + + return BaselineState( + period=int(period), + phase_origin_ts=int(phase_origin_ts), + template=template.astype(float).tolist(), + lower_template=lower_template.astype(float).tolist(), + upper_template=upper_template.astype(float).tolist(), + strategy=str(target.get("strategy", "phase_point")), + status=BASELINE_STATUS_HEALTHY, + clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE), + last_update_ts=now_sec, + last_seen_ts=now_sec, + y_min=float(np.min(ys_model)), + y_max=float(np.max(ys_model)), + ) + + +def apply_phase_lock_to_state( + state: BaselineState, + best_period: int, + best_origin: int, +) -> None: + best_period = int(best_period) + + if best_period <= 1: + return + + if len(state.template) != best_period: + state.template = resample_template( + np.array(state.template, dtype=float), + best_period, + ).astype(float).tolist() + + if len(state.lower_template) != best_period: + state.lower_template = resample_template( + np.array(state.lower_template, dtype=float), + best_period, + ).astype(float).tolist() + + if len(state.upper_template) != best_period: + state.upper_template = resample_template( + np.array(state.upper_template, dtype=float), + best_period, + ).astype(float).tolist() + + state.period = best_period + state.phase_origin_ts = int(best_origin) + + +def maybe_update_state( + key: str, + ts_grid: np.ndarray, + ys_model: np.ndarray, + target: Dict, +) -> Tuple[Optional[BaselineState], bool, float, float, float]: + now_sec = int(time.time()) + state = BASELINE_STATES.get(key) + + if state is None: + state = create_initial_state( + ts_grid=ts_grid, + ys_model=ys_model, + target=target, + now_sec=now_sec, + ) + + if state is None: + return None, False, 0.0, 0.0, 0.0 + + BASELINE_STATES[key] = state + + logger.info( + "初始化健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss", + key, + state.strategy, + state.period, + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), + state.clean_seconds, + ) + + return state, False, 0.0, 0.0, 0.0 + + elapsed = max(1, now_sec - int(state.last_seen_ts)) + elapsed = min(elapsed, POLL_INTERVAL * 2) + state.last_seen_ts = now_sec + + ( + is_anomaly, + outside_ratio, + mean_abs_err, + mean_rel_err, + best_period, + best_origin, + ) = detect_anomaly( + state=state, + ts_grid=ts_grid, + ys_model=ys_model, + target=target, + ) + + if is_anomaly: + state.status = BASELINE_STATUS_ANOMALY + state.clean_seconds = 0 + BASELINE_STATES[key] = state + + logger.warning( + "检测到异常,冻结模板 key=%s outside_ratio=%.2f mean_abs_err=%.4f mean_rel_err=%.4f", + key, + outside_ratio, + mean_abs_err, + mean_rel_err, + ) + + return state, True, outside_ratio, mean_abs_err, mean_rel_err + + old_period = int(state.period) + old_origin = int(state.phase_origin_ts) + + apply_phase_lock_to_state(state, best_period, best_origin) + + if old_period != state.period or old_origin != state.phase_origin_ts: + logger.info( + "phase-lock key=%s period %s -> %s origin %s -> %s", + key, + old_period, + state.period, + datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"), + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), + ) + + if state.status == BASELINE_STATUS_ANOMALY: + state.status = BASELINE_STATUS_RECOVERING + state.clean_seconds = elapsed + BASELINE_STATES[key] = state + + logger.info( + "异常开始恢复 key=%s clean_seconds=%ss", + key, + state.clean_seconds, + ) + + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + if state.status == BASELINE_STATUS_RECOVERING: + state.clean_seconds += elapsed + else: + state.status = BASELINE_STATUS_HEALTHY + state.clean_seconds += elapsed + + min_clean_for_update = max( + RECOVERY_MIN_SECONDS, + int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE, + ) + + if state.clean_seconds < min_clean_for_update: + BASELINE_STATES[key] = state + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + tail_seconds = min( + int(state.clean_seconds), + int(state.period) * MAX_CYCLES_FOR_TEMPLATE, + ) + + baseline = build_current_baseline( + ts_grid=ts_grid, + ys_grid=ys_model, + target=target, + tail_seconds=tail_seconds, + ) + + if baseline is None: + BASELINE_STATES[key] = state + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + new_period, new_origin, new_template, new_lower_template, new_upper_template = baseline + + alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA + + state.template = merge_template( + np.array(state.template, dtype=float), + new_template, + alpha, + ).astype(float).tolist() + + state.lower_template = merge_template( + np.array(state.lower_template, dtype=float), + new_lower_template, + alpha, + ).astype(float).tolist() + + state.upper_template = merge_template( + np.array(state.upper_template, dtype=float), + new_upper_template, + alpha, + ).astype(float).tolist() + + state.period = int(new_period) + state.phase_origin_ts = int(new_origin) + state.status = BASELINE_STATUS_HEALTHY + state.last_update_ts = now_sec + + if tail_seconds > 0 and len(ys_model) >= tail_seconds: + state.y_min = float(np.min(ys_model[-tail_seconds:])) + state.y_max = float(np.max(ys_model[-tail_seconds:])) + else: + state.y_min = float(np.min(ys_model)) + state.y_max = float(np.max(ys_model)) + + BASELINE_STATES[key] = state + + logger.info( + "更新健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss alpha=%.2f", + key, + state.strategy, + state.period, + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), + state.clean_seconds, + alpha, + ) + + return state, False, outside_ratio, mean_abs_err, mean_rel_err + + +# ============================================================================= +# Prometheus 写入 +# ============================================================================= + +def prom_escape_label_value(value: str) -> str: + return ( + str(value) + .replace("\\", "\\\\") + .replace("\n", "\\n") + .replace('"', '\\"') + ) + + +def labels_to_str(labels: Dict[str, str]) -> str: + if not labels: + return "" + + parts = [] + + for k in sorted(labels.keys()): + parts.append(f'{k}="{prom_escape_label_value(labels[k])}"') + + return "{" + ",".join(parts) + "}" + + +def write_series( + metric_name: str, + labels: Dict[str, str], + ts_list: List[int], + values: List[float], +) -> bool: + if not ts_list or not values or len(ts_list) != len(values): + return False + + label_str = labels_to_str(labels) + lines = [] + + for t, y in zip(ts_list, values): + try: + ts_sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(ts_sec) or not math.isfinite(val): + continue + + lines.append(f"{metric_name}{label_str} {val:.6f} {ts_sec * 1000}") + + if not lines: + return False + + payload = "\n".join(lines) + "\n" + + try: + resp = requests.post( + f"{VM_URL}/api/v1/import/prometheus", + data=payload.encode("utf-8"), + headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, + timeout=10, + ) + resp.raise_for_status() + return True + + except requests.RequestException as e: + logger.error("写入数据失败 metric=%s: %s", metric_name, e) + return False + + +def write_prediction_bundle( + pred_metric: str, + anomaly_metric: str, + labels: Dict[str, str], + ts_future: List[int], + pred_values: np.ndarray, + lower_values: np.ndarray, + upper_values: np.ndarray, + is_anomaly: bool, + outside_ratio: float, + mean_abs_err: float, + mean_rel_err: float, + event_ts: int, +) -> bool: + ok1 = write_series( + metric_name=pred_metric, + labels=labels, + ts_list=ts_future, + values=pred_values.astype(float).tolist(), + ) + + ok2 = write_series( + metric_name=f"{pred_metric}_lower", + labels=labels, + ts_list=ts_future, + values=lower_values.astype(float).tolist(), + ) + + ok3 = write_series( + metric_name=f"{pred_metric}_upper", + labels=labels, + ts_list=ts_future, + values=upper_values.astype(float).tolist(), + ) + + anomaly_labels = dict(labels) + anomaly_labels["type"] = "prediction_deviation" + + ok4 = write_series( + metric_name=anomaly_metric, + labels=anomaly_labels, + ts_list=[event_ts], + values=[1.0 if is_anomaly else 0.0], + ) + + ok5 = write_series( + metric_name=f"{anomaly_metric}_outside_ratio", + labels=anomaly_labels, + ts_list=[event_ts], + values=[outside_ratio], + ) + + ok6 = write_series( + metric_name=f"{anomaly_metric}_mean_abs_error", + labels=anomaly_labels, + ts_list=[event_ts], + values=[mean_abs_err], + ) + + ok7 = write_series( + metric_name=f"{anomaly_metric}_mean_rel_error", + labels=anomaly_labels, + ts_list=[event_ts], + values=[mean_rel_err], + ) + + return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7 + + +# ============================================================================= +# 标签解析 +# ============================================================================= + +_LABEL_PATTERN = re.compile( + r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*' +) + + +def parse_labels_from_query(query: str) -> Dict[str, str]: + labels = {} + + if "{" not in query or "}" not in query: + return labels + + try: + label_part = query[query.index("{") + 1:query.rindex("}")] + except Exception: + return labels + + for match in _LABEL_PATTERN.finditer(label_part): + key = match.group(1) + value = match.group(2) + + value = ( + value + .replace('\\"', '"') + .replace("\\n", "\n") + .replace("\\\\", "\\") + ) + + labels[key] = value + + return labels + + +def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]: + result = {} + + for d in dicts: + if d: + result.update(d) + + return result + + +def series_key(metric_name: str, labels: Dict[str, str]) -> str: + return metric_name + labels_to_str(labels) + + +# ============================================================================= +# 状态持久化 +# ============================================================================= + +def load_state() -> None: + global BASELINE_STATES + + if not os.path.exists(STATE_FILE): + return + + try: + with open(STATE_FILE, "r", encoding="utf-8") as f: + raw = json.load(f) + + states = {} + + for key, value in raw.get("baseline_states", {}).items(): + required_fields = { + "period", + "phase_origin_ts", + "template", + "lower_template", + "upper_template", + "strategy", + "status", + "clean_seconds", + "last_update_ts", + "last_seen_ts", + "y_min", + "y_max", + } + + if not required_fields.issubset(set(value.keys())): + continue + + states[key] = BaselineState(**value) + + BASELINE_STATES = states + + logger.info( + "已加载预测状态文件 %s,状态数量=%d", + STATE_FILE, + len(BASELINE_STATES), + ) + + except Exception as e: + logger.warning("加载预测状态文件失败,将重新学习: %s", e) + + +def save_state() -> None: + try: + raw = { + "baseline_states": { + key: asdict(value) + for key, value in BASELINE_STATES.items() + } + } + + tmp_file = STATE_FILE + ".tmp" + + with open(tmp_file, "w", encoding="utf-8") as f: + json.dump(raw, f, ensure_ascii=False, indent=2) + + os.replace(tmp_file, STATE_FILE) + + except Exception as e: + logger.warning("保存预测状态文件失败: %s", e) + + +# ============================================================================= +# 时间轴 +# ============================================================================= + +def build_prediction_timestamps( + key: str, + last_real_ts: int, + now_sec: int, +) -> Optional[List[int]]: + data_lag = now_sec - last_real_ts + + if data_lag > MAX_DATA_LAG_SECONDS: + logger.warning( + "真实数据延迟过大,跳过预测 key=%s data_lag=%ss max=%ss", + key, + data_lag, + MAX_DATA_LAG_SECONDS, + ) + return None + + last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key) + + if last_written_real_ts is not None and last_real_ts <= int(last_written_real_ts): + logger.info( + "真实数据时间戳未推进,跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s", + key, + last_real_ts, + last_written_real_ts, + ) + return None + + base_ts = last_real_ts + + return [ + base_ts + i + 1 + for i in range(WRITE_HORIZON_SECONDS) + ] + + +# ============================================================================= +# 主流程 +# ============================================================================= + +def run_once() -> None: + now_str = datetime.now().strftime("%H:%M:%S") + + for target in PREDICT_TARGETS: + query = target["query"] + pred_metric = target["pred_metric"] + anomaly_metric = target["anomaly_metric"] + + ts, ys = fetch_history(query) + + if len(ys) < MIN_POINTS: + logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) + continue + + ts_grid, ys_grid_raw = normalize_history(ts, ys) + + if len(ys_grid_raw) < MIN_POINTS: + logger.info("[%s] %s 清洗后数据不足(%d 点),跳过", now_str, query, len(ys_grid_raw)) + continue + + ys_grid_model = preprocess_values(ys_grid_raw, target) + + base_labels = parse_labels_from_query(query) + write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) + + key = series_key(pred_metric, write_labels) + + state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state( + key=key, + ts_grid=ts_grid, + ys_model=ys_grid_model, + target=target, + ) + + if state is None: + logger.info("[%s] %s 暂无可用健康模板,等待学习", now_str, query) + continue + + now_sec = int(time.time()) + last_real_ts = int(ts_grid[-1]) + data_lag = now_sec - last_real_ts + + ts_future = build_prediction_timestamps( + key=key, + last_real_ts=last_real_ts, + now_sec=now_sec, + ) + + if not ts_future: + continue + + pred_values, lower_raw, upper_raw = predict_state_bundle(state, ts_future) + + lower_values, upper_values = calc_final_bounds( + state=state, + pred=pred_values, + lower_raw=lower_raw, + upper_raw=upper_raw, + target=target, + ) + + ok = write_prediction_bundle( + pred_metric=pred_metric, + anomaly_metric=anomaly_metric, + labels=write_labels, + ts_future=ts_future, + pred_values=pred_values, + lower_values=lower_values, + upper_values=upper_values, + is_anomaly=is_anomaly, + outside_ratio=outside_ratio, + mean_abs_err=mean_abs_err, + mean_rel_err=mean_rel_err, + event_ts=last_real_ts, + ) + + if not ok: + logger.error("[%s] %s 写入预测数据失败", now_str, query) + continue + + LAST_REAL_TS_WRITTEN[key] = last_real_ts + + future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S") + future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") + last_real_str = datetime.fromtimestamp(last_real_ts).strftime("%H:%M:%S") + origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S") + + logger.info( + "[%s] %-40s → %-35s strategy=%s status=%s anomaly=%s period=%ss origin=%s last_real=%s lag=%ss 写入 %d 点,预测区间 %s ~ %s", + now_str, + query, + pred_metric, + state.strategy, + state.status, + is_anomaly, + state.period, + origin_str, + last_real_str, + data_lag, + len(ts_future), + future_start, + future_end, + ) + + save_state() + + +def main() -> None: + load_state() + + logger.info( + "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s", + VM_URL, + HISTORY_MINUTES, + HORIZON_SECONDS, + WRITE_HORIZON_SECONDS, + POLL_INTERVAL, + STATE_FILE, + EXTRA_PREDICT_LABELS["forecast"], + ) + + while True: + run_once() + time.sleep(POLL_INTERVAL) + + +if __name__ == "__main__": + main() \ No newline at end of file From 3609fbae4e2fbb33700de79ee1f7730ac81e366c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Thu, 21 May 2026 21:39:55 +0800 Subject: [PATCH 27/31] fix --- ai/pridict_v4.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ai/pridict_v4.py b/ai/pridict_v4.py index 774ad3a..c55f21a 100644 --- a/ai/pridict_v4.py +++ b/ai/pridict_v4.py @@ -120,11 +120,11 @@ "anomaly_metric": "vibration_x_anomaly", "strategy": "phase_band", "abs_threshold": 0.12, - "rel_threshold": 0.35, + "rel_threshold": 0.40, "smooth_window": 5, "band_low_q": 10, "band_high_q": 90, - "band_pad_abs": 0.06, + "band_pad_abs": 0.08, }, { "query": 'vibration_y{device_id="fanuc-cnc"}', @@ -132,11 +132,11 @@ "anomaly_metric": "vibration_y_anomaly", "strategy": "phase_band", "abs_threshold": 0.12, - "rel_threshold": 0.35, + "rel_threshold": 0.40, "smooth_window": 5, "band_low_q": 10, "band_high_q": 90, - "band_pad_abs": 0.06, + "band_pad_abs": 0.08, }, { "query": 'vibration_z{device_id="fanuc-cnc"}', @@ -144,11 +144,11 @@ "anomaly_metric": "vibration_z_anomaly", "strategy": "phase_band", "abs_threshold": 0.12, - "rel_threshold": 0.35, + "rel_threshold": 0.40, "smooth_window": 5, "band_low_q": 10, "band_high_q": 90, - "band_pad_abs": 0.06, + "band_pad_abs": 0.08, }, ] From 1c4217b31d9304cddeb6be16949cc54eaa8d31ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Thu, 21 May 2026 21:48:41 +0800 Subject: [PATCH 28/31] fix --- ai/pridict_v4.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ai/pridict_v4.py b/ai/pridict_v4.py index c55f21a..8657944 100644 --- a/ai/pridict_v4.py +++ b/ai/pridict_v4.py @@ -119,37 +119,37 @@ "pred_metric": "vibration_x_predicted", "anomaly_metric": "vibration_x_anomaly", "strategy": "phase_band", - "abs_threshold": 0.12, - "rel_threshold": 0.40, + "abs_threshold": 0.18, + "rel_threshold": 0.50, "smooth_window": 5, - "band_low_q": 10, - "band_high_q": 90, - "band_pad_abs": 0.08, + "band_low_q": 2, + "band_high_q": 98, + "band_pad_abs": 0.12, }, { "query": 'vibration_y{device_id="fanuc-cnc"}', "pred_metric": "vibration_y_predicted", "anomaly_metric": "vibration_y_anomaly", "strategy": "phase_band", - "abs_threshold": 0.12, - "rel_threshold": 0.40, + "abs_threshold": 0.18, + "rel_threshold": 0.50, "smooth_window": 5, - "band_low_q": 10, - "band_high_q": 90, - "band_pad_abs": 0.08, + "band_low_q": 2, + "band_high_q": 98, + "band_pad_abs": 0.12, }, { "query": 'vibration_z{device_id="fanuc-cnc"}', "pred_metric": "vibration_z_predicted", "anomaly_metric": "vibration_z_anomaly", "strategy": "phase_band", - "abs_threshold": 0.12, - "rel_threshold": 0.40, + "abs_threshold": 0.18, + "rel_threshold": 0.50, "smooth_window": 5, - "band_low_q": 10, - "band_high_q": 90, - "band_pad_abs": 0.08, - }, + "band_low_q": 2, + "band_high_q": 98, + "band_pad_abs": 0.12, + } ] EXTRA_PREDICT_LABELS = { From 4077e8f416d9d2f9d60df50d551fc817ddacd982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Fri, 22 May 2026 09:40:02 +0800 Subject: [PATCH 29/31] feat(predict_v5): add predict v5 --- ai/pridict_v5.py | 1794 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1794 insertions(+) create mode 100644 ai/pridict_v5.py diff --git a/ai/pridict_v5.py b/ai/pridict_v5.py new file mode 100644 index 0000000..6894a66 --- /dev/null +++ b/ai/pridict_v5.py @@ -0,0 +1,1794 @@ +# -*- coding: utf-8 -*- +""" +ProtoForge Predictor v12 + +核心能力: +1. feed_rate / spindle_speed / spindle_current 使用 phase-lock 点预测。 +2. vibration_x / vibration_y / vibration_z 使用 phase-band 预测带。 +3. vibration 类指标: + - predicted 使用平滑后的中位数模板,用于趋势参考。 + - upper/lower 使用原始波动分位数模板 + padding,用于正常波动容忍带。 + - 偶发越界不直接报警,只有持续越界 / 高比例越界 / 严重越界才报警。 +4. 预测起点锚定最后一个真实点 last_real_ts,避免时间错位。 +5. 异常期间冻结健康模板,不学习故障数据。 +6. 故障恢复后等待稳定,再恢复模板学习。 +7. 写入: + - xxx_predicted + - xxx_predicted_upper + - xxx_predicted_lower + - xxx_anomaly + - xxx_anomaly_outside_ratio + - xxx_anomaly_mean_abs_error + - xxx_anomaly_mean_rel_error + - xxx_anomaly_max_consecutive_outside + - xxx_anomaly_max_exceed_ratio +""" + +import json +import logging +import math +import os +import re +import time +from dataclasses import asdict, dataclass +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple + +import numpy as np +import requests + + +# ============================================================================= +# 日志配置 +# ============================================================================= + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# 基础配置 +# ============================================================================= + +VM_URL = "http://localhost:8428" +STATE_FILE = "/tmp/protoforge_predictor_state_v12.json" + +HISTORY_MINUTES = 30 +HORIZON_SECONDS = 120 +POLL_INTERVAL = 30 + +WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL) + +QUERY_STEP = "1s" +MIN_POINTS = 120 + +MIN_PERIOD_SECONDS = 5 +MAX_PERIOD_SECONDS = 3600 + +MIN_FULL_CYCLES_FOR_TEMPLATE = 3 +MAX_CYCLES_FOR_TEMPLATE = 8 + +DETECT_WINDOW_SECONDS = 30 +RECOVERY_MIN_SECONDS = 60 + +HEALTHY_EMA_ALPHA = 0.10 +RECOVERY_EMA_ALPHA = 0.25 + +OUTSIDE_RATIO_THRESHOLD = 0.60 +MIN_CONSECUTIVE_OUTSIDE = 5 +SEVERE_EXCEED_RATIO = 1.8 + +VALLEY_QUANTILE = 45 + +MAX_DATA_LAG_SECONDS = 180 + +PHASE_LOCK_MIN_WINDOW_SECONDS = 45 +PHASE_LOCK_MAX_WINDOW_SECONDS = 180 +PHASE_LOCK_PERIOD_SEARCH_RATIO = 0.12 +PHASE_LOCK_ORIGIN_SEARCH_RATIO = 0.35 +PHASE_LOCK_PERIOD_STEP = 1 +PHASE_LOCK_ORIGIN_STEP = 1 + + +# ============================================================================= +# 指标配置 +# ============================================================================= + +PREDICT_TARGETS = [ + { + "query": 'feed_rate{device_id="fanuc-cnc"}', + "pred_metric": "feed_rate_predicted", + "anomaly_metric": "feed_rate_anomaly", + "strategy": "phase_point", + "abs_threshold": 400.0, + "rel_threshold": 0.25, + "smooth_window": 1, + "outside_ratio_threshold": 0.60, + "min_consecutive_outside": 5, + "severe_exceed_ratio": 1.8, + }, + { + "query": 'spindle_speed{device_id="fanuc-cnc"}', + "pred_metric": "spindle_speed_predicted", + "anomaly_metric": "spindle_speed_anomaly", + "strategy": "phase_point", + "abs_threshold": 500.0, + "rel_threshold": 0.25, + "smooth_window": 1, + "outside_ratio_threshold": 0.60, + "min_consecutive_outside": 5, + "severe_exceed_ratio": 1.8, + }, + { + "query": 'spindle_current{device_id="fanuc-cnc"}', + "pred_metric": "spindle_current_predicted", + "anomaly_metric": "spindle_current_anomaly", + "strategy": "phase_point", + "abs_threshold": 5.0, + "rel_threshold": 0.25, + "smooth_window": 1, + "outside_ratio_threshold": 0.60, + "min_consecutive_outside": 5, + "severe_exceed_ratio": 1.8, + }, + { + "query": 'vibration_x{device_id="fanuc-cnc"}', + "pred_metric": "vibration_x_predicted", + "anomaly_metric": "vibration_x_anomaly", + "strategy": "phase_band", + + # vibration 类指标噪声、尖峰较多,不建议用很窄的阈值。 + "abs_threshold": 0.18, + "rel_threshold": 0.55, + + # 平滑只用于相位锁定和 predicted 中位趋势。 + "smooth_window": 5, + + # upper/lower 用原始值分位数,范围放宽,覆盖正常尖峰。 + "band_low_q": 1, + "band_high_q": 99, + "band_pad_abs": 0.15, + + # 偶发越界容忍。 + "outside_ratio_threshold": 0.70, + "min_consecutive_outside": 5, + "severe_exceed_ratio": 2.0, + }, + { + "query": 'vibration_y{device_id="fanuc-cnc"}', + "pred_metric": "vibration_y_predicted", + "anomaly_metric": "vibration_y_anomaly", + "strategy": "phase_band", + "abs_threshold": 0.18, + "rel_threshold": 0.55, + "smooth_window": 5, + "band_low_q": 1, + "band_high_q": 99, + "band_pad_abs": 0.15, + "outside_ratio_threshold": 0.70, + "min_consecutive_outside": 5, + "severe_exceed_ratio": 2.0, + }, + { + "query": 'vibration_z{device_id="fanuc-cnc"}', + "pred_metric": "vibration_z_predicted", + "anomaly_metric": "vibration_z_anomaly", + "strategy": "phase_band", + "abs_threshold": 0.18, + "rel_threshold": 0.55, + "smooth_window": 5, + "band_low_q": 1, + "band_high_q": 99, + "band_pad_abs": 0.15, + "outside_ratio_threshold": 0.70, + "min_consecutive_outside": 5, + "severe_exceed_ratio": 2.0, + }, +] + +EXTRA_PREDICT_LABELS = { + "forecast": "phase_band_health_v12", + "source": "protoforge", +} + +BASELINE_STATUS_HEALTHY = "healthy" +BASELINE_STATUS_ANOMALY = "anomaly" +BASELINE_STATUS_RECOVERING = "recovering" + + +# ============================================================================= +# 状态结构 +# ============================================================================= + +@dataclass +class BaselineState: + period: int + phase_origin_ts: int + template: List[float] + lower_template: List[float] + upper_template: List[float] + strategy: str + status: str + clean_seconds: int + last_update_ts: int + last_seen_ts: int + y_min: float + y_max: float + + +BASELINE_STATES: Dict[str, BaselineState] = {} +LAST_REAL_TS_WRITTEN: Dict[str, int] = {} + + +# ============================================================================= +# VictoriaMetrics 读取 +# ============================================================================= + +def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]: + now = datetime.now() + start = now - timedelta(minutes=minutes) + + try: + resp = requests.get( + f"{VM_URL}/api/v1/query_range", + params={ + "query": query, + "start": start.timestamp(), + "end": now.timestamp(), + "step": QUERY_STEP, + }, + timeout=10, + ) + resp.raise_for_status() + except requests.RequestException as e: + logger.error("拉取数据失败 query=%s: %s", query, e) + return [], [] + + try: + result = resp.json().get("data", {}).get("result", []) + except Exception as e: + logger.error("解析 VM 返回失败 query=%s: %s", query, e) + return [], [] + + if not result: + return [], [] + + values = result[0].get("values", []) + + ts = [] + ys = [] + + for item in values: + if len(item) < 2: + continue + + try: + t = float(item[0]) + y = float(item[1]) + except Exception: + continue + + if not math.isfinite(t) or not math.isfinite(y): + continue + + ts.append(t) + ys.append(y) + + return ts, ys + + +def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]: + if not ts or not ys or len(ts) != len(ys): + return np.array([]), np.array([]) + + data = {} + + for t, y in zip(ts, ys): + try: + sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(sec) or not math.isfinite(val): + continue + + data[sec] = val + + if not data: + return np.array([]), np.array([]) + + sorted_items = sorted(data.items(), key=lambda x: x[0]) + + ts_clean = np.array([x[0] for x in sorted_items], dtype=float) + ys_clean = np.array([x[1] for x in sorted_items], dtype=float) + + if len(ts_clean) < 2: + return ts_clean, ys_clean + + start_sec = int(ts_clean[0]) + end_sec = int(ts_clean[-1]) + + if end_sec <= start_sec: + return ts_clean, ys_clean + + ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float) + ys_grid = np.interp(ts_grid, ts_clean, ys_clean) + + return ts_grid, ys_grid + + +# ============================================================================= +# 平滑与预处理 +# ============================================================================= + +def rolling_median(arr: np.ndarray, window: int) -> np.ndarray: + if window <= 1 or len(arr) < window: + return arr.astype(float) + + if window % 2 == 0: + window += 1 + + pad = window // 2 + padded = np.pad(arr.astype(float), (pad, pad), mode="edge") + + result = [] + + for i in range(len(arr)): + result.append(float(np.median(padded[i:i + window]))) + + return np.array(result, dtype=float) + + +def moving_average(arr: np.ndarray, window: int) -> np.ndarray: + if window <= 1 or len(arr) < window: + return arr.astype(float) + + if window % 2 == 0: + window += 1 + + kernel = np.ones(window, dtype=float) / window + pad = window // 2 + padded = np.pad(arr.astype(float), (pad, pad), mode="edge") + + return np.convolve(padded, kernel, mode="valid") + + +def preprocess_values(ys_grid: np.ndarray, target: Dict) -> np.ndarray: + strategy = target.get("strategy", "phase_point") + smooth_window = int(target.get("smooth_window", 1)) + + if strategy == "phase_band": + return rolling_median(ys_grid, smooth_window) + + if smooth_window > 1: + return moving_average(ys_grid, smooth_window) + + return ys_grid.astype(float) + + +# ============================================================================= +# 周期估计 +# ============================================================================= + +def estimate_period_by_fft(ys_arr: np.ndarray) -> float: + n = len(ys_arr) + + if n < 8: + return 60.0 + + centered = ys_arr - np.mean(ys_arr) + + if np.allclose(centered, 0): + return 60.0 + + fft_vals = np.fft.rfft(centered) + freqs = np.fft.rfftfreq(n, d=1.0) + + if len(freqs) <= 1: + return 60.0 + + power = np.abs(fft_vals[1:]) + + if len(power) == 0 or np.max(power) <= 0: + return 60.0 + + dominant_idx = int(np.argmax(power)) + 1 + dominant_freq = float(freqs[dominant_idx]) + + if dominant_freq <= 0: + return 60.0 + + period = 1.0 / dominant_freq + + return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float: + n = len(ys_arr) + + if n < 20: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + centered = ys_arr - np.mean(ys_arr) + + if np.allclose(centered, 0): + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + corr = np.correlate(centered, centered, mode="full")[n - 1:] + + p0 = int(round(init_period)) + left = max(int(MIN_PERIOD_SECONDS), int(max(2, p0 * 0.7))) + right = min(n // 2, int(max(left + 1, p0 * 1.3))) + + if right <= left: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + search = corr[left:right + 1] + + if len(search) == 0: + return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + best_lag = left + int(np.argmax(search)) + + return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS)) + + +def estimate_period_rough(ys_arr: np.ndarray) -> int: + p_fft = estimate_period_by_fft(ys_arr) + p_refined = refine_period_by_autocorr(ys_arr, p_fft) + + period = int(round(p_refined)) + period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period)) + + return int(period) + + +# ============================================================================= +# 谷底检测 +# ============================================================================= + +def find_valley_indices( + ts_grid: np.ndarray, + ys_grid: np.ndarray, + expected_period: int, +) -> List[int]: + n = len(ys_grid) + + if n < max(10, expected_period * 2): + return [] + + period = max(3, int(expected_period)) + smooth_window = max(3, int(round(period * 0.08))) + smooth_window = min(smooth_window, 21) + + ys_smooth = moving_average(ys_grid, smooth_window) + threshold = float(np.percentile(ys_smooth, VALLEY_QUANTILE)) + + candidates = [] + + for i in range(1, n - 1): + if ( + ys_smooth[i] <= ys_smooth[i - 1] + and ys_smooth[i] < ys_smooth[i + 1] + and ys_smooth[i] <= threshold + ): + candidates.append(i) + + if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE: + candidates = [] + + for i in range(1, n - 1): + if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]: + candidates.append(i) + + if not candidates: + return [] + + min_distance = max(2, int(round(period * 0.55))) + selected = [] + + for idx in candidates: + if not selected: + selected.append(idx) + continue + + if idx - selected[-1] >= min_distance: + selected.append(idx) + continue + + if ys_smooth[idx] < ys_smooth[selected[-1]]: + selected[-1] = idx + + if len(selected) < 2: + return selected + + cleaned = [selected[0]] + + for idx in selected[1:]: + diff = int(ts_grid[idx] - ts_grid[cleaned[-1]]) + + if int(period * 0.55) <= diff <= int(period * 1.60): + cleaned.append(idx) + continue + + if diff < int(period * 0.55): + if ys_smooth[idx] < ys_smooth[cleaned[-1]]: + cleaned[-1] = idx + continue + + cleaned.append(idx) + + return cleaned + + +def detect_period_and_valleys( + ts_grid: np.ndarray, + ys_grid: np.ndarray, +) -> Tuple[int, List[int]]: + rough = estimate_period_rough(ys_grid) + valleys = find_valley_indices(ts_grid, ys_grid, rough) + + if len(valleys) >= 3: + diffs = np.diff(ts_grid[valleys]) + good = diffs[(diffs >= rough * 0.55) & (diffs <= rough * 1.60)] + + if len(good) > 0: + period = int(round(float(np.median(good)))) + else: + period = rough + else: + period = rough + + period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period)) + + return int(period), valleys + + +# ============================================================================= +# 模板构建 +# ============================================================================= + +def build_templates_from_valleys( + ts_grid: np.ndarray, + ys_mid_grid: np.ndarray, + ys_band_grid: np.ndarray, + period: int, + valleys: List[int], + target: Dict, +) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]: + if period <= 1 or len(valleys) < MIN_FULL_CYCLES_FOR_TEMPLATE + 1: + return None + + strategy = target.get("strategy", "phase_point") + low_q = float(target.get("band_low_q", 10)) + high_q = float(target.get("band_high_q", 90)) + + pairs = [] + + for a, b in zip(valleys[:-1], valleys[1:]): + cycle_len = float(ts_grid[b] - ts_grid[a]) + + if period * 0.55 <= cycle_len <= period * 1.60: + pairs.append((a, b, cycle_len)) + + if len(pairs) < MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + pairs = pairs[-MAX_CYCLES_FOR_TEMPLATE:] + + phase_grid = np.arange(period, dtype=float) + mid_segments = [] + band_segments = [] + weights = [] + + for idx, (a, b, cycle_len) in enumerate(pairs): + seg_ts = ts_grid[a:b + 1] + seg_mid_y = ys_mid_grid[a:b + 1] + seg_band_y = ys_band_grid[a:b + 1] + + if len(seg_mid_y) < 3 or len(seg_band_y) < 3: + continue + + x_old = (seg_ts - seg_ts[0]) / cycle_len * period + + mid_seg = np.interp(phase_grid, x_old, seg_mid_y) + band_seg = np.interp(phase_grid, x_old, seg_band_y) + + mid_segments.append(mid_seg.astype(float)) + band_segments.append(band_seg.astype(float)) + weights.append(0.5 + 0.5 * ((idx + 1) / len(pairs))) + + if len(mid_segments) < MIN_FULL_CYCLES_FOR_TEMPLATE: + return None + + mid_arr = np.vstack(mid_segments) + band_arr = np.vstack(band_segments) + w_arr = np.array(weights, dtype=float) + + if strategy == "phase_band": + mid_template = np.percentile(mid_arr, 50, axis=0) + + # upper/lower 使用原始值分布,而不是平滑值分布。 + lower_template = np.percentile(band_arr, low_q, axis=0) + upper_template = np.percentile(band_arr, high_q, axis=0) + else: + mid_template = np.average(mid_arr, axis=0, weights=w_arr) + lower_template = mid_template.copy() + upper_template = mid_template.copy() + + return ( + mid_template.astype(float), + lower_template.astype(float), + upper_template.astype(float), + ) + + +def build_current_baseline( + ts_grid: np.ndarray, + ys_mid_grid: np.ndarray, + ys_band_grid: np.ndarray, + target: Dict, + tail_seconds: Optional[int] = None, +) -> Optional[Tuple[int, int, np.ndarray, np.ndarray, np.ndarray]]: + if len(ys_mid_grid) < MIN_POINTS or len(ys_band_grid) < MIN_POINTS: + return None + + if tail_seconds is not None and tail_seconds > 0: + cutoff = ts_grid[-1] - int(tail_seconds) + mask = ts_grid >= cutoff + ts_use = ts_grid[mask] + ys_mid_use = ys_mid_grid[mask] + ys_band_use = ys_band_grid[mask] + else: + ts_use = ts_grid + ys_mid_use = ys_mid_grid + ys_band_use = ys_band_grid + + if len(ys_mid_use) < MIN_POINTS or len(ys_band_use) < MIN_POINTS: + return None + + period, valleys = detect_period_and_valleys(ts_use, ys_mid_use) + + templates = build_templates_from_valleys( + ts_grid=ts_use, + ys_mid_grid=ys_mid_use, + ys_band_grid=ys_band_use, + period=period, + valleys=valleys, + target=target, + ) + + if templates is None or len(valleys) == 0: + return None + + template, lower_template, upper_template = templates + phase_origin_ts = int(round(float(ts_use[valleys[-1]]))) + + return int(period), phase_origin_ts, template, lower_template, upper_template + + +# ============================================================================= +# 模板预测 +# ============================================================================= + +def circular_template_value(template: np.ndarray, phase: float) -> float: + period = len(template) + + if period == 0: + return 0.0 + + phase = float(phase) % period + i0 = int(math.floor(phase)) % period + i1 = (i0 + 1) % period + frac = phase - math.floor(phase) + + return float((1.0 - frac) * template[i0] + frac * template[i1]) + + +def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray: + old_period = len(old_template) + + if old_period == new_period: + return old_template.astype(float) + + if old_period <= 1 or new_period <= 1: + return np.full(new_period, float(np.mean(old_template)), dtype=float) + + old_x = np.linspace(0.0, 1.0, old_period, endpoint=False) + new_x = np.linspace(0.0, 1.0, new_period, endpoint=False) + + old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0]) + old_y_ext = np.concatenate([old_template, old_template, old_template]) + + return np.interp(new_x, old_x_ext, old_y_ext).astype(float) + + +def predict_template_values( + template: np.ndarray, + period: int, + phase_origin_ts: int, + ts_list: List[int], +) -> np.ndarray: + if period <= 1: + return np.zeros(len(ts_list), dtype=float) + + if len(template) != period: + template = resample_template(template, period) + + values = [] + + for ts in ts_list: + phase = (int(ts) - int(phase_origin_ts)) % period + values.append(circular_template_value(template, phase)) + + return np.array(values, dtype=float) + + +def predict_state_bundle( + state: BaselineState, + ts_list: List[int], +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + period = int(state.period) + origin = int(state.phase_origin_ts) + + mid = predict_template_values( + template=np.array(state.template, dtype=float), + period=period, + phase_origin_ts=origin, + ts_list=ts_list, + ) + + lower = predict_template_values( + template=np.array(state.lower_template, dtype=float), + period=period, + phase_origin_ts=origin, + ts_list=ts_list, + ) + + upper = predict_template_values( + template=np.array(state.upper_template, dtype=float), + period=period, + phase_origin_ts=origin, + ts_list=ts_list, + ) + + return mid, lower, upper + + +def normalize_origin_near(origin: int, period: int, near_ts: int) -> int: + if period <= 1: + return origin + + origin = int(origin) + period = int(period) + near_ts = int(near_ts) + + while origin + period <= near_ts: + origin += period + + while origin > near_ts: + origin -= period + + return origin + + +def merge_template( + old_template: np.ndarray, + new_template: np.ndarray, + alpha: float, +) -> np.ndarray: + alpha = float(np.clip(alpha, 0.0, 1.0)) + + if len(old_template) != len(new_template): + old_template = resample_template(old_template, len(new_template)) + + merged = (1.0 - alpha) * old_template + alpha * new_template + + return merged.astype(float) + + +# ============================================================================= +# Phase Lock +# ============================================================================= + +def phase_lock_recent( + state: BaselineState, + ts_grid: np.ndarray, + ys_model: np.ndarray, +) -> Tuple[int, int, np.ndarray, float]: + base_period = int(state.period) + base_origin = int(state.phase_origin_ts) + base_template = np.array(state.template, dtype=float) + + if base_period <= 1 or len(base_template) <= 1: + ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist() + pred = predict_template_values(base_template, base_period, base_origin, ts_recent) + actual = ys_model[-len(ts_recent):].astype(float) + mae = float(np.mean(np.abs(actual - pred))) if len(actual) else 0.0 + return base_period, base_origin, pred, mae + + window_seconds = max( + PHASE_LOCK_MIN_WINDOW_SECONDS, + min(PHASE_LOCK_MAX_WINDOW_SECONDS, int(base_period * 2)), + ) + + cutoff = ts_grid[-1] - window_seconds + mask = ts_grid >= cutoff + + ts_recent_arr = ts_grid[mask].astype(int) + actual = ys_model[mask].astype(float) + + if len(ts_recent_arr) < max(10, DETECT_WINDOW_SECONDS): + ts_recent_arr = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int) + actual = ys_model[-DETECT_WINDOW_SECONDS:].astype(float) + + ts_recent = ts_recent_arr.tolist() + last_ts = int(ts_recent[-1]) + + p_min = max( + int(MIN_PERIOD_SECONDS), + int(round(base_period * (1.0 - PHASE_LOCK_PERIOD_SEARCH_RATIO))), + ) + p_max = min( + int(MAX_PERIOD_SECONDS), + int(round(base_period * (1.0 + PHASE_LOCK_PERIOD_SEARCH_RATIO))), + ) + + best_period = base_period + best_origin = normalize_origin_near(base_origin, base_period, last_ts) + best_template = resample_template(base_template, best_period) + + best_pred = predict_template_values( + template=best_template, + period=best_period, + phase_origin_ts=best_origin, + ts_list=ts_recent, + ) + + best_mae = float(np.mean(np.abs(actual - best_pred))) + + for period in range(p_min, p_max + 1, PHASE_LOCK_PERIOD_STEP): + template = resample_template(base_template, period) + center_origin = normalize_origin_near(base_origin, period, last_ts) + origin_shift = max(2, int(round(period * PHASE_LOCK_ORIGIN_SEARCH_RATIO))) + + for shift in range(-origin_shift, origin_shift + 1, PHASE_LOCK_ORIGIN_STEP): + origin = center_origin + shift + + pred = predict_template_values( + template=template, + period=period, + phase_origin_ts=origin, + ts_list=ts_recent, + ) + + mae = float(np.mean(np.abs(actual - pred))) + penalty = abs(period - base_period) * 0.5 + score = mae + penalty + + best_score = best_mae + abs(best_period - base_period) * 0.5 + + if score < best_score: + best_period = period + best_origin = origin + best_pred = pred + best_mae = mae + + best_origin = normalize_origin_near(best_origin, best_period, last_ts) + + return int(best_period), int(best_origin), best_pred, float(best_mae) + + +# ============================================================================= +# 异常检测 +# ============================================================================= + +def max_consecutive_true(flags: np.ndarray) -> int: + max_count = 0 + current = 0 + + for flag in flags: + if bool(flag): + current += 1 + max_count = max(max_count, current) + else: + current = 0 + + return int(max_count) + + +def calc_point_bounds( + pred: np.ndarray, + abs_threshold: float, + rel_threshold: float, +) -> Tuple[np.ndarray, np.ndarray]: + threshold = np.maximum(abs_threshold, np.abs(pred) * rel_threshold) + return pred - threshold, pred + threshold + + +def calc_final_bounds( + state: BaselineState, + pred: np.ndarray, + lower_raw: np.ndarray, + upper_raw: np.ndarray, + target: Dict, +) -> Tuple[np.ndarray, np.ndarray]: + strategy = target.get("strategy", "phase_point") + abs_threshold = float(target.get("abs_threshold", 1.0)) + rel_threshold = float(target.get("rel_threshold", 0.25)) + + if strategy == "phase_band": + pad_abs = float(target.get("band_pad_abs", abs_threshold)) + + # 对 vibration 类指标:边界更像正常波动容忍带,不是硬边界。 + dynamic_pad = np.maximum( + pad_abs, + np.abs(pred) * rel_threshold * 0.25, + ) + + lower = lower_raw - dynamic_pad + upper = upper_raw + dynamic_pad + + return lower, upper + + return calc_point_bounds(pred, abs_threshold, rel_threshold) + + +def detect_anomaly( + state: BaselineState, + ts_grid: np.ndarray, + ys_model: np.ndarray, + ys_actual: np.ndarray, + target: Dict, +) -> Tuple[bool, float, float, float, int, int, int, float]: + best_period, best_origin, pred_recent, _ = phase_lock_recent( + state=state, + ts_grid=ts_grid, + ys_model=ys_model, + ) + + recent_len = len(pred_recent) + + if recent_len <= 0: + return False, 0.0, 0.0, 0.0, best_period, best_origin, 0, 0.0 + + if target.get("strategy", "phase_point") == "phase_band": + actual = ys_actual[-recent_len:].astype(float) + else: + actual = ys_model[-recent_len:].astype(float) + + tmp_state = BaselineState( + period=best_period, + phase_origin_ts=best_origin, + template=state.template, + lower_template=state.lower_template, + upper_template=state.upper_template, + strategy=state.strategy, + status=state.status, + clean_seconds=state.clean_seconds, + last_update_ts=state.last_update_ts, + last_seen_ts=state.last_seen_ts, + y_min=state.y_min, + y_max=state.y_max, + ) + + recent_ts = ts_grid[-recent_len:].astype(int).tolist() + pred, lower_raw, upper_raw = predict_state_bundle(tmp_state, recent_ts) + + lower, upper = calc_final_bounds( + state=tmp_state, + pred=pred, + lower_raw=lower_raw, + upper_raw=upper_raw, + target=target, + ) + + above_upper = actual - upper + below_lower = lower - actual + + exceed = np.maximum(above_upper, below_lower) + exceed = np.maximum(exceed, 0.0) + + outside = exceed > 0 + + band_width = np.maximum(upper - lower, 1e-6) + exceed_ratio = exceed / band_width + + abs_err = np.abs(actual - pred) + + outside_ratio = float(np.mean(outside)) + mean_abs_err = float(np.mean(abs_err)) + mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1e-6))) + + max_outside_seconds = max_consecutive_true(outside) + max_exceed_ratio = float(np.max(exceed_ratio)) if len(exceed_ratio) > 0 else 0.0 + + outside_ratio_threshold = float( + target.get("outside_ratio_threshold", OUTSIDE_RATIO_THRESHOLD) + ) + min_consecutive_outside = int( + target.get("min_consecutive_outside", MIN_CONSECUTIVE_OUTSIDE) + ) + severe_exceed_ratio = float( + target.get("severe_exceed_ratio", SEVERE_EXCEED_RATIO) + ) + + # 核心优化: + # 1. 偶发 1~3 个点越界不报警。 + # 2. 持续越界才报警。 + # 3. 高比例越界才报警。 + # 4. 严重越界才立即报警。 + is_anomaly = ( + outside_ratio >= outside_ratio_threshold + or max_outside_seconds >= min_consecutive_outside + or max_exceed_ratio >= severe_exceed_ratio + ) + + return ( + is_anomaly, + outside_ratio, + mean_abs_err, + mean_rel_err, + int(best_period), + int(best_origin), + int(max_outside_seconds), + float(max_exceed_ratio), + ) + + +# ============================================================================= +# 状态管理 +# ============================================================================= + +def create_initial_state( + ts_grid: np.ndarray, + ys_model: np.ndarray, + ys_actual: np.ndarray, + target: Dict, + now_sec: int, +) -> Optional[BaselineState]: + baseline = build_current_baseline( + ts_grid=ts_grid, + ys_mid_grid=ys_model, + ys_band_grid=ys_actual, + target=target, + ) + + if baseline is None: + return None + + period, phase_origin_ts, template, lower_template, upper_template = baseline + + return BaselineState( + period=int(period), + phase_origin_ts=int(phase_origin_ts), + template=template.astype(float).tolist(), + lower_template=lower_template.astype(float).tolist(), + upper_template=upper_template.astype(float).tolist(), + strategy=str(target.get("strategy", "phase_point")), + status=BASELINE_STATUS_HEALTHY, + clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE), + last_update_ts=now_sec, + last_seen_ts=now_sec, + y_min=float(np.min(ys_actual)), + y_max=float(np.max(ys_actual)), + ) + + +def apply_phase_lock_to_state( + state: BaselineState, + best_period: int, + best_origin: int, +) -> None: + best_period = int(best_period) + + if best_period <= 1: + return + + if len(state.template) != best_period: + state.template = resample_template( + np.array(state.template, dtype=float), + best_period, + ).astype(float).tolist() + + if len(state.lower_template) != best_period: + state.lower_template = resample_template( + np.array(state.lower_template, dtype=float), + best_period, + ).astype(float).tolist() + + if len(state.upper_template) != best_period: + state.upper_template = resample_template( + np.array(state.upper_template, dtype=float), + best_period, + ).astype(float).tolist() + + state.period = best_period + state.phase_origin_ts = int(best_origin) + + +def maybe_update_state( + key: str, + ts_grid: np.ndarray, + ys_model: np.ndarray, + ys_actual: np.ndarray, + target: Dict, +) -> Tuple[Optional[BaselineState], bool, float, float, float, int, float]: + now_sec = int(time.time()) + state = BASELINE_STATES.get(key) + + if state is None: + state = create_initial_state( + ts_grid=ts_grid, + ys_model=ys_model, + ys_actual=ys_actual, + target=target, + now_sec=now_sec, + ) + + if state is None: + return None, False, 0.0, 0.0, 0.0, 0, 0.0 + + BASELINE_STATES[key] = state + + logger.info( + "初始化健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss", + key, + state.strategy, + state.period, + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), + state.clean_seconds, + ) + + return state, False, 0.0, 0.0, 0.0, 0, 0.0 + + elapsed = max(1, now_sec - int(state.last_seen_ts)) + elapsed = min(elapsed, POLL_INTERVAL * 2) + state.last_seen_ts = now_sec + + ( + is_anomaly, + outside_ratio, + mean_abs_err, + mean_rel_err, + best_period, + best_origin, + max_outside_seconds, + max_exceed_ratio, + ) = detect_anomaly( + state=state, + ts_grid=ts_grid, + ys_model=ys_model, + ys_actual=ys_actual, + target=target, + ) + + if is_anomaly: + state.status = BASELINE_STATUS_ANOMALY + state.clean_seconds = 0 + BASELINE_STATES[key] = state + + logger.warning( + "检测到异常,冻结模板 key=%s outside_ratio=%.2f max_outside=%ss max_exceed_ratio=%.2f mean_abs_err=%.4f mean_rel_err=%.4f", + key, + outside_ratio, + max_outside_seconds, + max_exceed_ratio, + mean_abs_err, + mean_rel_err, + ) + + return ( + state, + True, + outside_ratio, + mean_abs_err, + mean_rel_err, + max_outside_seconds, + max_exceed_ratio, + ) + + old_period = int(state.period) + old_origin = int(state.phase_origin_ts) + + apply_phase_lock_to_state(state, best_period, best_origin) + + if old_period != state.period or old_origin != state.phase_origin_ts: + logger.info( + "phase-lock key=%s period %s -> %s origin %s -> %s", + key, + old_period, + state.period, + datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"), + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), + ) + + if state.status == BASELINE_STATUS_ANOMALY: + state.status = BASELINE_STATUS_RECOVERING + state.clean_seconds = elapsed + BASELINE_STATES[key] = state + + logger.info( + "异常开始恢复 key=%s clean_seconds=%ss", + key, + state.clean_seconds, + ) + + return ( + state, + False, + outside_ratio, + mean_abs_err, + mean_rel_err, + max_outside_seconds, + max_exceed_ratio, + ) + + if state.status == BASELINE_STATUS_RECOVERING: + state.clean_seconds += elapsed + else: + state.status = BASELINE_STATUS_HEALTHY + state.clean_seconds += elapsed + + min_clean_for_update = max( + RECOVERY_MIN_SECONDS, + int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE, + ) + + if state.clean_seconds < min_clean_for_update: + BASELINE_STATES[key] = state + return ( + state, + False, + outside_ratio, + mean_abs_err, + mean_rel_err, + max_outside_seconds, + max_exceed_ratio, + ) + + tail_seconds = min( + int(state.clean_seconds), + int(state.period) * MAX_CYCLES_FOR_TEMPLATE, + ) + + baseline = build_current_baseline( + ts_grid=ts_grid, + ys_mid_grid=ys_model, + ys_band_grid=ys_actual, + target=target, + tail_seconds=tail_seconds, + ) + + if baseline is None: + BASELINE_STATES[key] = state + return ( + state, + False, + outside_ratio, + mean_abs_err, + mean_rel_err, + max_outside_seconds, + max_exceed_ratio, + ) + + new_period, new_origin, new_template, new_lower_template, new_upper_template = baseline + + alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA + + state.template = merge_template( + np.array(state.template, dtype=float), + new_template, + alpha, + ).astype(float).tolist() + + state.lower_template = merge_template( + np.array(state.lower_template, dtype=float), + new_lower_template, + alpha, + ).astype(float).tolist() + + state.upper_template = merge_template( + np.array(state.upper_template, dtype=float), + new_upper_template, + alpha, + ).astype(float).tolist() + + state.period = int(new_period) + state.phase_origin_ts = int(new_origin) + state.status = BASELINE_STATUS_HEALTHY + state.last_update_ts = now_sec + + if tail_seconds > 0 and len(ys_actual) >= tail_seconds: + state.y_min = float(np.min(ys_actual[-tail_seconds:])) + state.y_max = float(np.max(ys_actual[-tail_seconds:])) + else: + state.y_min = float(np.min(ys_actual)) + state.y_max = float(np.max(ys_actual)) + + BASELINE_STATES[key] = state + + logger.info( + "更新健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss alpha=%.2f", + key, + state.strategy, + state.period, + datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"), + state.clean_seconds, + alpha, + ) + + return ( + state, + False, + outside_ratio, + mean_abs_err, + mean_rel_err, + max_outside_seconds, + max_exceed_ratio, + ) + + +# ============================================================================= +# Prometheus 写入 +# ============================================================================= + +def prom_escape_label_value(value: str) -> str: + return ( + str(value) + .replace("\\", "\\\\") + .replace("\n", "\\n") + .replace('"', '\\"') + ) + + +def labels_to_str(labels: Dict[str, str]) -> str: + if not labels: + return "" + + parts = [] + + for k in sorted(labels.keys()): + parts.append(f'{k}="{prom_escape_label_value(labels[k])}"') + + return "{" + ",".join(parts) + "}" + + +def write_series( + metric_name: str, + labels: Dict[str, str], + ts_list: List[int], + values: List[float], +) -> bool: + if not ts_list or not values or len(ts_list) != len(values): + return False + + label_str = labels_to_str(labels) + lines = [] + + for t, y in zip(ts_list, values): + try: + ts_sec = int(round(float(t))) + val = float(y) + except Exception: + continue + + if not math.isfinite(ts_sec) or not math.isfinite(val): + continue + + lines.append(f"{metric_name}{label_str} {val:.6f} {ts_sec * 1000}") + + if not lines: + return False + + payload = "\n".join(lines) + "\n" + + try: + resp = requests.post( + f"{VM_URL}/api/v1/import/prometheus", + data=payload.encode("utf-8"), + headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, + timeout=10, + ) + resp.raise_for_status() + return True + + except requests.RequestException as e: + logger.error("写入数据失败 metric=%s: %s", metric_name, e) + return False + + +def write_prediction_bundle( + pred_metric: str, + anomaly_metric: str, + labels: Dict[str, str], + ts_future: List[int], + pred_values: np.ndarray, + lower_values: np.ndarray, + upper_values: np.ndarray, + is_anomaly: bool, + outside_ratio: float, + mean_abs_err: float, + mean_rel_err: float, + max_outside_seconds: int, + max_exceed_ratio: float, + event_ts: int, +) -> bool: + ok1 = write_series( + metric_name=pred_metric, + labels=labels, + ts_list=ts_future, + values=pred_values.astype(float).tolist(), + ) + + ok2 = write_series( + metric_name=f"{pred_metric}_lower", + labels=labels, + ts_list=ts_future, + values=lower_values.astype(float).tolist(), + ) + + ok3 = write_series( + metric_name=f"{pred_metric}_upper", + labels=labels, + ts_list=ts_future, + values=upper_values.astype(float).tolist(), + ) + + anomaly_labels = dict(labels) + anomaly_labels["type"] = "prediction_deviation" + + ok4 = write_series( + metric_name=anomaly_metric, + labels=anomaly_labels, + ts_list=[event_ts], + values=[1.0 if is_anomaly else 0.0], + ) + + ok5 = write_series( + metric_name=f"{anomaly_metric}_outside_ratio", + labels=anomaly_labels, + ts_list=[event_ts], + values=[outside_ratio], + ) + + ok6 = write_series( + metric_name=f"{anomaly_metric}_mean_abs_error", + labels=anomaly_labels, + ts_list=[event_ts], + values=[mean_abs_err], + ) + + ok7 = write_series( + metric_name=f"{anomaly_metric}_mean_rel_error", + labels=anomaly_labels, + ts_list=[event_ts], + values=[mean_rel_err], + ) + + ok8 = write_series( + metric_name=f"{anomaly_metric}_max_consecutive_outside", + labels=anomaly_labels, + ts_list=[event_ts], + values=[float(max_outside_seconds)], + ) + + ok9 = write_series( + metric_name=f"{anomaly_metric}_max_exceed_ratio", + labels=anomaly_labels, + ts_list=[event_ts], + values=[float(max_exceed_ratio)], + ) + + return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7 and ok8 and ok9 + + +# ============================================================================= +# 标签解析 +# ============================================================================= + +_LABEL_PATTERN = re.compile( + r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*' +) + + +def parse_labels_from_query(query: str) -> Dict[str, str]: + labels = {} + + if "{" not in query or "}" not in query: + return labels + + try: + label_part = query[query.index("{") + 1:query.rindex("}")] + except Exception: + return labels + + for match in _LABEL_PATTERN.finditer(label_part): + key = match.group(1) + value = match.group(2) + + value = ( + value + .replace('\\"', '"') + .replace("\\n", "\n") + .replace("\\\\", "\\") + ) + + labels[key] = value + + return labels + + +def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]: + result = {} + + for d in dicts: + if d: + result.update(d) + + return result + + +def series_key(metric_name: str, labels: Dict[str, str]) -> str: + return metric_name + labels_to_str(labels) + + +# ============================================================================= +# 状态持久化 +# ============================================================================= + +def load_state() -> None: + global BASELINE_STATES + + if not os.path.exists(STATE_FILE): + return + + try: + with open(STATE_FILE, "r", encoding="utf-8") as f: + raw = json.load(f) + + states = {} + + for key, value in raw.get("baseline_states", {}).items(): + required_fields = { + "period", + "phase_origin_ts", + "template", + "lower_template", + "upper_template", + "strategy", + "status", + "clean_seconds", + "last_update_ts", + "last_seen_ts", + "y_min", + "y_max", + } + + if not required_fields.issubset(set(value.keys())): + continue + + states[key] = BaselineState(**value) + + BASELINE_STATES = states + + logger.info( + "已加载预测状态文件 %s,状态数量=%d", + STATE_FILE, + len(BASELINE_STATES), + ) + + except Exception as e: + logger.warning("加载预测状态文件失败,将重新学习: %s", e) + + +def save_state() -> None: + try: + raw = { + "baseline_states": { + key: asdict(value) + for key, value in BASELINE_STATES.items() + } + } + + tmp_file = STATE_FILE + ".tmp" + + with open(tmp_file, "w", encoding="utf-8") as f: + json.dump(raw, f, ensure_ascii=False, indent=2) + + os.replace(tmp_file, STATE_FILE) + + except Exception as e: + logger.warning("保存预测状态文件失败: %s", e) + + +# ============================================================================= +# 时间轴 +# ============================================================================= + +def build_prediction_timestamps( + key: str, + last_real_ts: int, + now_sec: int, +) -> Optional[List[int]]: + data_lag = now_sec - last_real_ts + + if data_lag > MAX_DATA_LAG_SECONDS: + logger.warning( + "真实数据延迟过大,跳过预测 key=%s data_lag=%ss max=%ss", + key, + data_lag, + MAX_DATA_LAG_SECONDS, + ) + return None + + last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key) + + if last_written_real_ts is not None and last_real_ts <= int(last_written_real_ts): + logger.info( + "真实数据时间戳未推进,跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s", + key, + last_real_ts, + last_written_real_ts, + ) + return None + + base_ts = last_real_ts + + return [ + base_ts + i + 1 + for i in range(WRITE_HORIZON_SECONDS) + ] + + +# ============================================================================= +# 主流程 +# ============================================================================= + +def run_once() -> None: + now_str = datetime.now().strftime("%H:%M:%S") + + for target in PREDICT_TARGETS: + query = target["query"] + pred_metric = target["pred_metric"] + anomaly_metric = target["anomaly_metric"] + + ts, ys = fetch_history(query) + + if len(ys) < MIN_POINTS: + logger.info("[%s] %s 数据不足(%d 点),跳过", now_str, query, len(ys)) + continue + + ts_grid, ys_grid_raw = normalize_history(ts, ys) + + if len(ys_grid_raw) < MIN_POINTS: + logger.info("[%s] %s 清洗后数据不足(%d 点),跳过", now_str, query, len(ys_grid_raw)) + continue + + ys_grid_model = preprocess_values(ys_grid_raw, target) + + base_labels = parse_labels_from_query(query) + write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS) + + key = series_key(pred_metric, write_labels) + + ( + state, + is_anomaly, + outside_ratio, + mean_abs_err, + mean_rel_err, + max_outside_seconds, + max_exceed_ratio, + ) = maybe_update_state( + key=key, + ts_grid=ts_grid, + ys_model=ys_grid_model, + ys_actual=ys_grid_raw, + target=target, + ) + + if state is None: + logger.info("[%s] %s 暂无可用健康模板,等待学习", now_str, query) + continue + + now_sec = int(time.time()) + last_real_ts = int(ts_grid[-1]) + data_lag = now_sec - last_real_ts + + ts_future = build_prediction_timestamps( + key=key, + last_real_ts=last_real_ts, + now_sec=now_sec, + ) + + if not ts_future: + continue + + pred_values, lower_raw, upper_raw = predict_state_bundle(state, ts_future) + + lower_values, upper_values = calc_final_bounds( + state=state, + pred=pred_values, + lower_raw=lower_raw, + upper_raw=upper_raw, + target=target, + ) + + ok = write_prediction_bundle( + pred_metric=pred_metric, + anomaly_metric=anomaly_metric, + labels=write_labels, + ts_future=ts_future, + pred_values=pred_values, + lower_values=lower_values, + upper_values=upper_values, + is_anomaly=is_anomaly, + outside_ratio=outside_ratio, + mean_abs_err=mean_abs_err, + mean_rel_err=mean_rel_err, + max_outside_seconds=max_outside_seconds, + max_exceed_ratio=max_exceed_ratio, + event_ts=last_real_ts, + ) + + if not ok: + logger.error("[%s] %s 写入预测数据失败", now_str, query) + continue + + LAST_REAL_TS_WRITTEN[key] = last_real_ts + + future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S") + future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S") + last_real_str = datetime.fromtimestamp(last_real_ts).strftime("%H:%M:%S") + origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S") + + logger.info( + "[%s] %-40s → %-35s strategy=%s status=%s anomaly=%s outside=%.2f max_outside=%ss max_exceed=%.2f period=%ss origin=%s last_real=%s lag=%ss 写入 %d 点,预测区间 %s ~ %s", + now_str, + query, + pred_metric, + state.strategy, + state.status, + is_anomaly, + outside_ratio, + max_outside_seconds, + max_exceed_ratio, + state.period, + origin_str, + last_real_str, + data_lag, + len(ts_future), + future_start, + future_end, + ) + + save_state() + + +def main() -> None: + load_state() + + logger.info( + "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s", + VM_URL, + HISTORY_MINUTES, + HORIZON_SECONDS, + WRITE_HORIZON_SECONDS, + POLL_INTERVAL, + STATE_FILE, + EXTRA_PREDICT_LABELS["forecast"], + ) + + while True: + run_once() + time.sleep(POLL_INTERVAL) + + +if __name__ == "__main__": + main() \ No newline at end of file From f9b6506452a75ffd1f1f8beee1ebd1008fdc724d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Mon, 25 May 2026 13:46:29 +0800 Subject: [PATCH 30/31] feat(protoforge): fault update --- protoforge/core/demo.py | 2 +- protoforge/core/fault.py | 187 +++++++++++++++++++++++++++++++++++++ protoforge/models/fault.py | 2 + web/src/views/Devices.vue | 117 ++++++++++++++++++++--- 4 files changed, 296 insertions(+), 12 deletions(-) diff --git a/protoforge/core/demo.py b/protoforge/core/demo.py index ff0b333..b7ccae7 100644 --- a/protoforge/core/demo.py +++ b/protoforge/core/demo.py @@ -108,7 +108,7 @@ async def seed_demo_data(engine: Any, template_manager: Any) -> None: "points": [ {"name": "weight", "address": "net_weight", "data_type": "float32", "generator_type": "random", "min_value": 0.5, "max_value": 50.0}, {"name": "tare", "address": "tare_weight", "data_type": "float32", "generator_type": "fixed", "fixed_value": 2.5}, - {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": true}, + {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": True}, ], }, ] diff --git a/protoforge/core/fault.py b/protoforge/core/fault.py index e72842d..5beba87 100644 --- a/protoforge/core/fault.py +++ b/protoforge/core/fault.py @@ -41,6 +41,7 @@ name="刀具磨损", description="刀具切削刃磨损,切削阻力增大,主轴电流升高,振动增大,进给速率下降", category="mechanical", + scenario_type="trend_drift", default_duration=300.0, tags=["刀具", "磨损", "渐进"], point_faults=[ @@ -67,6 +68,7 @@ name="刀具崩刃", description="刀具突发性崩刃,振动剧烈突增,主轴电流峰值,进给停止", category="mechanical", + scenario_type="sudden_spike", default_duration=15.0, tags=["刀具", "崩刃", "突发"], point_faults=[ @@ -93,6 +95,7 @@ name="主轴过热", description="主轴长时间高负荷运转或冷却不足,电流持续偏高,转速因热保护下降", category="thermal", + scenario_type="trend_drift", default_duration=240.0, tags=["主轴", "过热", "渐进"], point_faults=[ @@ -117,6 +120,7 @@ name="主轴轴承故障", description="主轴轴承磨损或润滑不足,振动幅度持续升高,伴随电流轻微上升", category="mechanical", + scenario_type="trend_drift", default_duration=360.0, tags=["主轴", "轴承", "渐进"], point_faults=[ @@ -141,6 +145,7 @@ name="进给堵转", description="进给轴卡死,进给速率降为零,主轴电流急剧升高", category="process", + scenario_type="sudden_spike", default_duration=20.0, tags=["进给", "堵转", "突发"], point_faults=[ @@ -163,6 +168,7 @@ name="振动异常", description="工件装夹松动或切削共振,三轴振动突然大幅增加", category="mechanical", + scenario_type="sudden_spike", default_duration=60.0, tags=["振动", "装夹", "突发"], point_faults=[ @@ -185,6 +191,7 @@ name="切削液不足", description="切削液供给不足,冷却润滑失效,热量积累导致振动和电流缓慢升高", category="process", + scenario_type="trend_drift", default_duration=480.0, tags=["切削液", "冷却", "渐进"], point_faults=[ @@ -211,6 +218,7 @@ name="电源波动", description="供电电压不稳定,主轴转速和进给速率出现随机波动", category="electrical", + scenario_type="high_noise", default_duration=90.0, tags=["电源", "波动", "突发"], point_faults=[ @@ -222,6 +230,176 @@ multiplier=1.0, noise_scale=150.0), ], ), + + # ================================================================== + # 以下为新增故障类型 + # ================================================================== + + # ------------------------------------------------------------------ + # 传感器强干扰 — 高噪声波动型 + # 场景:电磁干扰、接地不良、信号线屏蔽失效等导致传感器读数剧烈抖动 + # 特征:均值基本不变,但噪声幅度突然增大数倍,信号看起来"毛刺"严重 + # 区别于真实故障:设备本身没有坏,只是采集信号质量变差 + # 模式:瞬间注入,持续期间每次采样都叠加大幅随机噪声 + # ------------------------------------------------------------------ + FaultTypeDefinition( + id="sensor_noise", + name="传感器强干扰", + description=( + "【高噪声波动型】电磁干扰或接地不良导致传感器信号质量恶化。" + "均值基本不变,但每次采样叠加大幅随机噪声,曲线呈现密集毛刺。" + "典型场景:变频器附近的传感器、信号线屏蔽层破损、接地回路故障。" + ), + category="electrical", + scenario_type="high_noise", + default_duration=120.0, + tags=["传感器", "干扰", "噪声", "高噪声波动型"], + point_faults=[ + PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT, + multiplier=1.0, noise_scale=8.0), + PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT, + multiplier=1.0, noise_scale=2.5), + PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT, + multiplier=1.0, noise_scale=2.5), + PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT, + multiplier=1.0, noise_scale=3.0), + PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT, + multiplier=1.0, noise_scale=80.0), + ], + ), + + # ------------------------------------------------------------------ + # 换工件/换程序段 — 工况切换型(高速加工 → 低速精加工) + # 场景:CNC 机床切换加工程序,从粗加工切换到精加工 + # 特征:转速降低、进给降低、电流降低,所有指标跳到新的正常范围并稳定 + # 关键:这不是故障!数据本身没有坏,只是工况变了,正常范围完全不同 + # 模式:STEP 阶跃,立即跳到新基线并在整个 duration 内保持 + # ------------------------------------------------------------------ + FaultTypeDefinition( + id="mode_switch_fine_machining", + name="切换精加工工况", + description=( + "【工况切换型】从粗加工切换到精加工程序段。" + "主轴转速升高、进给速率降低、切削电流降低,各指标立即跳到新的正常范围并保持稳定。" + "数据本身没有异常,但与粗加工基线相比会触发阈值告警。" + "典型场景:换刀后进入精加工、加工不同特征面、程序跳段。" + ), + category="process", + scenario_type="mode_switch", + default_duration=300.0, + tags=["工况切换", "精加工", "程序段", "工况切换型"], + point_faults=[ + PointFaultConfig(point="spindle_speed", mode=FaultMode.STEP, + multiplier=1.4, noise_scale=30.0), + PointFaultConfig(point="feed_rate", mode=FaultMode.STEP, + multiplier=0.3, noise_scale=10.0), + PointFaultConfig(point="spindle_current", mode=FaultMode.STEP, + multiplier=0.55, noise_scale=0.5), + PointFaultConfig(point="vibration_x", mode=FaultMode.STEP, + multiplier=0.6, noise_scale=0.1), + PointFaultConfig(point="vibration_y", mode=FaultMode.STEP, + multiplier=0.6, noise_scale=0.1), + PointFaultConfig(point="vibration_z", mode=FaultMode.STEP, + multiplier=0.6, noise_scale=0.1), + ], + ), + + # ------------------------------------------------------------------ + # 进入空载工况 — 工况切换型(加工中 → 空载运行) + # 场景:加工完成、等待上料、程序暂停,主轴空转 + # 特征:进给降为 0,电流大幅下降到空载水平,转速维持,振动降低 + # 模式:STEP 阶跃,立即切换到空载基线 + # ------------------------------------------------------------------ + FaultTypeDefinition( + id="mode_switch_idle", + name="切换空载工况", + description=( + "【工况切换型】机床进入空载运行状态(加工完成等待上料、程序暂停)。" + "进给速率降为零,主轴电流降至空载水平(约为加工时的 20-30%)," + "主轴转速维持,振动明显降低。" + "典型场景:换料等待、程序暂停、加工间隙、换刀等待。" + ), + category="process", + scenario_type="mode_switch", + default_duration=180.0, + tags=["工况切换", "空载", "等待", "工况切换型"], + point_faults=[ + PointFaultConfig(point="feed_rate", mode=FaultMode.STEP, + target_value=0.0, noise_scale=2.0), + PointFaultConfig(point="spindle_current", mode=FaultMode.STEP, + multiplier=0.22, noise_scale=0.3), + PointFaultConfig(point="vibration_x", mode=FaultMode.STEP, + multiplier=0.25, noise_scale=0.05), + PointFaultConfig(point="vibration_y", mode=FaultMode.STEP, + multiplier=0.25, noise_scale=0.05), + PointFaultConfig(point="vibration_z", mode=FaultMode.STEP, + multiplier=0.25, noise_scale=0.05), + ], + ), + + # ------------------------------------------------------------------ + # 突发电流尖峰 — 突发脉冲型 + # 场景:切削过程中遇到硬质夹杂物、刀具切入角突变、工件材质不均 + # 特征:主轴电流瞬间冲高(持续 2-5 秒),然后恢复正常,其他指标基本不变 + # 区别于刀具崩刃:电流尖峰后能自动恢复,不会导致停机 + # 模式:瞬间注入,持续时间极短 + # ------------------------------------------------------------------ + FaultTypeDefinition( + id="current_spike", + name="突发电流尖峰", + description=( + "【突发脉冲型】切削过程中遇到硬质夹杂物或材质不均,主轴电流瞬间冲高后自动恢复。" + "电流短暂升至正常值的 3-4 倍,持续仅数秒,振动轻微抖动,进给基本不受影响。" + "典型场景:铸件内部硬质点、焊缝区域、材料硬度不均匀。" + "与刀具崩刃的区别:能自动恢复,不触发停机报警。" + ), + category="mechanical", + scenario_type="sudden_spike", + default_duration=5.0, + tags=["电流", "尖峰", "脉冲", "突发脉冲型"], + point_faults=[ + PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT, + multiplier=3.5, noise_scale=1.5), + PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT, + multiplier=2.0, noise_scale=0.5), + PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT, + multiplier=2.0, noise_scale=0.5), + PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT, + multiplier=2.5, noise_scale=0.8), + ], + ), + + # ------------------------------------------------------------------ + # 主轴负载异常 — 关系约束型 + # 场景:刀具钝化但未完全磨损、切削参数不匹配、工件材料变硬 + # 特征:主轴转速正常、进给速率正常,但主轴电流异常升高 + # 关键:单看任何一个指标都"正常",只有多指标关系才能发现异常 + # 模式:渐进式,电流缓慢爬升,转速和进给保持不变 + # ------------------------------------------------------------------ + FaultTypeDefinition( + id="spindle_load_anomaly", + name="主轴负载异常", + description=( + "【关系约束型】主轴转速正常、进给速率正常,但主轴电流异常升高。" + "单看任何一个指标都在正常范围内,只有分析多指标关系才能发现异常。" + "物理含义:切削阻力增大(刀具钝化初期、材料变硬)," + "系统尚未触发保护降速,但电流已超出正常切削功率范围。" + "典型场景:刀具轻度钝化、切削液浓度不足、工件材料批次差异。" + ), + category="mechanical", + scenario_type="relation_constraint", + default_duration=240.0, + tags=["主轴", "负载", "关系约束", "关系约束型"], + point_faults=[ + PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL, + multiplier=2.8, noise_scale=1.0), + # 转速和进给保持不变(multiplier=1.0),只叠加极小噪声维持真实感 + PointFaultConfig(point="spindle_speed", mode=FaultMode.INSTANT, + multiplier=1.0, noise_scale=15.0), + PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT, + multiplier=1.0, noise_scale=5.0), + ], + ), ] # 按 id 索引 @@ -374,6 +552,15 @@ def _compute_value( target = baseline * (1.0 + (pf.multiplier - 1.0) * intensity) else: target = baseline + elif pf.mode == FaultMode.STEP: + # 阶跃模式:立即跳到新基线并在整个 duration 内保持(工况切换专用) + # 与 INSTANT 的区别:STEP 的 multiplier 表示新工况的正常倍数,不受 intensity 缩放 + if pf.target_value is not None: + target = pf.target_value + elif pf.multiplier is not None: + target = baseline * pf.multiplier + else: + target = baseline else: # 渐进模式:随 progress 线性劣化 if pf.target_value is not None: diff --git a/protoforge/models/fault.py b/protoforge/models/fault.py index cc038e0..025da96 100644 --- a/protoforge/models/fault.py +++ b/protoforge/models/fault.py @@ -8,6 +8,7 @@ class FaultMode(str, Enum): """故障注入模式""" INSTANT = "instant" # 瞬间跳变到异常值,持续 duration 后恢复 GRADUAL = "gradual" # 渐进式劣化,随时间线性恶化,到 duration 时达到峰值后恢复 + STEP = "step" # 阶跃切换到新工况基线,整个 duration 内保持新基线(工况切换专用) class FaultStatus(str, Enum): @@ -36,6 +37,7 @@ class FaultTypeDefinition(BaseModel): name: str description: str category: str # 故障分类:mechanical / electrical / thermal / process + scenario_type: str = "trend_drift" # 异常场景类型:trend_drift / sudden_spike / high_noise / mode_switch / relation_constraint default_duration: float = 120.0 # 默认持续时间(秒) point_faults: list[PointFaultConfig] = Field(default_factory=list) tags: list[str] = Field(default_factory=list) diff --git a/web/src/views/Devices.vue b/web/src/views/Devices.vue index 65e0535..36141a4 100644 --- a/web/src/views/Devices.vue +++ b/web/src/views/Devices.vue @@ -92,24 +92,55 @@ - + - 设备:{{ faultTargetDevice?.name }} + 目标设备:{{ faultTargetDevice?.name }} + - -
{{ selectedFaultType.name }} · {{ faultCategoryLabel(selectedFaultType.category) }}
-
{{ selectedFaultType.description }}
-
- 影响测点:{{ selectedFaultType.point_faults.map(p => p.point).join('、') }} + + +
+ + + {{ selectedFaultType.name }} + + {{ scenarioTypeLabel(selectedFaultType.scenario_type) }} + + + {{ faultCategoryLabel(selectedFaultType.category) }} + + + + + {{ selectedFaultType.description }} + + +
+ 影响测点: + + + {{ pf.point }} + + {{ pointFaultModeLabel(pf) }} + + +
- +
+ + {{ faultIntensityLabel }}({{ faultIntensity }}) + + · 工况切换型强度不影响切换幅度 + @@ -243,12 +278,17 @@ const columns = [ }, { title: '测点', key: 'points', width: 70, render: (row) => (row.points || []).length }, { - title: '故障', key: 'fault', width: 90, + title: '故障', key: 'fault', width: 130, render: (row) => { const fault = activeFaults.value[row.id] if (!fault || fault.status === 'none') return h(NTag, { size: 'tiny', bordered: false }, () => '正常') const pct = Math.round((fault.progress || 0) * 100) - return h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`) + const ft = faultTypes.value.find(t => t.id === fault.fault_type_id) + const scenarioLabel = ft ? scenarioTypeLabel(ft.scenario_type) : '' + return h(NSpace, { size: 2, vertical: false, align: 'center' }, () => [ + h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`), + scenarioLabel ? h(NTag, { size: 'tiny', bordered: false, style: 'font-size:10px;background:#2d1b1b;color:#f87171' }, () => scenarioLabel) : null, + ]) } }, { @@ -351,6 +391,25 @@ const faultTypeOptions = computed(() => faultTypes.value.map(t => ({ label: `${t.name}(${faultCategoryLabel(t.category)})`, value: t.id })) ) +// 按场景类型分组的故障选项 +const SCENARIO_ORDER = ['trend_drift', 'sudden_spike', 'high_noise', 'mode_switch', 'relation_constraint'] +const faultTypeGroupedOptions = computed(() => { + const groups = {} + for (const t of faultTypes.value) { + const st = t.scenario_type || 'trend_drift' + if (!groups[st]) groups[st] = [] + groups[st].push({ label: t.name, value: t.id }) + } + return SCENARIO_ORDER + .filter(st => groups[st]) + .map(st => ({ + type: 'group', + label: scenarioTypeLabel(st), + key: st, + children: groups[st], + })) +}) + const selectedFaultType = computed(() => faultTypes.value.find(t => t.id === faultTypeId.value) || null ) @@ -368,6 +427,42 @@ function faultCategoryLabel(category) { return map[category] || category } +function scenarioTypeLabel(scenarioType) { + const map = { + trend_drift: '趋势漂移型', + sudden_spike: '突发脉冲型', + high_noise: '高噪声波动型', + mode_switch: '工况切换型', + relation_constraint: '关系约束型', + } + return map[scenarioType] || scenarioType +} + +function scenarioTagType(scenarioType) { + const map = { + trend_drift: 'warning', + sudden_spike: 'error', + high_noise: 'info', + mode_switch: 'success', + relation_constraint: 'default', + } + return map[scenarioType] || 'default' +} + +function pointFaultModeLabel(pf) { + if (pf.mode === 'step') return '→ 阶跃' + if (pf.mode === 'gradual') { + if (pf.multiplier != null) return `→ ×${pf.multiplier}` + if (pf.target_value != null) return `→ ${pf.target_value}` + } + if (pf.mode === 'instant') { + if (pf.target_value != null) return `→ ${pf.target_value}` + if (pf.multiplier != null && pf.multiplier !== 1.0) return `→ ×${pf.multiplier}` + return '± 噪声' + } + return '' +} + function onFaultTypeChange(val) { const t = faultTypes.value.find(f => f.id === val) if (t && t.default_duration) faultDuration.value = t.default_duration From 685ae6b79c28cf8094e20fc6f9f7c00a32d0fe4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?= Date: Mon, 25 May 2026 14:00:07 +0800 Subject: [PATCH 31/31] Revert "feat(protoforge): fault update" This reverts commit f9b6506452a75ffd1f1f8beee1ebd1008fdc724d. --- protoforge/core/demo.py | 2 +- protoforge/core/fault.py | 187 ------------------------------------- protoforge/models/fault.py | 2 - web/src/views/Devices.vue | 117 +++-------------------- 4 files changed, 12 insertions(+), 296 deletions(-) diff --git a/protoforge/core/demo.py b/protoforge/core/demo.py index b7ccae7..ff0b333 100644 --- a/protoforge/core/demo.py +++ b/protoforge/core/demo.py @@ -108,7 +108,7 @@ async def seed_demo_data(engine: Any, template_manager: Any) -> None: "points": [ {"name": "weight", "address": "net_weight", "data_type": "float32", "generator_type": "random", "min_value": 0.5, "max_value": 50.0}, {"name": "tare", "address": "tare_weight", "data_type": "float32", "generator_type": "fixed", "fixed_value": 2.5}, - {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": True}, + {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": true}, ], }, ] diff --git a/protoforge/core/fault.py b/protoforge/core/fault.py index 5beba87..e72842d 100644 --- a/protoforge/core/fault.py +++ b/protoforge/core/fault.py @@ -41,7 +41,6 @@ name="刀具磨损", description="刀具切削刃磨损,切削阻力增大,主轴电流升高,振动增大,进给速率下降", category="mechanical", - scenario_type="trend_drift", default_duration=300.0, tags=["刀具", "磨损", "渐进"], point_faults=[ @@ -68,7 +67,6 @@ name="刀具崩刃", description="刀具突发性崩刃,振动剧烈突增,主轴电流峰值,进给停止", category="mechanical", - scenario_type="sudden_spike", default_duration=15.0, tags=["刀具", "崩刃", "突发"], point_faults=[ @@ -95,7 +93,6 @@ name="主轴过热", description="主轴长时间高负荷运转或冷却不足,电流持续偏高,转速因热保护下降", category="thermal", - scenario_type="trend_drift", default_duration=240.0, tags=["主轴", "过热", "渐进"], point_faults=[ @@ -120,7 +117,6 @@ name="主轴轴承故障", description="主轴轴承磨损或润滑不足,振动幅度持续升高,伴随电流轻微上升", category="mechanical", - scenario_type="trend_drift", default_duration=360.0, tags=["主轴", "轴承", "渐进"], point_faults=[ @@ -145,7 +141,6 @@ name="进给堵转", description="进给轴卡死,进给速率降为零,主轴电流急剧升高", category="process", - scenario_type="sudden_spike", default_duration=20.0, tags=["进给", "堵转", "突发"], point_faults=[ @@ -168,7 +163,6 @@ name="振动异常", description="工件装夹松动或切削共振,三轴振动突然大幅增加", category="mechanical", - scenario_type="sudden_spike", default_duration=60.0, tags=["振动", "装夹", "突发"], point_faults=[ @@ -191,7 +185,6 @@ name="切削液不足", description="切削液供给不足,冷却润滑失效,热量积累导致振动和电流缓慢升高", category="process", - scenario_type="trend_drift", default_duration=480.0, tags=["切削液", "冷却", "渐进"], point_faults=[ @@ -218,7 +211,6 @@ name="电源波动", description="供电电压不稳定,主轴转速和进给速率出现随机波动", category="electrical", - scenario_type="high_noise", default_duration=90.0, tags=["电源", "波动", "突发"], point_faults=[ @@ -230,176 +222,6 @@ multiplier=1.0, noise_scale=150.0), ], ), - - # ================================================================== - # 以下为新增故障类型 - # ================================================================== - - # ------------------------------------------------------------------ - # 传感器强干扰 — 高噪声波动型 - # 场景:电磁干扰、接地不良、信号线屏蔽失效等导致传感器读数剧烈抖动 - # 特征:均值基本不变,但噪声幅度突然增大数倍,信号看起来"毛刺"严重 - # 区别于真实故障:设备本身没有坏,只是采集信号质量变差 - # 模式:瞬间注入,持续期间每次采样都叠加大幅随机噪声 - # ------------------------------------------------------------------ - FaultTypeDefinition( - id="sensor_noise", - name="传感器强干扰", - description=( - "【高噪声波动型】电磁干扰或接地不良导致传感器信号质量恶化。" - "均值基本不变,但每次采样叠加大幅随机噪声,曲线呈现密集毛刺。" - "典型场景:变频器附近的传感器、信号线屏蔽层破损、接地回路故障。" - ), - category="electrical", - scenario_type="high_noise", - default_duration=120.0, - tags=["传感器", "干扰", "噪声", "高噪声波动型"], - point_faults=[ - PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT, - multiplier=1.0, noise_scale=8.0), - PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT, - multiplier=1.0, noise_scale=2.5), - PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT, - multiplier=1.0, noise_scale=2.5), - PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT, - multiplier=1.0, noise_scale=3.0), - PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT, - multiplier=1.0, noise_scale=80.0), - ], - ), - - # ------------------------------------------------------------------ - # 换工件/换程序段 — 工况切换型(高速加工 → 低速精加工) - # 场景:CNC 机床切换加工程序,从粗加工切换到精加工 - # 特征:转速降低、进给降低、电流降低,所有指标跳到新的正常范围并稳定 - # 关键:这不是故障!数据本身没有坏,只是工况变了,正常范围完全不同 - # 模式:STEP 阶跃,立即跳到新基线并在整个 duration 内保持 - # ------------------------------------------------------------------ - FaultTypeDefinition( - id="mode_switch_fine_machining", - name="切换精加工工况", - description=( - "【工况切换型】从粗加工切换到精加工程序段。" - "主轴转速升高、进给速率降低、切削电流降低,各指标立即跳到新的正常范围并保持稳定。" - "数据本身没有异常,但与粗加工基线相比会触发阈值告警。" - "典型场景:换刀后进入精加工、加工不同特征面、程序跳段。" - ), - category="process", - scenario_type="mode_switch", - default_duration=300.0, - tags=["工况切换", "精加工", "程序段", "工况切换型"], - point_faults=[ - PointFaultConfig(point="spindle_speed", mode=FaultMode.STEP, - multiplier=1.4, noise_scale=30.0), - PointFaultConfig(point="feed_rate", mode=FaultMode.STEP, - multiplier=0.3, noise_scale=10.0), - PointFaultConfig(point="spindle_current", mode=FaultMode.STEP, - multiplier=0.55, noise_scale=0.5), - PointFaultConfig(point="vibration_x", mode=FaultMode.STEP, - multiplier=0.6, noise_scale=0.1), - PointFaultConfig(point="vibration_y", mode=FaultMode.STEP, - multiplier=0.6, noise_scale=0.1), - PointFaultConfig(point="vibration_z", mode=FaultMode.STEP, - multiplier=0.6, noise_scale=0.1), - ], - ), - - # ------------------------------------------------------------------ - # 进入空载工况 — 工况切换型(加工中 → 空载运行) - # 场景:加工完成、等待上料、程序暂停,主轴空转 - # 特征:进给降为 0,电流大幅下降到空载水平,转速维持,振动降低 - # 模式:STEP 阶跃,立即切换到空载基线 - # ------------------------------------------------------------------ - FaultTypeDefinition( - id="mode_switch_idle", - name="切换空载工况", - description=( - "【工况切换型】机床进入空载运行状态(加工完成等待上料、程序暂停)。" - "进给速率降为零,主轴电流降至空载水平(约为加工时的 20-30%)," - "主轴转速维持,振动明显降低。" - "典型场景:换料等待、程序暂停、加工间隙、换刀等待。" - ), - category="process", - scenario_type="mode_switch", - default_duration=180.0, - tags=["工况切换", "空载", "等待", "工况切换型"], - point_faults=[ - PointFaultConfig(point="feed_rate", mode=FaultMode.STEP, - target_value=0.0, noise_scale=2.0), - PointFaultConfig(point="spindle_current", mode=FaultMode.STEP, - multiplier=0.22, noise_scale=0.3), - PointFaultConfig(point="vibration_x", mode=FaultMode.STEP, - multiplier=0.25, noise_scale=0.05), - PointFaultConfig(point="vibration_y", mode=FaultMode.STEP, - multiplier=0.25, noise_scale=0.05), - PointFaultConfig(point="vibration_z", mode=FaultMode.STEP, - multiplier=0.25, noise_scale=0.05), - ], - ), - - # ------------------------------------------------------------------ - # 突发电流尖峰 — 突发脉冲型 - # 场景:切削过程中遇到硬质夹杂物、刀具切入角突变、工件材质不均 - # 特征:主轴电流瞬间冲高(持续 2-5 秒),然后恢复正常,其他指标基本不变 - # 区别于刀具崩刃:电流尖峰后能自动恢复,不会导致停机 - # 模式:瞬间注入,持续时间极短 - # ------------------------------------------------------------------ - FaultTypeDefinition( - id="current_spike", - name="突发电流尖峰", - description=( - "【突发脉冲型】切削过程中遇到硬质夹杂物或材质不均,主轴电流瞬间冲高后自动恢复。" - "电流短暂升至正常值的 3-4 倍,持续仅数秒,振动轻微抖动,进给基本不受影响。" - "典型场景:铸件内部硬质点、焊缝区域、材料硬度不均匀。" - "与刀具崩刃的区别:能自动恢复,不触发停机报警。" - ), - category="mechanical", - scenario_type="sudden_spike", - default_duration=5.0, - tags=["电流", "尖峰", "脉冲", "突发脉冲型"], - point_faults=[ - PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT, - multiplier=3.5, noise_scale=1.5), - PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT, - multiplier=2.0, noise_scale=0.5), - PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT, - multiplier=2.0, noise_scale=0.5), - PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT, - multiplier=2.5, noise_scale=0.8), - ], - ), - - # ------------------------------------------------------------------ - # 主轴负载异常 — 关系约束型 - # 场景:刀具钝化但未完全磨损、切削参数不匹配、工件材料变硬 - # 特征:主轴转速正常、进给速率正常,但主轴电流异常升高 - # 关键:单看任何一个指标都"正常",只有多指标关系才能发现异常 - # 模式:渐进式,电流缓慢爬升,转速和进给保持不变 - # ------------------------------------------------------------------ - FaultTypeDefinition( - id="spindle_load_anomaly", - name="主轴负载异常", - description=( - "【关系约束型】主轴转速正常、进给速率正常,但主轴电流异常升高。" - "单看任何一个指标都在正常范围内,只有分析多指标关系才能发现异常。" - "物理含义:切削阻力增大(刀具钝化初期、材料变硬)," - "系统尚未触发保护降速,但电流已超出正常切削功率范围。" - "典型场景:刀具轻度钝化、切削液浓度不足、工件材料批次差异。" - ), - category="mechanical", - scenario_type="relation_constraint", - default_duration=240.0, - tags=["主轴", "负载", "关系约束", "关系约束型"], - point_faults=[ - PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL, - multiplier=2.8, noise_scale=1.0), - # 转速和进给保持不变(multiplier=1.0),只叠加极小噪声维持真实感 - PointFaultConfig(point="spindle_speed", mode=FaultMode.INSTANT, - multiplier=1.0, noise_scale=15.0), - PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT, - multiplier=1.0, noise_scale=5.0), - ], - ), ] # 按 id 索引 @@ -552,15 +374,6 @@ def _compute_value( target = baseline * (1.0 + (pf.multiplier - 1.0) * intensity) else: target = baseline - elif pf.mode == FaultMode.STEP: - # 阶跃模式:立即跳到新基线并在整个 duration 内保持(工况切换专用) - # 与 INSTANT 的区别:STEP 的 multiplier 表示新工况的正常倍数,不受 intensity 缩放 - if pf.target_value is not None: - target = pf.target_value - elif pf.multiplier is not None: - target = baseline * pf.multiplier - else: - target = baseline else: # 渐进模式:随 progress 线性劣化 if pf.target_value is not None: diff --git a/protoforge/models/fault.py b/protoforge/models/fault.py index 025da96..cc038e0 100644 --- a/protoforge/models/fault.py +++ b/protoforge/models/fault.py @@ -8,7 +8,6 @@ class FaultMode(str, Enum): """故障注入模式""" INSTANT = "instant" # 瞬间跳变到异常值,持续 duration 后恢复 GRADUAL = "gradual" # 渐进式劣化,随时间线性恶化,到 duration 时达到峰值后恢复 - STEP = "step" # 阶跃切换到新工况基线,整个 duration 内保持新基线(工况切换专用) class FaultStatus(str, Enum): @@ -37,7 +36,6 @@ class FaultTypeDefinition(BaseModel): name: str description: str category: str # 故障分类:mechanical / electrical / thermal / process - scenario_type: str = "trend_drift" # 异常场景类型:trend_drift / sudden_spike / high_noise / mode_switch / relation_constraint default_duration: float = 120.0 # 默认持续时间(秒) point_faults: list[PointFaultConfig] = Field(default_factory=list) tags: list[str] = Field(default_factory=list) diff --git a/web/src/views/Devices.vue b/web/src/views/Devices.vue index 36141a4..65e0535 100644 --- a/web/src/views/Devices.vue +++ b/web/src/views/Devices.vue @@ -92,55 +92,24 @@ - + - 目标设备:{{ faultTargetDevice?.name }} - + 设备:{{ faultTargetDevice?.name }} - - -
- - - {{ selectedFaultType.name }} - - {{ scenarioTypeLabel(selectedFaultType.scenario_type) }} - - - {{ faultCategoryLabel(selectedFaultType.category) }} - - - - - {{ selectedFaultType.description }} - - -
- 影响测点: - - - {{ pf.point }} - - {{ pointFaultModeLabel(pf) }} - - - + +
{{ selectedFaultType.name }} · {{ faultCategoryLabel(selectedFaultType.category) }}
+
{{ selectedFaultType.description }}
+
+ 影响测点:{{ selectedFaultType.point_faults.map(p => p.point).join('、') }}
-
- + - {{ faultIntensityLabel }}({{ faultIntensity }}) - - · 工况切换型强度不影响切换幅度 - @@ -278,17 +243,12 @@ const columns = [ }, { title: '测点', key: 'points', width: 70, render: (row) => (row.points || []).length }, { - title: '故障', key: 'fault', width: 130, + title: '故障', key: 'fault', width: 90, render: (row) => { const fault = activeFaults.value[row.id] if (!fault || fault.status === 'none') return h(NTag, { size: 'tiny', bordered: false }, () => '正常') const pct = Math.round((fault.progress || 0) * 100) - const ft = faultTypes.value.find(t => t.id === fault.fault_type_id) - const scenarioLabel = ft ? scenarioTypeLabel(ft.scenario_type) : '' - return h(NSpace, { size: 2, vertical: false, align: 'center' }, () => [ - h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`), - scenarioLabel ? h(NTag, { size: 'tiny', bordered: false, style: 'font-size:10px;background:#2d1b1b;color:#f87171' }, () => scenarioLabel) : null, - ]) + return h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`) } }, { @@ -391,25 +351,6 @@ const faultTypeOptions = computed(() => faultTypes.value.map(t => ({ label: `${t.name}(${faultCategoryLabel(t.category)})`, value: t.id })) ) -// 按场景类型分组的故障选项 -const SCENARIO_ORDER = ['trend_drift', 'sudden_spike', 'high_noise', 'mode_switch', 'relation_constraint'] -const faultTypeGroupedOptions = computed(() => { - const groups = {} - for (const t of faultTypes.value) { - const st = t.scenario_type || 'trend_drift' - if (!groups[st]) groups[st] = [] - groups[st].push({ label: t.name, value: t.id }) - } - return SCENARIO_ORDER - .filter(st => groups[st]) - .map(st => ({ - type: 'group', - label: scenarioTypeLabel(st), - key: st, - children: groups[st], - })) -}) - const selectedFaultType = computed(() => faultTypes.value.find(t => t.id === faultTypeId.value) || null ) @@ -427,42 +368,6 @@ function faultCategoryLabel(category) { return map[category] || category } -function scenarioTypeLabel(scenarioType) { - const map = { - trend_drift: '趋势漂移型', - sudden_spike: '突发脉冲型', - high_noise: '高噪声波动型', - mode_switch: '工况切换型', - relation_constraint: '关系约束型', - } - return map[scenarioType] || scenarioType -} - -function scenarioTagType(scenarioType) { - const map = { - trend_drift: 'warning', - sudden_spike: 'error', - high_noise: 'info', - mode_switch: 'success', - relation_constraint: 'default', - } - return map[scenarioType] || 'default' -} - -function pointFaultModeLabel(pf) { - if (pf.mode === 'step') return '→ 阶跃' - if (pf.mode === 'gradual') { - if (pf.multiplier != null) return `→ ×${pf.multiplier}` - if (pf.target_value != null) return `→ ${pf.target_value}` - } - if (pf.mode === 'instant') { - if (pf.target_value != null) return `→ ${pf.target_value}` - if (pf.multiplier != null && pf.multiplier !== 1.0) return `→ ×${pf.multiplier}` - return '± 噪声' - } - return '' -} - function onFaultTypeChange(val) { const t = faultTypes.value.find(f => f.id === val) if (t && t.default_duration) faultDuration.value = t.default_duration