From 49f2fc443887fba0aeddf216bc966d0d47fbc483 Mon Sep 17 00:00:00 2001
From: zhangshaozhi <zhangshaozhi@zhangshaozhideMacBook-Pro.local>
Date: Sun, 17 May 2026 20:26:16 +0800
Subject: [PATCH 01/31] fix: make application runnable in production

- Dockerfile: include README.md in COPY statement
- main.py: add static file serving for built frontend (SPA routing support)
- App.vue: wrap template with Naive UI message/dialog/notification providers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Dockerfile         |  2 +-
 protoforge/main.py | 10 ++++++++++
 web/src/App.vue    |  8 +++++++-
 3 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index babbf5f..2528ade 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,7 @@ WORKDIR /app
 
 RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
 
-COPY pyproject.toml .
+COPY pyproject.toml README.md ./
 COPY protoforge/ protoforge/
 
 COPY --from=frontend-builder /app/web/dist /app/static
diff --git a/protoforge/main.py b/protoforge/main.py
index 894c5bc..0e43951 100644
--- a/protoforge/main.py
+++ b/protoforge/main.py
@@ -1,8 +1,11 @@
 import logging
 from contextlib import asynccontextmanager
+from pathlib import Path
 
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from fastapi.staticfiles import StaticFiles
 
 from protoforge.api.v1.router import router
 from protoforge.core.engine import SimulationEngine
@@ -184,12 +187,19 @@ def create_app() -> FastAPI:
 
     @app.get("/")
     async def root():
+        index = Path("/app/static/index.html")
+        if index.exists():
+            return FileResponse(index)
         return {
             "name": "ProtoForge",
             "version": "0.1.0",
             "description": "物联网协议仿真与测试平台",
         }
 
+    static_dir = Path("/app/static")
+    if static_dir.exists():
+        app.mount("/assets", StaticFiles(directory=static_dir / "assets"), name="assets")
+
     @app.get("/health")
     async def health():
         return {"status": "ok"}
diff --git a/web/src/App.vue b/web/src/App.vue
index ae6d315..22eabe9 100644
--- a/web/src/App.vue
+++ b/web/src/App.vue
@@ -1,4 +1,7 @@
 <template>
+  <n-message-provider>
+  <n-dialog-provider>
+  <n-notification-provider>
   <div v-if="!loggedIn" class="login-wrapper">
     <Login @login-success="onLogin" />
   </div>
@@ -73,12 +76,15 @@
       </n-layout-content>
     </n-layout>
   </n-layout>
+  </n-notification-provider>
+  </n-dialog-provider>
+  </n-message-provider>
 </template>
 
 <script setup>
 import { ref, computed, onMounted, onUnmounted, h } from 'vue'
 import { useRouter, useRoute } from 'vue-router'
-import { NLayout, NLayoutSider, NLayoutHeader, NLayoutContent, NMenu, NSpace, NAutoComplete, NTag, NButton, NDropdown } from 'naive-ui'
+import { NLayout, NLayoutSider, NLayoutHeader, NLayoutContent, NMenu, NSpace, NAutoComplete, NTag, NButton, NDropdown, NMessageProvider, NDialogProvider, NNotificationProvider } from 'naive-ui'
 import api from './api.js'
 import Login from './views/Login.vue'
 import Welcome from './views/Welcome.vue'

From 8fcf6c17b7786b7350c6ff540f4e665ea481e1a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Mon, 18 May 2026 13:59:49 +0800
Subject: [PATCH 02/31] fix: support container and local env

---
 protoforge/main.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/protoforge/main.py b/protoforge/main.py
index 0e43951..73489ae 100644
--- a/protoforge/main.py
+++ b/protoforge/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from contextlib import asynccontextmanager
 from pathlib import Path
 
@@ -142,7 +143,6 @@ async def lifespan(app: FastAPI):
     except Exception as e:
         logger.warning("Failed to start webhook manager: %s", e)
 
-    import os
     if os.environ.get("PROTOFORGE_DEMO_MODE"):
         try:
             from protoforge.core.demo import seed_demo_data
@@ -185,19 +185,26 @@ def create_app() -> FastAPI:
 
     app.include_router(router)
 
+    # 按优先级查找静态文件目录：环境变量 > 容器路径 > 本地构建产物
+    _repo_root = Path(__file__).parent.parent
+    _static_candidates = [
+        Path(os.environ["STATIC_DIR"]) if "STATIC_DIR" in os.environ else None,
+        Path("/app/static"),
+        _repo_root / "web" / "dist",
+    ]
+    static_dir = next((p for p in _static_candidates if p and p.is_dir()), None)
+
     @app.get("/")
     async def root():
-        index = Path("/app/static/index.html")
-        if index.exists():
-            return FileResponse(index)
+        if static_dir and (static_dir / "index.html").exists():
+            return FileResponse(static_dir / "index.html")
         return {
             "name": "ProtoForge",
             "version": "0.1.0",
             "description": "物联网协议仿真与测试平台",
         }
 
-    static_dir = Path("/app/static")
-    if static_dir.exists():
+    if static_dir and (static_dir / "assets").is_dir():
         app.mount("/assets", StaticFiles(directory=static_dir / "assets"), name="assets")
 
     @app.get("/health")

From 1fa9c0889b9351f9bc4d404f6615cafec1075f85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Mon, 18 May 2026 14:30:59 +0800
Subject: [PATCH 03/31] fix(metrics): support prometheus metric

---
 protoforge/core/metrics.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/protoforge/core/metrics.py b/protoforge/core/metrics.py
index 9300372..0e01abb 100644
--- a/protoforge/core/metrics.py
+++ b/protoforge/core/metrics.py
@@ -46,6 +46,25 @@ def collect_from_engine(self, engine: Any) -> None:
                                 if p.status.value == "running")
         self.set_gauge("protoforge_protocols_running", protocols_running)
 
+        for device in engine._devices.values():
+            if device.status.value != "online":
+                continue
+            labels_base = {
+                "device_id": device.config.id,
+                "device_name": device.config.name,
+                "protocol": device.config.protocol,
+            }
+            for point in device.read_all_points():
+                if not isinstance(point.value, (int, float)):
+                    continue
+                labels = {**labels_base, "point": point.name}
+                point_config = next(
+                    (p for p in device.config.points if p.name == point.name), None
+                )
+                if point_config and point_config.unit:
+                    labels["unit"] = point_config.unit
+                self.set_gauge("protoforge_device_point", float(point.value), labels)
+
     def collect_from_test_runner(self, runner: Any) -> None:
         self.set_gauge("protoforge_test_cases_total", len(runner._test_cases))
         self.set_gauge("protoforge_test_suites_total", len(runner._test_suites))

From 382e939efc63779b60f0a6be768aa210f47c6940 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Mon, 18 May 2026 14:38:29 +0800
Subject: [PATCH 04/31] fix

---
 protoforge/core/metrics.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/protoforge/core/metrics.py b/protoforge/core/metrics.py
index 0e01abb..8a5a2bb 100644
--- a/protoforge/core/metrics.py
+++ b/protoforge/core/metrics.py
@@ -47,8 +47,6 @@ def collect_from_engine(self, engine: Any) -> None:
         self.set_gauge("protoforge_protocols_running", protocols_running)
 
         for device in engine._devices.values():
-            if device.status.value != "online":
-                continue
             labels_base = {
                 "device_id": device.config.id,
                 "device_name": device.config.name,

From 80737ecd5a2099121e10ee624d6363f6e378b7d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Mon, 18 May 2026 14:42:38 +0800
Subject: [PATCH 05/31] fix(metric): update metric name

---
 protoforge/core/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protoforge/core/metrics.py b/protoforge/core/metrics.py
index 8a5a2bb..97f7bb9 100644
--- a/protoforge/core/metrics.py
+++ b/protoforge/core/metrics.py
@@ -61,7 +61,7 @@ def collect_from_engine(self, engine: Any) -> None:
                 )
                 if point_config and point_config.unit:
                     labels["unit"] = point_config.unit
-                self.set_gauge("protoforge_device_point", float(point.value), labels)
+                self.set_gauge(f"protoforge_{point.name}", float(point.value), labels)
 
     def collect_from_test_runner(self, runner: Any) -> None:
         self.set_gauge("protoforge_test_cases_total", len(runner._test_cases))

From 8f6ea66402ba0042edcdd4ba5dbcc7414cbfe240 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Mon, 18 May 2026 14:52:29 +0800
Subject: [PATCH 06/31] fix

---
 protoforge/core/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protoforge/core/metrics.py b/protoforge/core/metrics.py
index 97f7bb9..18fb327 100644
--- a/protoforge/core/metrics.py
+++ b/protoforge/core/metrics.py
@@ -61,7 +61,7 @@ def collect_from_engine(self, engine: Any) -> None:
                 )
                 if point_config and point_config.unit:
                     labels["unit"] = point_config.unit
-                self.set_gauge(f"protoforge_{point.name}", float(point.value), labels)
+                self.set_gauge(point.name, float(point.value), labels)
 
     def collect_from_test_runner(self, runner: Any) -> None:
         self.set_gauge("protoforge_test_cases_total", len(runner._test_cases))

From 94d779adee9ad861b6f66436b1ac156025670034 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 10:22:34 +0800
Subject: [PATCH 07/31] fix(monitor): collect monitor data

---
 protoforge/core/device.py  | 3 +++
 protoforge/core/metrics.py | 8 +++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/protoforge/core/device.py b/protoforge/core/device.py
index a344c7c..efd21ef 100644
--- a/protoforge/core/device.py
+++ b/protoforge/core/device.py
@@ -59,9 +59,11 @@ def read_point(self, point_name: str) -> Optional[PointValue]:
             name=point_name,
             value=self._point_values[point_name],
             timestamp=time.time(),
+            quality="good" if self._status == DeviceStatus.ONLINE else "bad",
         )
 
     def read_all_points(self) -> list[PointValue]:
+        quality = "good" if self._status == DeviceStatus.ONLINE else "bad"
         result = []
         now = time.time()
         for name in self._point_values:
@@ -70,6 +72,7 @@ def read_all_points(self) -> list[PointValue]:
                     name=name,
                     value=self._point_values[name],
                     timestamp=now,
+                    quality=quality,
                 )
             )
         return result
diff --git a/protoforge/core/metrics.py b/protoforge/core/metrics.py
index 18fb327..9670525 100644
--- a/protoforge/core/metrics.py
+++ b/protoforge/core/metrics.py
@@ -53,15 +53,17 @@ def collect_from_engine(self, engine: Any) -> None:
                 "protocol": device.config.protocol,
             }
             for point in device.read_all_points():
-                if not isinstance(point.value, (int, float)):
-                    continue
                 labels = {**labels_base, "point": point.name}
                 point_config = next(
                     (p for p in device.config.points if p.name == point.name), None
                 )
                 if point_config and point_config.unit:
                     labels["unit"] = point_config.unit
-                self.set_gauge(point.name, float(point.value), labels)
+                key = self._make_key(point.name, labels)
+                if point.quality != "good":
+                    self._gauges.pop(key, None)
+                elif isinstance(point.value, (int, float)):
+                    self.set_gauge(point.name, float(point.value), labels)
 
     def collect_from_test_runner(self, runner: Any) -> None:
         self.set_gauge("protoforge_test_cases_total", len(runner._test_cases))

From e515c50708a4181d5fd26751f8853cec203c566f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 17:54:26 +0800
Subject: [PATCH 08/31] fix(fanuc): update fanuc function

---
 protoforge/templates/fanuc/fanuc_0if_cnc.json    | 5 +++--
 protoforge/templates/modbus/fanuc_cnc.json       | 7 ++++---
 protoforge/templates/mtconnect/mill_machine.json | 5 +++--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/protoforge/templates/fanuc/fanuc_0if_cnc.json b/protoforge/templates/fanuc/fanuc_0if_cnc.json
index 3231d9b..a10b74f 100644
--- a/protoforge/templates/fanuc/fanuc_0if_cnc.json
+++ b/protoforge/templates/fanuc/fanuc_0if_cnc.json
@@ -75,9 +75,10 @@
             "unit": "mm/min",
             "description": "进给速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "sine",
             "min_value": 100,
-            "max_value": 5000
+            "max_value": 5000,
+            "generator_config": {"period": 60, "phase": 0.0}
         },
         {
             "name": "alarm_status",
diff --git a/protoforge/templates/modbus/fanuc_cnc.json b/protoforge/templates/modbus/fanuc_cnc.json
index 843151e..2970682 100644
--- a/protoforge/templates/modbus/fanuc_cnc.json
+++ b/protoforge/templates/modbus/fanuc_cnc.json
@@ -24,9 +24,10 @@
             "unit": "mm/min",
             "description": "实际进给速度",
             "access": "r",
-            "generator_type": "random",
-            "min_value": 0.0,
-            "max_value": 10000.0
+            "generator_type": "sine",
+            "min_value": 200.0,
+            "max_value": 3000.0,
+            "generator_config": {"period": 90, "phase": 1.0}
         },
         {
             "name": "spindle_override",
diff --git a/protoforge/templates/mtconnect/mill_machine.json b/protoforge/templates/mtconnect/mill_machine.json
index eaeef74..6b06daa 100644
--- a/protoforge/templates/mtconnect/mill_machine.json
+++ b/protoforge/templates/mtconnect/mill_machine.json
@@ -66,9 +66,10 @@
             "unit": "mm/min",
             "description": "进给速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "sine",
             "min_value": 200,
-            "max_value": 3000
+            "max_value": 3000,
+            "generator_config": {"period": 75, "phase": 2.1}
         },
         {
             "name": "part_count",

From d82cc9374d4001d87ed7d52f86534567aca5487d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 18:14:27 +0800
Subject: [PATCH 09/31] fix(templates): update templates

---
 protoforge/templates/fanuc/fanuc_0if_cnc.json | 45 +++++++++++++++++++
 .../templates/fanuc/fanuc_31ib_cnc.json       | 45 +++++++++++++++++++
 protoforge/templates/modbus/fanuc_cnc.json    | 45 +++++++++++++++++++
 .../templates/mtconnect/mill_machine.json     | 45 +++++++++++++++++++
 4 files changed, 180 insertions(+)

diff --git a/protoforge/templates/fanuc/fanuc_0if_cnc.json b/protoforge/templates/fanuc/fanuc_0if_cnc.json
index a10b74f..476ad73 100644
--- a/protoforge/templates/fanuc/fanuc_0if_cnc.json
+++ b/protoforge/templates/fanuc/fanuc_0if_cnc.json
@@ -80,6 +80,51 @@
             "max_value": 5000,
             "generator_config": {"period": 60, "phase": 0.0}
         },
+        {
+            "name": "spindle_current",
+            "address": "spindle_current",
+            "data_type": "float32",
+            "unit": "A",
+            "description": "主轴电流",
+            "access": "r",
+            "generator_type": "sine",
+            "min_value": 8.0,
+            "max_value": 32.0,
+            "generator_config": {"period": 120, "phase": 0.5}
+        },
+        {
+            "name": "vibration_x",
+            "address": "vibration_x",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "X轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 2.5
+        },
+        {
+            "name": "vibration_y",
+            "address": "vibration_y",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Y轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 2.5
+        },
+        {
+            "name": "vibration_z",
+            "address": "vibration_z",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Z轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 3.0
+        },
         {
             "name": "alarm_status",
             "address": "alarm",
diff --git a/protoforge/templates/fanuc/fanuc_31ib_cnc.json b/protoforge/templates/fanuc/fanuc_31ib_cnc.json
index 89f6ef1..97d18fd 100644
--- a/protoforge/templates/fanuc/fanuc_31ib_cnc.json
+++ b/protoforge/templates/fanuc/fanuc_31ib_cnc.json
@@ -80,6 +80,51 @@
             "generator_type": "fixed",
             "fixed_value": 100
         },
+        {
+            "name": "spindle_current",
+            "address": "spindle_current",
+            "data_type": "float32",
+            "unit": "A",
+            "description": "主轴电流",
+            "access": "r",
+            "generator_type": "sine",
+            "min_value": 10.0,
+            "max_value": 45.0,
+            "generator_config": {"period": 120, "phase": 1.2}
+        },
+        {
+            "name": "vibration_x",
+            "address": "vibration_x",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "X轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 3.0
+        },
+        {
+            "name": "vibration_y",
+            "address": "vibration_y",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Y轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 3.0
+        },
+        {
+            "name": "vibration_z",
+            "address": "vibration_z",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Z轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 4.0
+        },
         {
             "name": "tool_number",
             "address": "tool_number",
diff --git a/protoforge/templates/modbus/fanuc_cnc.json b/protoforge/templates/modbus/fanuc_cnc.json
index 2970682..b154318 100644
--- a/protoforge/templates/modbus/fanuc_cnc.json
+++ b/protoforge/templates/modbus/fanuc_cnc.json
@@ -29,6 +29,51 @@
             "max_value": 3000.0,
             "generator_config": {"period": 90, "phase": 1.0}
         },
+        {
+            "name": "spindle_current",
+            "address": "2",
+            "data_type": "float32",
+            "unit": "A",
+            "description": "主轴电流",
+            "access": "r",
+            "generator_type": "sine",
+            "min_value": 8.0,
+            "max_value": 35.0,
+            "generator_config": {"period": 120, "phase": 2.0}
+        },
+        {
+            "name": "vibration_x",
+            "address": "23",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "X轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 2.5
+        },
+        {
+            "name": "vibration_y",
+            "address": "25",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Y轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 2.5
+        },
+        {
+            "name": "vibration_z",
+            "address": "27",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Z轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 3.0
+        },
         {
             "name": "spindle_override",
             "address": "3",
diff --git a/protoforge/templates/mtconnect/mill_machine.json b/protoforge/templates/mtconnect/mill_machine.json
index 6b06daa..1262d0f 100644
--- a/protoforge/templates/mtconnect/mill_machine.json
+++ b/protoforge/templates/mtconnect/mill_machine.json
@@ -71,6 +71,51 @@
             "max_value": 3000,
             "generator_config": {"period": 75, "phase": 2.1}
         },
+        {
+            "name": "spindle_current",
+            "address": "SpindleCurrent",
+            "data_type": "float32",
+            "unit": "A",
+            "description": "主轴电流",
+            "access": "r",
+            "generator_type": "sine",
+            "min_value": 6.0,
+            "max_value": 28.0,
+            "generator_config": {"period": 120, "phase": 3.1}
+        },
+        {
+            "name": "vibration_x",
+            "address": "VibrationX",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "X轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 2.0
+        },
+        {
+            "name": "vibration_y",
+            "address": "VibrationY",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Y轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 2.0
+        },
+        {
+            "name": "vibration_z",
+            "address": "VibrationZ",
+            "data_type": "float32",
+            "unit": "m/s²",
+            "description": "Z轴振动加速度",
+            "access": "r",
+            "generator_type": "random",
+            "min_value": 0.1,
+            "max_value": 2.5
+        },
         {
             "name": "part_count",
             "address": "PartCount",

From 566910feb851276a5d542a30cff8b0bd9fb5f86c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 19:46:40 +0800
Subject: [PATCH 10/31] fix(router): update router

---
 docs/curl.md                |  6 ++++++
 protoforge/api/v1/router.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 docs/curl.md

diff --git a/docs/curl.md b/docs/curl.md
new file mode 100644
index 0000000..ae3cbb1
--- /dev/null
+++ b/docs/curl.md
@@ -0,0 +1,6 @@
+# 更新设备测试点请求
+
+```bash
+# /api/v1/devices/{device_id}/sync-from-template
+curl -X POST http://localhost:8000/api/v1/devices/fanuc-cnc数控系统/sync-from-template
+```
diff --git a/protoforge/api/v1/router.py b/protoforge/api/v1/router.py
index e06b966..8eaab1b 100644
--- a/protoforge/api/v1/router.py
+++ b/protoforge/api/v1/router.py
@@ -202,6 +202,38 @@ async def batch_stop_devices(device_ids: list[str]):
     return {"status": "ok", "stopped": stopped, "errors": errors}
 
 
+@router.post("/devices/{device_id}/sync-from-template")
+async def sync_device_from_template(device_id: str):
+    engine = _get_engine()
+    db = _get_database()
+    tm = _get_template_manager()
+    try:
+        instance = engine._devices.get(device_id)
+        if not instance:
+            raise HTTPException(status_code=404, detail=f"Device not found: {device_id}")
+        template_id = instance.config.template_id
+        if not template_id:
+            raise HTTPException(status_code=400, detail="Device has no associated template")
+        template = tm.get_template(template_id)
+        if not template:
+            raise HTTPException(status_code=404, detail=f"Template not found: {template_id}")
+        new_config = DeviceConfig(
+            id=device_id,
+            name=instance.config.name,
+            protocol=instance.config.protocol,
+            template_id=template_id,
+            points=template.points,
+            protocol_config=instance.config.protocol_config,
+        )
+        result = await engine.update_device(device_id, new_config)
+        await db.save_device(new_config)
+        return {"status": "ok", "point_count": len(template.points), "device": result}
+    except HTTPException:
+        raise
+    except ValueError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+
+
 @router.get("/devices/{device_id}", response_model=DeviceInfo)
 async def get_device(device_id: str):
     engine = _get_engine()

From 741bf518a3eccb6997f844063967b37d413878bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 20:22:12 +0800
Subject: [PATCH 11/31] fix(template): update template

---
 protoforge/templates/fanuc/fanuc_0if_cnc.json    | 5 +++--
 protoforge/templates/fanuc/fanuc_31ib_cnc.json   | 5 +++--
 protoforge/templates/modbus/fanuc_cnc.json       | 5 +++--
 protoforge/templates/mtconnect/mill_machine.json | 5 +++--
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/protoforge/templates/fanuc/fanuc_0if_cnc.json b/protoforge/templates/fanuc/fanuc_0if_cnc.json
index 476ad73..3e79750 100644
--- a/protoforge/templates/fanuc/fanuc_0if_cnc.json
+++ b/protoforge/templates/fanuc/fanuc_0if_cnc.json
@@ -64,9 +64,10 @@
             "unit": "RPM",
             "description": "主轴转速",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "sawtooth",
             "min_value": 1000,
-            "max_value": 8000
+            "max_value": 8000,
+            "generator_config": {"period": 120}
         },
         {
             "name": "feed_rate",
diff --git a/protoforge/templates/fanuc/fanuc_31ib_cnc.json b/protoforge/templates/fanuc/fanuc_31ib_cnc.json
index 97d18fd..83fee77 100644
--- a/protoforge/templates/fanuc/fanuc_31ib_cnc.json
+++ b/protoforge/templates/fanuc/fanuc_31ib_cnc.json
@@ -66,9 +66,10 @@
             "unit": "RPM",
             "description": "主轴转速",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "sawtooth",
             "min_value": 2000,
-            "max_value": 15000
+            "max_value": 15000,
+            "generator_config": {"period": 150}
         },
         {
             "name": "feed_override",
diff --git a/protoforge/templates/modbus/fanuc_cnc.json b/protoforge/templates/modbus/fanuc_cnc.json
index b154318..265ae56 100644
--- a/protoforge/templates/modbus/fanuc_cnc.json
+++ b/protoforge/templates/modbus/fanuc_cnc.json
@@ -13,9 +13,10 @@
             "unit": "RPM",
             "description": "主轴实际转速",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "sawtooth",
             "min_value": 0,
-            "max_value": 12000
+            "max_value": 12000,
+            "generator_config": {"period": 180}
         },
         {
             "name": "feed_rate",
diff --git a/protoforge/templates/mtconnect/mill_machine.json b/protoforge/templates/mtconnect/mill_machine.json
index 1262d0f..5ce7d63 100644
--- a/protoforge/templates/mtconnect/mill_machine.json
+++ b/protoforge/templates/mtconnect/mill_machine.json
@@ -55,9 +55,10 @@
             "unit": "RPM",
             "description": "主轴转速",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "sawtooth",
             "min_value": 3000,
-            "max_value": 12000
+            "max_value": 12000,
+            "generator_config": {"period": 135}
         },
         {
             "name": "feed_rate",

From 3cc3be3e59319f41bcddf4e8f642c4f9618d78e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 20:35:05 +0800
Subject: [PATCH 12/31] fix(template): update template

---
 protoforge/templates/modbus/fanuc_cnc.json       | 7 +++++--
 protoforge/templates/mtconnect/mill_machine.json | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/protoforge/templates/modbus/fanuc_cnc.json b/protoforge/templates/modbus/fanuc_cnc.json
index 265ae56..dc7a146 100644
--- a/protoforge/templates/modbus/fanuc_cnc.json
+++ b/protoforge/templates/modbus/fanuc_cnc.json
@@ -194,9 +194,12 @@
             "data_type": "uint16",
             "description": "加工计数",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0,
-            "max_value": 99999
+            "max_value": 99999,
+            "generator_config": {
+                "script": "key = 'part_count_modbus'; last = cache.get(key, 0); interval = 45; result = min(int(elapsed / interval), 99999); cache[key] = result"
+            }
         },
         {
             "name": "cycle_time",
diff --git a/protoforge/templates/mtconnect/mill_machine.json b/protoforge/templates/mtconnect/mill_machine.json
index 5ce7d63..5a11593 100644
--- a/protoforge/templates/mtconnect/mill_machine.json
+++ b/protoforge/templates/mtconnect/mill_machine.json
@@ -124,9 +124,12 @@
             "unit": "件",
             "description": "加工件数",
             "access": "r",
-            "generator_type": "sawtooth",
+            "generator_type": "script",
             "min_value": 0,
-            "max_value": 999
+            "max_value": 999,
+            "generator_config": {
+                "script": "key = 'part_count_mtconnect'; interval = 60; result = min(int(elapsed / interval), 999); cache[key] = result"
+            }
         }
     ],
     "protocol_config": {

From 5bdbcd4ecbbcea080e36741f34fa3662ce669345 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 21:01:59 +0800
Subject: [PATCH 13/31] fix(template): update tempalte

---
 protoforge/templates/modbus/fanuc_cnc.json       | 2 +-
 protoforge/templates/mtconnect/mill_machine.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/protoforge/templates/modbus/fanuc_cnc.json b/protoforge/templates/modbus/fanuc_cnc.json
index dc7a146..22db212 100644
--- a/protoforge/templates/modbus/fanuc_cnc.json
+++ b/protoforge/templates/modbus/fanuc_cnc.json
@@ -198,7 +198,7 @@
             "min_value": 0,
             "max_value": 99999,
             "generator_config": {
-                "script": "key = 'part_count_modbus'; last = cache.get(key, 0); interval = 45; result = min(int(elapsed / interval), 99999); cache[key] = result"
+                "script": "elapsed = context['elapsed']; result = min(int(elapsed / 45), 99999)"
             }
         },
         {
diff --git a/protoforge/templates/mtconnect/mill_machine.json b/protoforge/templates/mtconnect/mill_machine.json
index 5a11593..b701a48 100644
--- a/protoforge/templates/mtconnect/mill_machine.json
+++ b/protoforge/templates/mtconnect/mill_machine.json
@@ -128,7 +128,7 @@
             "min_value": 0,
             "max_value": 999,
             "generator_config": {
-                "script": "key = 'part_count_mtconnect'; interval = 60; result = min(int(elapsed / interval), 999); cache[key] = result"
+                "script": "elapsed = context['elapsed']; result = min(int(elapsed / 60), 999)"
             }
         }
     ],

From 5d2d9fe0634aa4bcba08552b09c5da3d84b76db7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Tue, 19 May 2026 21:41:04 +0800
Subject: [PATCH 14/31] fix(template): update tempalte

---
 protoforge/templates/fanuc/fanuc_0if_cnc.json | 21 +++++++++++++------
 .../templates/fanuc/fanuc_31ib_cnc.json       | 21 +++++++++++++------
 protoforge/templates/modbus/fanuc_cnc.json    | 21 +++++++++++++------
 .../templates/mtconnect/mill_machine.json     | 21 +++++++++++++------
 4 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/protoforge/templates/fanuc/fanuc_0if_cnc.json b/protoforge/templates/fanuc/fanuc_0if_cnc.json
index 3e79750..39437f3 100644
--- a/protoforge/templates/fanuc/fanuc_0if_cnc.json
+++ b/protoforge/templates/fanuc/fanuc_0if_cnc.json
@@ -100,9 +100,12 @@
             "unit": "m/s²",
             "description": "X轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 2.5
+            "max_value": 2.5,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.5 + 0.3 * math.sin(2 * math.pi * elapsed / 90); noise = random.uniform(-0.15, 0.15); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_y",
@@ -111,9 +114,12 @@
             "unit": "m/s²",
             "description": "Y轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 2.5
+            "max_value": 2.5,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.5 + 0.3 * math.sin(2 * math.pi * elapsed / 75 + 1.0); noise = random.uniform(-0.15, 0.15); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_z",
@@ -122,9 +128,12 @@
             "unit": "m/s²",
             "description": "Z轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 3.0
+            "max_value": 3.0,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.7 + 0.4 * math.sin(2 * math.pi * elapsed / 60 + 2.1); noise = random.uniform(-0.2, 0.2); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "alarm_status",
diff --git a/protoforge/templates/fanuc/fanuc_31ib_cnc.json b/protoforge/templates/fanuc/fanuc_31ib_cnc.json
index 83fee77..808fbc6 100644
--- a/protoforge/templates/fanuc/fanuc_31ib_cnc.json
+++ b/protoforge/templates/fanuc/fanuc_31ib_cnc.json
@@ -100,9 +100,12 @@
             "unit": "m/s²",
             "description": "X轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 3.0
+            "max_value": 3.0,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.6 + 0.4 * math.sin(2 * math.pi * elapsed / 80 + 0.5); noise = random.uniform(-0.2, 0.2); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_y",
@@ -111,9 +114,12 @@
             "unit": "m/s²",
             "description": "Y轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 3.0
+            "max_value": 3.0,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.6 + 0.4 * math.sin(2 * math.pi * elapsed / 65 + 1.5); noise = random.uniform(-0.2, 0.2); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_z",
@@ -122,9 +128,12 @@
             "unit": "m/s²",
             "description": "Z轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 4.0
+            "max_value": 4.0,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.8 + 0.5 * math.sin(2 * math.pi * elapsed / 55 + 2.5); noise = random.uniform(-0.25, 0.25); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "tool_number",
diff --git a/protoforge/templates/modbus/fanuc_cnc.json b/protoforge/templates/modbus/fanuc_cnc.json
index 22db212..43622cf 100644
--- a/protoforge/templates/modbus/fanuc_cnc.json
+++ b/protoforge/templates/modbus/fanuc_cnc.json
@@ -49,9 +49,12 @@
             "unit": "m/s²",
             "description": "X轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 2.5
+            "max_value": 2.5,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.5 + 0.3 * math.sin(2 * math.pi * elapsed / 85 + 0.8); noise = random.uniform(-0.15, 0.15); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_y",
@@ -60,9 +63,12 @@
             "unit": "m/s²",
             "description": "Y轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 2.5
+            "max_value": 2.5,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.5 + 0.3 * math.sin(2 * math.pi * elapsed / 70 + 1.8); noise = random.uniform(-0.15, 0.15); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_z",
@@ -71,9 +77,12 @@
             "unit": "m/s²",
             "description": "Z轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 3.0
+            "max_value": 3.0,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.7 + 0.4 * math.sin(2 * math.pi * elapsed / 58 + 2.8); noise = random.uniform(-0.2, 0.2); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "spindle_override",
diff --git a/protoforge/templates/mtconnect/mill_machine.json b/protoforge/templates/mtconnect/mill_machine.json
index b701a48..fd08bdd 100644
--- a/protoforge/templates/mtconnect/mill_machine.json
+++ b/protoforge/templates/mtconnect/mill_machine.json
@@ -91,9 +91,12 @@
             "unit": "m/s²",
             "description": "X轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 2.0
+            "max_value": 2.0,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.4 + 0.25 * math.sin(2 * math.pi * elapsed / 95 + 0.3); noise = random.uniform(-0.12, 0.12); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_y",
@@ -102,9 +105,12 @@
             "unit": "m/s²",
             "description": "Y轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 2.0
+            "max_value": 2.0,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.4 + 0.25 * math.sin(2 * math.pi * elapsed / 78 + 1.3); noise = random.uniform(-0.12, 0.12); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "vibration_z",
@@ -113,9 +119,12 @@
             "unit": "m/s²",
             "description": "Z轴振动加速度",
             "access": "r",
-            "generator_type": "random",
+            "generator_type": "script",
             "min_value": 0.1,
-            "max_value": 2.5
+            "max_value": 2.5,
+            "generator_config": {
+                "script": "elapsed = context['elapsed']; base = 0.6 + 0.35 * math.sin(2 * math.pi * elapsed / 62 + 2.3); noise = random.uniform(-0.18, 0.18); result = round(max(0.1, base + noise), 3)"
+            }
         },
         {
             "name": "part_count",

From c57a366c70ceacb599786c08934cef60a69d0d25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Wed, 20 May 2026 13:40:09 +0800
Subject: [PATCH 15/31] feat(fault): support fault

---
 FAULT_INJECTION.md          | 321 +++++++++++++++++++++++++++
 protoforge/api/v1/router.py |  57 +++++
 protoforge/core/device.py   |  18 +-
 protoforge/core/engine.py   |  32 +++
 protoforge/core/fault.py    | 419 ++++++++++++++++++++++++++++++++++++
 protoforge/models/fault.py  |  77 +++++++
 6 files changed, 923 insertions(+), 1 deletion(-)
 create mode 100644 FAULT_INJECTION.md
 create mode 100644 protoforge/core/fault.py
 create mode 100644 protoforge/models/fault.py

diff --git a/FAULT_INJECTION.md b/FAULT_INJECTION.md
new file mode 100644
index 0000000..951648d
--- /dev/null
+++ b/FAULT_INJECTION.md
@@ -0,0 +1,321 @@
+# 故障注入使用文档
+
+本文档描述 ProtoForge 故障注入模块的设计、使用方式及内置故障类型。
+
+---
+
+## 概述
+
+故障注入模块允许你在运行中的模拟设备上注入真实工业场景的异常，用于：
+
+- 验证监控系统的异常检测能力
+- 训练工业 AI 异常检测模型（提供异常样本）
+- 测试报警规则和联动逻辑
+
+支持四种异常场景：
+
+| 场景 | 说明 |
+|------|------|
+| 异常注入 | 立即将指定测点推入异常区间 |
+| 自动恢复 | 故障持续指定时间后自动恢复正常 |
+| 多指标联动 | 一次注入同时影响多个相关测点 |
+| 渐进式劣化 | 指标随时间线性恶化，模拟真实磨损过程 |
+
+---
+
+## 架构设计
+
+```
+FaultInjector（独立模块）
+    │
+    ├── inject(device, request)   注入故障
+    ├── apply(device)             每次 tick 后覆盖测点值（通过钩子机制）
+    ├── clear(device_id)          手动清除
+    └── 自动到期恢复
+
+DeviceInstance.tick()
+    └── 执行正常生成器
+    └── 执行 post_tick_hooks（FaultInjector.apply 挂载于此）
+```
+
+故障模块通过 `register_post_tick_hook` 挂载到设备，不修改设备本身的生成逻辑，完全解耦。
+
+---
+
+## API 接口
+
+### 查询故障类型
+
+```
+GET /api/v1/faults/types
+```
+
+返回所有内置故障类型列表。
+
+```
+GET /api/v1/faults/types/{fault_type_id}
+```
+
+返回指定故障类型的详细定义，包含影响的测点和行为参数。
+
+### 查询活跃故障
+
+```
+GET /api/v1/faults/active
+```
+
+返回当前所有设备上正在运行的故障实例。
+
+### 注入故障
+
+```
+POST /api/v1/devices/{device_id}/fault
+```
+
+请求体：
+
+```json
+{
+    "fault_type_id": "tool_wear",
+    "duration": 300,
+    "intensity": 0.8
+}
+```
+
+| 字段 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| `fault_type_id` | string | 是 | 故障类型 ID，见下方故障类型列表 |
+| `duration` | float | 否 | 持续时间（秒），不填则使用类型默认值 |
+| `intensity` | float | 否 | 故障强度 0.0~1.0，默认 1.0，影响劣化幅度 |
+
+响应示例：
+
+```json
+{
+    "fault_id": "a3f2c1d4e5b6",
+    "device_id": "fanuc-cnc-01",
+    "fault_type_id": "tool_wear",
+    "fault_type_name": "刀具磨损",
+    "status": "active",
+    "intensity": 0.8,
+    "duration": 300.0,
+    "elapsed": 0.0,
+    "progress": 0.0,
+    "affected_points": ["spindle_current", "vibration_x", "vibration_y", "vibration_z", "feed_rate"],
+    "started_at": 1716192000.0
+}
+```
+
+### 查询设备当前故障
+
+```
+GET /api/v1/devices/{device_id}/fault
+```
+
+无故障时返回 `{"status": "none"}`，有故障时返回故障详情（含实时 `elapsed` 和 `progress`）。
+
+### 手动清除故障
+
+```
+DELETE /api/v1/devices/{device_id}/fault
+```
+
+立即清除故障，测点值由生成器在下一个 tick 自然恢复正常。
+
+---
+
+## 内置故障类型
+
+### tool_wear — 刀具磨损
+
+- **分类**：mechanical
+- **模式**：渐进式
+- **默认持续时间**：300 秒
+- **真实场景**：刀具切削刃逐渐磨损，切削阻力增大，系统自动压低进给速率
+
+| 测点 | 变化方向 | 峰值倍率 |
+|------|---------|---------|
+| `spindle_current` | 升高 | ×2.2 |
+| `vibration_x` | 升高 | ×3.0 |
+| `vibration_y` | 升高 | ×3.0 |
+| `vibration_z` | 升高 | ×3.5 |
+| `feed_rate` | 降低 | ×0.45 |
+
+---
+
+### tool_breakage — 刀具崩刃
+
+- **分类**：mechanical
+- **模式**：瞬间注入
+- **默认持续时间**：15 秒
+- **真实场景**：刀具突发性崩刃，机床通常会触发报警并停机
+
+| 测点 | 变化方向 | 峰值倍率 |
+|------|---------|---------|
+| `spindle_current` | 急升 | ×4.5 |
+| `vibration_x` | 急升 | ×8.0 |
+| `vibration_y` | 急升 | ×8.0 |
+| `vibration_z` | 急升 | ×10.0 |
+| `feed_rate` | 停止 | →0 |
+
+---
+
+### spindle_overheat — 主轴过热
+
+- **分类**：thermal
+- **模式**：渐进式
+- **默认持续时间**：240 秒
+- **真实场景**：长时间高负荷或冷却系统故障，热保护机制逐渐降低转速
+
+| 测点 | 变化方向 | 峰值倍率 |
+|------|---------|---------|
+| `spindle_current` | 升高 | ×1.8 |
+| `spindle_speed` | 降低 | ×0.6 |
+| `vibration_x` | 升高 | ×1.5 |
+| `vibration_z` | 升高 | ×1.5 |
+
+---
+
+### spindle_bearing_fault — 主轴轴承故障
+
+- **分类**：mechanical
+- **模式**：渐进式
+- **默认持续时间**：360 秒
+- **真实场景**：轴承磨损或润滑不足，振动持续升高
+
+| 测点 | 变化方向 | 峰值倍率 |
+|------|---------|---------|
+| `vibration_x` | 升高 | ×4.0 |
+| `vibration_y` | 升高 | ×4.0 |
+| `vibration_z` | 升高 | ×5.0 |
+| `spindle_current` | 轻微升高 | ×1.3 |
+
+---
+
+### feed_stall — 进给堵转
+
+- **分类**：process
+- **模式**：瞬间注入
+- **默认持续时间**：20 秒
+- **真实场景**：工件夹紧松动或切削量过大导致进给轴卡死
+
+| 测点 | 变化方向 | 峰值倍率 |
+|------|---------|---------|
+| `feed_rate` | 停止 | →0 |
+| `spindle_current` | 急升 | ×3.8 |
+| `vibration_z` | 急升 | ×5.0 |
+
+---
+
+### vibration_spike — 振动异常
+
+- **分类**：mechanical
+- **模式**：瞬间注入
+- **默认持续时间**：60 秒
+- **真实场景**：工件装夹松动或切削共振
+
+| 测点 | 变化方向 | 峰值倍率 |
+|------|---------|---------|
+| `vibration_x` | 急升 | ×6.0 |
+| `vibration_y` | 急升 | ×6.0 |
+| `vibration_z` | 急升 | ×7.0 |
+
+---
+
+### coolant_failure — 切削液不足
+
+- **分类**：process
+- **模式**：渐进式
+- **默认持续时间**：480 秒
+- **真实场景**：切削液供给不足，热量积累，劣化速度较慢
+
+| 测点 | 变化方向 | 峰值倍率 |
+|------|---------|---------|
+| `spindle_current` | 升高 | ×1.6 |
+| `vibration_x` | 升高 | ×2.0 |
+| `vibration_y` | 升高 | ×2.0 |
+| `vibration_z` | 升高 | ×2.5 |
+| `feed_rate` | 降低 | ×0.75 |
+
+---
+
+### power_fluctuation — 电源波动
+
+- **分类**：electrical
+- **模式**：瞬间注入（持续期间持续抖动）
+- **默认持续时间**：90 秒
+- **真实场景**：供电电压不稳定，各指标出现随机波动
+
+| 测点 | 变化方向 | 说明 |
+|------|---------|------|
+| `spindle_speed` | 随机抖动 | ±300 RPM 噪声 |
+| `spindle_current` | 随机抖动 | ±5 A 噪声 |
+| `feed_rate` | 随机抖动 | ±150 mm/min 噪声 |
+
+---
+
+## 使用示例
+
+### 模拟刀具磨损过程
+
+```bash
+# 注入刀具磨损，持续 5 分钟，强度 100%
+curl -X POST http://localhost:8000/api/v1/devices/fanuc-cnc-01/fault \
+  -H "Content-Type: application/json" \
+  -d '{"fault_type_id": "tool_wear", "duration": 300, "intensity": 1.0}'
+
+# 每隔 30 秒查看故障进度
+curl http://localhost:8000/api/v1/devices/fanuc-cnc-01/fault
+
+# 查看 Prometheus 指标变化
+curl http://localhost:8000/api/v1/metrics | grep -E "spindle_current|vibration|feed_rate"
+```
+
+### 模拟突发崩刃后手动恢复
+
+```bash
+# 注入崩刃故障
+curl -X POST http://localhost:8000/api/v1/devices/fanuc-cnc-01/fault \
+  -H "Content-Type: application/json" \
+  -d '{"fault_type_id": "tool_breakage", "duration": 60}'
+
+# 手动提前清除
+curl -X DELETE http://localhost:8000/api/v1/devices/fanuc-cnc-01/fault
+```
+
+### 低强度渐进劣化（用于 AI 模型训练）
+
+```bash
+# 用 50% 强度注入轴承故障，持续 10 分钟，产生轻微异常样本
+curl -X POST http://localhost:8000/api/v1/devices/fanuc-cnc-01/fault \
+  -H "Content-Type: application/json" \
+  -d '{"fault_type_id": "spindle_bearing_fault", "duration": 600, "intensity": 0.5}'
+```
+
+---
+
+## 与 Prometheus 集成
+
+故障注入后，测点值的变化会实时反映在 `/api/v1/metrics` 接口中。可以用 Grafana 观察故障期间各指标的时序变化：
+
+```
+# 主轴电流（故障期间会升高）
+fanuc_cnc_spindle_current
+
+# 三轴振动
+fanuc_cnc_vibration_x
+fanuc_cnc_vibration_y
+fanuc_cnc_vibration_z
+
+# 进给速率（刀具磨损/堵转时会降低）
+fanuc_cnc_feed_rate
+```
+
+---
+
+## 注意事项
+
+- 同一设备同时只能有一个活跃故障，新注入会覆盖旧故障
+- 故障到期后测点值由生成器在下一个 tick 自然恢复，不会瞬间跳回
+- 设备必须处于 `online` 状态才能注入故障
+- 删除设备时会自动清除其故障
diff --git a/protoforge/api/v1/router.py b/protoforge/api/v1/router.py
index 8eaab1b..7a6c050 100644
--- a/protoforge/api/v1/router.py
+++ b/protoforge/api/v1/router.py
@@ -8,6 +8,7 @@
 from fastapi.responses import PlainTextResponse
 
 from protoforge.models.device import DeviceConfig, DeviceInfo, PointValue
+from protoforge.models.fault import FaultInjectRequest
 from protoforge.models.scenario import ScenarioConfig, ScenarioInfo
 from protoforge.models.template import TemplateDetail, TemplateInfo
 
@@ -1207,6 +1208,62 @@ async def setup_demo():
         raise HTTPException(status_code=500, detail=get_friendly_error(str(e)))
 
 
+@router.get("/faults/types")
+async def list_fault_types():
+    engine = _get_engine()
+    types = engine.list_fault_types()
+    return [t.model_dump() for t in types]
+
+
+@router.get("/faults/types/{fault_type_id}")
+async def get_fault_type(fault_type_id: str):
+    from protoforge.core.fault import fault_injector
+    ft = fault_injector.get_fault_type(fault_type_id)
+    if not ft:
+        raise HTTPException(status_code=404, detail=f"Fault type not found: {fault_type_id}")
+    return ft.model_dump()
+
+
+@router.get("/faults/active")
+async def list_active_faults():
+    engine = _get_engine()
+    return [f.model_dump() for f in engine.list_active_faults()]
+
+
+@router.post("/devices/{device_id}/fault")
+async def inject_fault(device_id: str, request: FaultInjectRequest):
+    engine = _get_engine()
+    log_bus = _get_log_bus()
+    try:
+        info = engine.inject_fault(device_id, request)
+        log_bus.emit("", "system", device_id, "fault_injected",
+                     f"Fault {request.fault_type_id} injected into {device_id}",
+                     {"fault_type": request.fault_type_id, "duration": info.duration})
+        return info.model_dump()
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+
+@router.get("/devices/{device_id}/fault")
+async def get_device_fault(device_id: str):
+    engine = _get_engine()
+    info = engine.get_fault(device_id)
+    if not info:
+        return {"status": "none"}
+    return info.model_dump()
+
+
+@router.delete("/devices/{device_id}/fault")
+async def clear_device_fault(device_id: str):
+    engine = _get_engine()
+    log_bus = _get_log_bus()
+    cleared = engine.clear_fault(device_id)
+    if cleared:
+        log_bus.emit("", "system", device_id, "fault_cleared",
+                     f"Fault cleared on {device_id}")
+    return {"status": "ok", "cleared": cleared}
+
+
 @router.get("/setup/status")
 async def setup_status():
     engine = _get_engine()
diff --git a/protoforge/core/device.py b/protoforge/core/device.py
index efd21ef..f04414a 100644
--- a/protoforge/core/device.py
+++ b/protoforge/core/device.py
@@ -1,5 +1,5 @@
 import time
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 
 from protoforge.core.generator import DataGenerator
 from protoforge.models.device import DeviceConfig, DeviceStatus, GeneratorType, PointConfig, PointValue
@@ -13,6 +13,8 @@ def __init__(self, config: DeviceConfig, generator: DataGenerator):
         self._point_values: dict[str, Any] = {}
         self._point_configs: dict[str, PointConfig] = {}
         self._start_time: Optional[float] = None
+        # 可选的 tick 后处理钩子，由外部模块（如 FaultInjector）注册
+        self._post_tick_hooks: list[Callable[["DeviceInstance"], None]] = []
 
         for point in config.points:
             self._point_configs[point.name] = point
@@ -21,6 +23,14 @@ def __init__(self, config: DeviceConfig, generator: DataGenerator):
             else:
                 self._point_values[point.name] = self._generator.generate(point)
 
+    def register_post_tick_hook(self, hook: Callable[["DeviceInstance"], None]) -> None:
+        """注册 tick 后处理钩子，外部模块通过此接口介入，不修改 tick 逻辑本身"""
+        if hook not in self._post_tick_hooks:
+            self._post_tick_hooks.append(hook)
+
+    def unregister_post_tick_hook(self, hook: Callable[["DeviceInstance"], None]) -> None:
+        self._post_tick_hooks = [h for h in self._post_tick_hooks if h != hook]
+
     @property
     def id(self) -> str:
         return self.config.id
@@ -51,6 +61,12 @@ def tick(self) -> None:
         for name, point in self._point_configs.items():
             if point.generator_type != GeneratorType.FIXED:
                 self._point_values[name] = self._generator.generate(point)
+        # 执行后处理钩子（故障注入等外部模块在此覆盖测点值）
+        for hook in self._post_tick_hooks:
+            try:
+                hook(self)
+            except Exception:
+                pass
 
     def read_point(self, point_name: str) -> Optional[PointValue]:
         if point_name not in self._point_values:
diff --git a/protoforge/core/engine.py b/protoforge/core/engine.py
index f289425..059f10e 100644
--- a/protoforge/core/engine.py
+++ b/protoforge/core/engine.py
@@ -4,9 +4,11 @@
 from typing import Any, Optional
 
 from protoforge.core.device import DeviceInstance
+from protoforge.core.fault import fault_injector
 from protoforge.core.generator import DataGenerator
 from protoforge.core.scenario import Scenario
 from protoforge.models.device import DeviceConfig, DeviceInfo, DeviceStatus, PointValue
+from protoforge.models.fault import FaultInfo, FaultInjectRequest, FaultTypeDefinition
 from protoforge.models.scenario import ScenarioConfig, ScenarioInfo, ScenarioStatus
 from protoforge.protocols.base import ProtocolServer, ProtocolStatus
 
@@ -56,6 +58,8 @@ async def stop_protocol(self, protocol_name: str) -> None:
     async def create_device(self, config: DeviceConfig) -> DeviceInfo:
         instance = DeviceInstance(config, self._generator)
         self._devices[config.id] = instance
+        # 注册故障注入钩子
+        instance.register_post_tick_hook(fault_injector.apply)
 
         server = self._protocol_servers.get(config.protocol)
         if server and server.status == ProtocolStatus.RUNNING:
@@ -70,6 +74,9 @@ async def remove_device(self, device_id: str) -> None:
         if not instance:
             raise ValueError(f"Device not found: {device_id}")
 
+        # 清除该设备的故障
+        fault_injector.clear(device_id)
+
         server = self._protocol_servers.get(instance.protocol)
         if server and server.status == ProtocolStatus.RUNNING:
             await server.remove_device(device_id)
@@ -299,3 +306,28 @@ def _get_device_info(self, instance: DeviceInstance) -> DeviceInfo:
             status=instance.status,
             points=instance.read_all_points(),
         )
+
+    # ------------------------------------------------------------------
+    # 故障管理
+    # ------------------------------------------------------------------
+
+    def inject_fault(self, device_id: str, request: FaultInjectRequest) -> FaultInfo:
+        instance = self._devices.get(device_id)
+        if not instance:
+            raise ValueError(f"Device not found: {device_id}")
+        if instance.status != DeviceStatus.ONLINE:
+            raise ValueError(f"Device {device_id} is not online")
+        return fault_injector.inject(instance, request)
+
+    def clear_fault(self, device_id: str) -> bool:
+        return fault_injector.clear(device_id)
+
+    def get_fault(self, device_id: str) -> Optional[FaultInfo]:
+        return fault_injector.get_fault(device_id)
+
+    def list_active_faults(self) -> list[FaultInfo]:
+        return fault_injector.list_active()
+
+    @staticmethod
+    def list_fault_types() -> list[FaultTypeDefinition]:
+        return fault_injector.list_fault_types()
diff --git a/protoforge/core/fault.py b/protoforge/core/fault.py
new file mode 100644
index 0000000..e72842d
--- /dev/null
+++ b/protoforge/core/fault.py
@@ -0,0 +1,419 @@
+"""
+故障注入模块 (FaultInjector)
+
+设计原则：
+- 完全独立，不修改 device.py / engine.py 现有逻辑
+- 通过 apply(device) 在每次 tick 后覆盖测点值，device 本身无感知
+- 支持四种场景：异常注入、自动恢复、多指标联动、渐进式劣化
+"""
+import logging
+import random
+import time
+import uuid
+from typing import Any, Optional
+
+from protoforge.models.fault import (
+    ActiveFault,
+    FaultInfo,
+    FaultInjectRequest,
+    FaultMode,
+    FaultStatus,
+    FaultTypeDefinition,
+    PointFaultConfig,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# 内置故障类型定义（基于真实工业场景）
+# ---------------------------------------------------------------------------
+
+BUILTIN_FAULT_TYPES: list[FaultTypeDefinition] = [
+
+    # ------------------------------------------------------------------
+    # 刀具磨损 — 最常见的机加工故障
+    # 特征：切削阻力增大 → 主轴电流缓慢爬升，振动幅度增大，进给速率被系统压低
+    # 模式：渐进式，持续数分钟，模拟刀具从轻度磨损到需要换刀的过程
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="tool_wear",
+        name="刀具磨损",
+        description="刀具切削刃磨损，切削阻力增大，主轴电流升高，振动增大，进给速率下降",
+        category="mechanical",
+        default_duration=300.0,
+        tags=["刀具", "磨损", "渐进"],
+        point_faults=[
+            PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL,
+                             multiplier=2.2, noise_scale=0.8),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.GRADUAL,
+                             multiplier=3.0, noise_scale=0.3),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.GRADUAL,
+                             multiplier=3.0, noise_scale=0.3),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.GRADUAL,
+                             multiplier=3.5, noise_scale=0.4),
+            PointFaultConfig(point="feed_rate", mode=FaultMode.GRADUAL,
+                             multiplier=0.45, noise_scale=20.0),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 刀具崩刃 — 突发性刀具失效
+    # 特征：瞬间冲击 → 振动突增，电流瞬间峰值，进给立即停止
+    # 模式：瞬间注入，持续时间短（机床通常会触发报警停机）
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="tool_breakage",
+        name="刀具崩刃",
+        description="刀具突发性崩刃，振动剧烈突增，主轴电流峰值，进给停止",
+        category="mechanical",
+        default_duration=15.0,
+        tags=["刀具", "崩刃", "突发"],
+        point_faults=[
+            PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT,
+                             multiplier=4.5, noise_scale=2.0),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT,
+                             multiplier=8.0, noise_scale=1.5),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT,
+                             multiplier=8.0, noise_scale=1.5),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT,
+                             multiplier=10.0, noise_scale=2.0),
+            PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT,
+                             target_value=0.0, noise_scale=0.0),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 主轴过热 — 长时间高负荷或冷却系统故障
+    # 特征：主轴电流持续偏高，转速因热保护逐渐降低
+    # 模式：渐进式，持续时间较长
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="spindle_overheat",
+        name="主轴过热",
+        description="主轴长时间高负荷运转或冷却不足，电流持续偏高，转速因热保护下降",
+        category="thermal",
+        default_duration=240.0,
+        tags=["主轴", "过热", "渐进"],
+        point_faults=[
+            PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL,
+                             multiplier=1.8, noise_scale=1.2),
+            PointFaultConfig(point="spindle_speed", mode=FaultMode.GRADUAL,
+                             multiplier=0.6, noise_scale=50.0),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.GRADUAL,
+                             multiplier=1.5, noise_scale=0.2),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.GRADUAL,
+                             multiplier=1.5, noise_scale=0.2),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 主轴轴承故障 — 轴承磨损或润滑不足
+    # 特征：振动频率特征变化，整体振动幅度升高，电流略升
+    # 模式：渐进式
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="spindle_bearing_fault",
+        name="主轴轴承故障",
+        description="主轴轴承磨损或润滑不足，振动幅度持续升高，伴随电流轻微上升",
+        category="mechanical",
+        default_duration=360.0,
+        tags=["主轴", "轴承", "渐进"],
+        point_faults=[
+            PointFaultConfig(point="vibration_x", mode=FaultMode.GRADUAL,
+                             multiplier=4.0, noise_scale=0.5),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.GRADUAL,
+                             multiplier=4.0, noise_scale=0.5),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.GRADUAL,
+                             multiplier=5.0, noise_scale=0.8),
+            PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL,
+                             multiplier=1.3, noise_scale=0.5),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 进给堵转 — 工件夹紧松动或切削量过大导致进给卡死
+    # 特征：进给速率瞬间降为 0，主轴电流急剧升高
+    # 模式：瞬间注入
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="feed_stall",
+        name="进给堵转",
+        description="进给轴卡死，进给速率降为零，主轴电流急剧升高",
+        category="process",
+        default_duration=20.0,
+        tags=["进给", "堵转", "突发"],
+        point_faults=[
+            PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT,
+                             target_value=0.0, noise_scale=0.0),
+            PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT,
+                             multiplier=3.8, noise_scale=1.5),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT,
+                             multiplier=5.0, noise_scale=1.0),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 振动异常 — 工件装夹松动或共振
+    # 特征：三轴振动突然大幅增加，其他指标基本正常
+    # 模式：瞬间注入
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="vibration_spike",
+        name="振动异常",
+        description="工件装夹松动或切削共振，三轴振动突然大幅增加",
+        category="mechanical",
+        default_duration=60.0,
+        tags=["振动", "装夹", "突发"],
+        point_faults=[
+            PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT,
+                             multiplier=6.0, noise_scale=1.0),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT,
+                             multiplier=6.0, noise_scale=1.0),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT,
+                             multiplier=7.0, noise_scale=1.2),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 切削液不足 — 冷却润滑失效
+    # 特征：热量积累 → 振动缓慢升高，电流缓慢升高，进给略降
+    # 模式：渐进式，速度较慢
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="coolant_failure",
+        name="切削液不足",
+        description="切削液供给不足，冷却润滑失效，热量积累导致振动和电流缓慢升高",
+        category="process",
+        default_duration=480.0,
+        tags=["切削液", "冷却", "渐进"],
+        point_faults=[
+            PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL,
+                             multiplier=1.6, noise_scale=0.8),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.GRADUAL,
+                             multiplier=2.0, noise_scale=0.3),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.GRADUAL,
+                             multiplier=2.0, noise_scale=0.3),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.GRADUAL,
+                             multiplier=2.5, noise_scale=0.4),
+            PointFaultConfig(point="feed_rate", mode=FaultMode.GRADUAL,
+                             multiplier=0.75, noise_scale=15.0),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 电源波动 — 供电不稳定
+    # 特征：主轴转速和进给速率出现随机波动，电流不稳定
+    # 模式：瞬间注入（持续期间持续抖动）
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="power_fluctuation",
+        name="电源波动",
+        description="供电电压不稳定，主轴转速和进给速率出现随机波动",
+        category="electrical",
+        default_duration=90.0,
+        tags=["电源", "波动", "突发"],
+        point_faults=[
+            PointFaultConfig(point="spindle_speed", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=300.0),
+            PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=5.0),
+            PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=150.0),
+        ],
+    ),
+]
+
+# 按 id 索引
+_FAULT_TYPE_MAP: dict[str, FaultTypeDefinition] = {ft.id: ft for ft in BUILTIN_FAULT_TYPES}
+
+
+# ---------------------------------------------------------------------------
+# FaultInjector
+# ---------------------------------------------------------------------------
+
+class FaultInjector:
+    """
+    故障注入器，完全独立于 DeviceInstance。
+
+    使用方式：
+        injector = FaultInjector()
+        injector.inject(device, request)   # 注入故障
+        injector.apply(device)             # 每次 tick 后调用，覆盖测点值
+        injector.clear(device_id)          # 手动清除
+    """
+
+    def __init__(self):
+        # device_id -> ActiveFault
+        self._active: dict[str, ActiveFault] = {}
+
+    # ------------------------------------------------------------------
+    # 公开接口
+    # ------------------------------------------------------------------
+
+    def inject(self, device: Any, request: FaultInjectRequest) -> FaultInfo:
+        """向设备注入故障，返回故障信息"""
+        fault_type = _FAULT_TYPE_MAP.get(request.fault_type_id)
+        if not fault_type:
+            raise ValueError(f"Unknown fault type: {request.fault_type_id}")
+
+        duration = request.duration if request.duration is not None else fault_type.default_duration
+
+        # 记录注入时各测点的当前基线值
+        baseline: dict[str, float] = {}
+        for pf in fault_type.point_faults:
+            val = device._point_values.get(pf.point)
+            if val is not None:
+                try:
+                    baseline[pf.point] = float(val)
+                except (TypeError, ValueError):
+                    baseline[pf.point] = 0.0
+
+        fault = ActiveFault(
+            fault_id=uuid.uuid4().hex[:12],
+            device_id=device.id,
+            fault_type_id=fault_type.id,
+            fault_type_name=fault_type.name,
+            intensity=max(0.0, min(1.0, request.intensity)),
+            duration=duration,
+            started_at=time.time(),
+            baseline_values=baseline,
+        )
+        self._active[device.id] = fault
+        logger.info("Fault injected: device=%s type=%s duration=%.0fs",
+                    device.id, fault_type.id, duration)
+        return self._to_info(fault, fault_type)
+
+    def apply(self, device: Any) -> None:
+        """
+        在 device.tick() 之后调用，将故障效果覆盖到 _point_values。
+        故障超时后自动清除。
+        """
+        fault = self._active.get(device.id)
+        if not fault:
+            return
+
+        now = time.time()
+        elapsed = now - fault.started_at
+
+        if elapsed >= fault.duration:
+            self._expire(device, fault)
+            return
+
+        fault_type = _FAULT_TYPE_MAP.get(fault.fault_type_id)
+        if not fault_type:
+            return
+
+        # progress: 0.0（刚注入）→ 1.0（达到峰值）
+        progress = min(elapsed / fault.duration, 1.0)
+
+        for pf in fault_type.point_faults:
+            if pf.point not in device._point_values:
+                continue
+            baseline = fault.baseline_values.get(pf.point, 0.0)
+            if baseline == 0.0:
+                # 基线为 0 时用当前值兜底，避免乘法无效
+                try:
+                    baseline = float(device._point_values[pf.point]) or 1.0
+                except (TypeError, ValueError):
+                    continue
+
+            device._point_values[pf.point] = self._compute_value(
+                pf, baseline, progress, fault.intensity
+            )
+
+    def clear(self, device_id: str) -> bool:
+        """手动清除故障，不恢复基线（让生成器自然恢复）"""
+        if device_id not in self._active:
+            return False
+        fault = self._active.pop(device_id)
+        fault.status = FaultStatus.CLEARED
+        fault.cleared_at = time.time()
+        logger.info("Fault cleared manually: device=%s type=%s", device_id, fault.fault_type_id)
+        return True
+
+    def get_fault(self, device_id: str) -> Optional[FaultInfo]:
+        fault = self._active.get(device_id)
+        if not fault:
+            return None
+        fault_type = _FAULT_TYPE_MAP.get(fault.fault_type_id)
+        return self._to_info(fault, fault_type)
+
+    def list_active(self) -> list[FaultInfo]:
+        result = []
+        for fault in self._active.values():
+            fault_type = _FAULT_TYPE_MAP.get(fault.fault_type_id)
+            result.append(self._to_info(fault, fault_type))
+        return result
+
+    @staticmethod
+    def list_fault_types() -> list[FaultTypeDefinition]:
+        return BUILTIN_FAULT_TYPES
+
+    @staticmethod
+    def get_fault_type(fault_type_id: str) -> Optional[FaultTypeDefinition]:
+        return _FAULT_TYPE_MAP.get(fault_type_id)
+
+    # ------------------------------------------------------------------
+    # 内部逻辑
+    # ------------------------------------------------------------------
+
+    def _compute_value(
+        self,
+        pf: PointFaultConfig,
+        baseline: float,
+        progress: float,
+        intensity: float,
+    ) -> float:
+        """根据故障配置和当前进度计算覆盖值"""
+        if pf.mode == FaultMode.INSTANT:
+            # 瞬间模式：直接用目标值，不随时间变化
+            if pf.target_value is not None:
+                target = pf.target_value
+            elif pf.multiplier is not None:
+                target = baseline * (1.0 + (pf.multiplier - 1.0) * intensity)
+            else:
+                target = baseline
+        else:
+            # 渐进模式：随 progress 线性劣化
+            if pf.target_value is not None:
+                target = baseline + (pf.target_value - baseline) * progress * intensity
+            elif pf.multiplier is not None:
+                target = baseline * (1.0 + (pf.multiplier - 1.0) * progress * intensity)
+            else:
+                target = baseline
+
+        # 叠加随机噪声，模拟真实信号抖动
+        if pf.noise_scale > 0:
+            target += random.gauss(0, pf.noise_scale * intensity)
+
+        return round(max(0.0, target), 4)
+
+    def _expire(self, device: Any, fault: ActiveFault) -> None:
+        """故障到期，从 active 中移除，让生成器自然恢复正常值"""
+        self._active.pop(device.id, None)
+        logger.info("Fault expired: device=%s type=%s", device.id, fault.fault_type_id)
+
+    @staticmethod
+    def _to_info(fault: ActiveFault, fault_type: Optional[FaultTypeDefinition]) -> FaultInfo:
+        now = time.time()
+        elapsed = now - fault.started_at
+        progress = min(elapsed / fault.duration, 1.0)
+        affected = [pf.point for pf in fault_type.point_faults] if fault_type else []
+        return FaultInfo(
+            fault_id=fault.fault_id,
+            device_id=fault.device_id,
+            fault_type_id=fault.fault_type_id,
+            fault_type_name=fault.fault_type_name,
+            status=fault.status,
+            intensity=fault.intensity,
+            duration=fault.duration,
+            elapsed=round(elapsed, 1),
+            progress=round(progress, 3),
+            affected_points=affected,
+            started_at=fault.started_at,
+        )
+
+
+# 全局单例
+fault_injector = FaultInjector()
diff --git a/protoforge/models/fault.py b/protoforge/models/fault.py
new file mode 100644
index 0000000..cc038e0
--- /dev/null
+++ b/protoforge/models/fault.py
@@ -0,0 +1,77 @@
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+class FaultMode(str, Enum):
+    """故障注入模式"""
+    INSTANT = "instant"       # 瞬间跳变到异常值，持续 duration 后恢复
+    GRADUAL = "gradual"       # 渐进式劣化，随时间线性恶化，到 duration 时达到峰值后恢复
+
+
+class FaultStatus(str, Enum):
+    ACTIVE = "active"
+    RECOVERING = "recovering"
+    CLEARED = "cleared"
+
+
+class PointFaultConfig(BaseModel):
+    """单个测点的故障行为定义"""
+    point: str
+    mode: FaultMode = FaultMode.INSTANT
+
+    # INSTANT 模式：直接设置为 target_value（若为 None 则用 multiplier 乘以当前值）
+    target_value: Optional[float] = None
+    multiplier: Optional[float] = None     # 异常值 = 当前正常值 × multiplier
+
+    # GRADUAL 模式：从当前值线性劣化到 target_value 或 multiplier 倍
+    # 劣化程度 = progress(0~1) × (target - baseline)
+    noise_scale: float = 0.0               # 叠加随机噪声幅度，模拟真实抖动
+
+
+class FaultTypeDefinition(BaseModel):
+    """故障类型定义，描述一种真实故障场景"""
+    id: str
+    name: str
+    description: str
+    category: str                          # 故障分类：mechanical / electrical / thermal / process
+    default_duration: float = 120.0        # 默认持续时间（秒）
+    point_faults: list[PointFaultConfig] = Field(default_factory=list)
+    tags: list[str] = Field(default_factory=list)
+
+
+class FaultInjectRequest(BaseModel):
+    """故障注入请求"""
+    fault_type_id: str
+    duration: Optional[float] = None       # 覆盖默认持续时间，None 表示用类型默认值
+    intensity: float = 1.0                 # 故障强度系数 0~1，影响劣化幅度
+
+
+class ActiveFault(BaseModel):
+    """当前激活的故障实例"""
+    fault_id: str                          # 唯一实例 ID
+    device_id: str
+    fault_type_id: str
+    fault_type_name: str
+    status: FaultStatus = FaultStatus.ACTIVE
+    intensity: float = 1.0
+    duration: float = 120.0
+    started_at: float = 0.0
+    cleared_at: Optional[float] = None
+    baseline_values: dict[str, float] = Field(default_factory=dict)  # 注入时的正常基线值
+
+
+class FaultInfo(BaseModel):
+    """故障状态信息（API 响应用）"""
+    fault_id: str
+    device_id: str
+    fault_type_id: str
+    fault_type_name: str
+    status: FaultStatus
+    intensity: float
+    duration: float
+    elapsed: float
+    progress: float                        # 0~1，故障进度
+    affected_points: list[str]
+    started_at: float

From 05f993cae5374642bb5447e1aa260e87aec012f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Wed, 20 May 2026 14:18:21 +0800
Subject: [PATCH 16/31] feat(fault): support fault

---
 web/src/api.js            |   6 ++
 web/src/views/Devices.vue | 169 +++++++++++++++++++++++++++++++++++---
 2 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/web/src/api.js b/web/src/api.js
index 2bc15b3..059d4bd 100644
--- a/web/src/api.js
+++ b/web/src/api.js
@@ -128,4 +128,10 @@ export default {
 
   getSettings: () => d(api.get('/settings')),
   updateSettings: (updates) => d(api.put('/settings', updates)),
+
+  getFaultTypes: () => d(api.get('/faults/types')),
+  getActiveFaults: () => d(api.get('/faults/active')),
+  injectFault: (deviceId, faultTypeId, duration, intensity) => d(api.post(`/devices/${deviceId}/fault`, { fault_type_id: faultTypeId, duration, intensity })),
+  getDeviceFault: (deviceId) => d(api.get(`/devices/${deviceId}/fault`)),
+  clearDeviceFault: (deviceId) => d(api.delete(`/devices/${deviceId}/fault`)),
 }
diff --git a/web/src/views/Devices.vue b/web/src/views/Devices.vue
index 022f3ac..65e0535 100644
--- a/web/src/views/Devices.vue
+++ b/web/src/views/Devices.vue
@@ -90,14 +90,62 @@
       <n-modal v-model:show="showPointsModal" preset="card" title="设备测点" style="width:700px">
         <n-data-table :columns="pointColumns" :data="currentPoints" :bordered="false" size="small" />
       </n-modal>
+
+      <!-- 故障注入 Modal -->
+      <n-modal v-model:show="showFaultModal" preset="card" title="故障注入" style="width:480px">
+        <n-space vertical size="medium">
+          <n-text depth="3" style="font-size:13px">设备：{{ faultTargetDevice?.name }}</n-text>
+          <n-form-item label="故障类型" label-placement="left" label-width="80">
+            <n-select
+              v-model:value="faultTypeId"
+              :options="faultTypeOptions"
+              placeholder="选择故障类型"
+              @update:value="onFaultTypeChange"
+            />
+          </n-form-item>
+          <n-alert v-if="selectedFaultType" type="warning" :bordered="false" style="font-size:12px">
+            <div style="font-weight:500;margin-bottom:4px">{{ selectedFaultType.name }} · {{ faultCategoryLabel(selectedFaultType.category) }}</div>
+            <div style="color:#94a3b8">{{ selectedFaultType.description }}</div>
+            <div style="margin-top:6px;color:#94a3b8">
+              影响测点：{{ selectedFaultType.point_faults.map(p => p.point).join('、') }}
+            </div>
+          </n-alert>
+          <n-form-item label="持续时间" label-placement="left" label-width="80">
+            <n-input-number
+              v-model:value="faultDuration"
+              :min="5"
+              :max="3600"
+              style="width:100%"
+            >
+              <template #suffix>秒</template>
+            </n-input-number>
+          </n-form-item>
+          <n-form-item label="故障强度" label-placement="left" label-width="80">
+            <n-space vertical style="width:100%">
+              <n-slider v-model:value="faultIntensity" :min="0.1" :max="1.0" :step="0.1" />
+              <n-text depth="3" style="font-size:12px">
+                {{ faultIntensityLabel }}（{{ faultIntensity }}）
+              </n-text>
+            </n-space>
+          </n-form-item>
+        </n-space>
+        <template #action>
+          <n-space justify="end">
+            <n-button @click="showFaultModal = false">取消</n-button>
+            <n-button type="error" :loading="faultLoading" :disabled="!faultTypeId" @click="doInjectFault">
+              注入故障
+            </n-button>
+          </n-space>
+        </template>
+      </n-modal>
     </n-space>
   </div>
 </template>
 
 <script setup>
-import { ref, computed, onMounted, h } from 'vue'
+import { ref, computed, onMounted, onUnmounted, h } from 'vue'
 import { NSpace, NSelect, NButton, NDataTable, NModal, NForm, NFormItem, NInput, NTag,
-  NSteps, NStep, NText, NAlert, useMessage, useDialog } from 'naive-ui'
+  NSteps, NStep, NText, NAlert, NInputNumber, NSlider, useMessage, useDialog } from 'naive-ui'
 import { useRouter } from 'vue-router'
 import api from '../api.js'
 
@@ -123,6 +171,17 @@ const qcTemplateId = ref(null)
 const qcDeviceName = ref('')
 const qcLoading = ref(false)
 
+// 故障注入状态
+const showFaultModal = ref(false)
+const faultTargetDevice = ref(null)
+const faultTypes = ref([])
+const faultTypeId = ref(null)
+const faultDuration = ref(120)
+const faultIntensity = ref(1.0)
+const faultLoading = ref(false)
+// device_id -> fault info，用于在列表中显示故障状态
+const activeFaults = ref({})
+
 const protocolLabels = {
   modbus_tcp: 'Modbus TCP', modbus_rtu: 'Modbus RTU', opcua: 'OPC-UA', mqtt: 'MQTT',
   http: 'HTTP', gb28181: 'GB28181', bacnet: 'BACnet', s7: 'S7',
@@ -184,13 +243,25 @@ const columns = [
   },
   { title: '测点', key: 'points', width: 70, render: (row) => (row.points || []).length },
   {
-    title: '操作', key: 'actions', width: 280,
+    title: '故障', key: 'fault', width: 90,
+    render: (row) => {
+      const fault = activeFaults.value[row.id]
+      if (!fault || fault.status === 'none') return h(NTag, { size: 'tiny', bordered: false }, () => '正常')
+      const pct = Math.round((fault.progress || 0) * 100)
+      return h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`)
+    }
+  },
+  {
+    title: '操作', key: 'actions', width: 320,
     render: (row) => h(NSpace, { size: 4 }, () => [
       h(NButton, { size: 'tiny', tertiary: true, onClick: () => viewPoints(row.id) }, () => '测点'),
       h(NButton, { size: 'tiny', tertiary: true, onClick: () => openEditDevice(row) }, () => '编辑'),
       row.status === 'online' || row.status === 'running'
         ? h(NButton, { size: 'tiny', type: 'warning', secondary: true, onClick: () => toggleDevice(row.id, 'stop') }, () => '停止')
         : h(NButton, { size: 'tiny', type: 'primary', secondary: true, onClick: () => toggleDevice(row.id, 'start') }, () => '启动'),
+      activeFaults.value[row.id] && activeFaults.value[row.id].status !== 'none'
+        ? h(NButton, { size: 'tiny', type: 'warning', secondary: true, onClick: () => stopFault(row.id) }, () => '停止故障')
+        : h(NButton, { size: 'tiny', type: 'error', ghost: true, disabled: row.status !== 'online', onClick: () => openFaultModal(row) }, () => '注入故障'),
       h(NButton, { size: 'tiny', type: 'error', secondary: true, onClick: () => confirmDeleteDevice(row) }, () => '删除'),
     ])
   },
@@ -220,13 +291,6 @@ async function doQuickCreate() {
   } finally { qcLoading.value = false }
 }
 
-async function loadData() {
-  try {
-    const [devRes, protoRes, tmplRes] = await Promise.all([api.getDevices(), api.getProtocols(), api.getTemplates()])
-    devices.value = devRes; protocols.value = protoRes; templates.value = tmplRes
-  } catch (e) { message.error('加载数据失败: ' + (e.response?.data?.detail || e.message)) }
-}
-
 async function createDevice() {
   creating.value = true
   try {
@@ -282,5 +346,88 @@ async function viewPoints(id) {
   catch (e) { message.error('读取测点失败: ' + (e.response?.data?.detail || e.message)) }
 }
 
-onMounted(loadData)
+// 故障注入相关
+const faultTypeOptions = computed(() =>
+  faultTypes.value.map(t => ({ label: `${t.name}（${faultCategoryLabel(t.category)}）`, value: t.id }))
+)
+
+const selectedFaultType = computed(() =>
+  faultTypes.value.find(t => t.id === faultTypeId.value) || null
+)
+
+const faultIntensityLabel = computed(() => {
+  const v = faultIntensity.value
+  if (v <= 0.3) return '轻微'
+  if (v <= 0.6) return '中等'
+  if (v <= 0.8) return '严重'
+  return '极严重'
+})
+
+function faultCategoryLabel(category) {
+  const map = { mechanical: '机械', thermal: '热', electrical: '电气', process: '工艺' }
+  return map[category] || category
+}
+
+function onFaultTypeChange(val) {
+  const t = faultTypes.value.find(f => f.id === val)
+  if (t && t.default_duration) faultDuration.value = t.default_duration
+}
+
+function openFaultModal(row) {
+  faultTargetDevice.value = row
+  faultTypeId.value = null
+  faultDuration.value = 120
+  faultIntensity.value = 1.0
+  showFaultModal.value = true
+}
+
+async function doInjectFault() {
+  if (!faultTypeId.value || !faultTargetDevice.value) return
+  faultLoading.value = true
+  try {
+    await api.injectFault(faultTargetDevice.value.id, faultTypeId.value, faultDuration.value, faultIntensity.value)
+    message.success(`已向设备 "${faultTargetDevice.value.name}" 注入故障`)
+    showFaultModal.value = false
+    await loadFaultStatus()
+  } catch (e) {
+    message.error('注入失败: ' + (e.response?.data?.detail || e.message))
+  } finally { faultLoading.value = false }
+}
+
+async function stopFault(deviceId) {
+  try {
+    await api.clearDeviceFault(deviceId)
+    message.success('故障已停止')
+    await loadFaultStatus()
+  } catch (e) {
+    message.error('停止故障失败: ' + (e.response?.data?.detail || e.message))
+  }
+}
+
+async function loadFaultStatus() {
+  try {
+    const list = await api.getActiveFaults()
+    const map = {}
+    for (const f of list) map[f.device_id] = f
+    activeFaults.value = map
+  } catch (e) { /* 静默失败 */ }
+}
+
+async function loadData() {
+  try {
+    const [devRes, protoRes, tmplRes, ftRes] = await Promise.all([
+      api.getDevices(), api.getProtocols(), api.getTemplates(), api.getFaultTypes()
+    ])
+    devices.value = devRes; protocols.value = protoRes; templates.value = tmplRes; faultTypes.value = ftRes
+    await loadFaultStatus()
+  } catch (e) { message.error('加载数据失败: ' + (e.response?.data?.detail || e.message)) }
+}
+
+let faultPollTimer = null
+onMounted(() => {
+  loadData()
+  faultPollTimer = setInterval(loadFaultStatus, 3000)
+})
+
+onUnmounted(() => { if (faultPollTimer) clearInterval(faultPollTimer) })
 </script>

From 550d8e20b98481a5390be716104142d6aec87770 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Wed, 20 May 2026 19:57:06 +0800
Subject: [PATCH 17/31] feat(ai): support ai

---
 ai/predict.py | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100755 ai/predict.py

diff --git a/ai/predict.py b/ai/predict.py
new file mode 100755
index 0000000..b70f822
--- /dev/null
+++ b/ai/predict.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+import requests
+import numpy as np
+from datetime import datetime, timedelta
+
+VM_URL = "http://localhost:8428"
+DEVICE_ID = "fanuc-cnc"
+METRIC = f'feed_rate{{device_id="{DEVICE_ID}"}}'
+
+def fetch_history(minutes=30):
+    """从VM拉取历史数据"""
+    end = datetime.now()
+    start = end - timedelta(minutes=minutes)
+    resp = requests.get(f"{VM_URL}/api/v1/query_range", params={
+        "query": METRIC,
+        "start": start.timestamp(),
+        "end": end.timestamp(),
+        "step": "1s",
+    })
+    result = resp.json()["data"]["result"]
+    if not result:
+        return [], []
+    values = result[0]["values"]
+    ts = [float(v[0]) for v in values]
+    ys = [float(v[1]) for v in values]
+    return ts, ys
+
+def predict_next(ts, ys, horizon=60):
+    """
+    用FFT检测主频，拟合正弦波，外推未来horizon秒
+    适合周期性信号
+    """
+    if len(ys) < 60:
+        return [], []
+
+    ys = np.array(ys)
+    n = len(ys)
+    dt = 1.0  # 1秒采样
+
+    # FFT找主频
+    fft = np.fft.rfft(ys - ys.mean())
+    freqs = np.fft.rfftfreq(n, d=dt)
+    dominant_idx = np.argmax(np.abs(fft[1:])) + 1
+    dominant_freq = freqs[dominant_idx]
+    period = 1.0 / dominant_freq if dominant_freq > 0 else 60
+
+    # 拟合：y = A*sin(2π/T * t + φ) + offset
+    from scipy.optimize import curve_fit
+    t_rel = np.arange(n, dtype=float)
+    offset = ys.mean()
+    amplitude = (ys.max() - ys.min()) / 2
+
+    def sine_model(t, A, T, phi, C):
+        return A * np.sin(2 * np.pi / T * t + phi) + C
+
+    try:
+        popt, _ = curve_fit(
+            sine_model, t_rel, ys,
+            p0=[amplitude, period, 0, offset],
+            maxfev=5000
+        )
+        # 外推
+        t_future = np.arange(n, n + horizon, dtype=float)
+        y_pred = sine_model(t_future, *popt)
+        ts_future = [ts[-1] + i + 1 for i in range(horizon)]
+        return ts_future, y_pred.tolist()
+    except Exception:
+        # 拟合失败降级为线性
+        slope = (ys[-1] - ys[-10]) / 10
+        ts_future = [ts[-1] + i + 1 for i in range(horizon)]
+        y_pred = [ys[-1] + slope * (i + 1) for i in range(horizon)]
+        return ts_future, y_pred
+
+def write_predictions(ts_future, y_pred, metric_name="protoforge_feed_rate_predicted"):
+    """写回VictoriaMetrics"""
+    lines = []
+    for t, y in zip(ts_future, y_pred):
+        ts_ms = int(t * 1000)
+        lines.append(f'{metric_name}{{device_id="{DEVICE_ID}"}} {y:.2f} {ts_ms}')
+    payload = "\n".join(lines)
+    requests.post(f"{VM_URL}/api/v1/import/prometheus", data=payload)
+
+def run_once():
+    ts, ys = fetch_history(minutes=30)
+    if len(ys) < 60:
+        print("数据不足")
+        return
+    ts_future, y_pred = predict_next(ts, ys, horizon=120)
+    write_predictions(ts_future, y_pred)
+    print(f"写入 {len(y_pred)} 个预测点，预测到 +{len(y_pred)}s")
+
+if __name__ == "__main__":
+    import time
+    while True:
+        run_once()
+        time.sleep(30)  # 每30秒重新预测一次

From e8f70d09c27d53bff6c8f310b7a2064496632a33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Wed, 20 May 2026 20:29:13 +0800
Subject: [PATCH 18/31] feat(predict_v2): add predict_v2 python file

---
 ai/predict_v2.py | 206 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100755 ai/predict_v2.py

diff --git a/ai/predict_v2.py b/ai/predict_v2.py
new file mode 100755
index 0000000..df5dd97
--- /dev/null
+++ b/ai/predict_v2.py
@@ -0,0 +1,206 @@
+# -*- coding: utf-8 -*-
+"""
+ProtoForge 预测服务 v2
+从 VictoriaMetrics 拉取历史数据，用 FFT + 正弦拟合预测未来值，写回 VM。
+预测值时间戳为未来时间，Grafana 中预测线出现在实测线右侧延伸处。
+"""
+
+import logging
+import time
+from datetime import datetime, timedelta
+
+import numpy as np
+import requests
+from scipy.optimize import curve_fit
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+# ── 配置 ──────────────────────────────────────────────────────────────────────
+VM_URL = "http://localhost:8428"
+
+# 要预测的指标列表，每项：(查询表达式, 写回指标名)
+PREDICT_TARGETS = [
+    ('feed_rate{device_id="fanuc-cnc"}',       "feed_rate_predicted"),
+    ('spindle_speed{device_id="fanuc-cnc"}',    "spindle_speed_predicted"),
+    ('spindle_current{device_id="fanuc-cnc"}',  "spindle_current_predicted"),
+    ('vibration_x{device_id="fanuc-cnc"}',      "vibration_x_predicted"),
+    ('vibration_y{device_id="fanuc-cnc"}',      "vibration_y_predicted"),
+    ('vibration_z{device_id="fanuc-cnc"}',      "vibration_z_predicted"),
+]
+
+HISTORY_MINUTES = 30   # 拉取多少分钟历史数据用于拟合
+HORIZON_SECONDS = 120  # 预测未来多少秒
+POLL_INTERVAL   = 30   # 每隔多少秒重新预测一次
+MIN_POINTS      = 120  # 至少需要多少个历史点才开始预测
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def fetch_history(query: str, minutes: int = HISTORY_MINUTES):
+    """从 VictoriaMetrics 拉取历史时序数据，返回 (timestamps, values)。"""
+    now = datetime.now()
+    start = now - timedelta(minutes=minutes)
+    try:
+        resp = requests.get(
+            f"{VM_URL}/api/v1/query_range",
+            params={
+                "query": query,
+                "start": start.timestamp(),
+                "end":   now.timestamp(),
+                "step":  "1s",
+            },
+            timeout=10,
+        )
+        resp.raise_for_status()
+    except requests.RequestException as e:
+        logger.error("拉取数据失败 query=%s: %s", query, e)
+        return [], []
+
+    result = resp.json().get("data", {}).get("result", [])
+    if not result:
+        return [], []
+
+    values = result[0]["values"]
+    ts = [float(v[0]) for v in values]
+    ys = [float(v[1]) for v in values]
+    return ts, ys
+
+
+def _sine_model(t, A, T, phi, C):
+    return A * np.sin(2 * np.pi / T * t + phi) + C
+
+
+def predict_next(ts: list, ys: list, horizon: int = HORIZON_SECONDS):
+    """
+    用 FFT 检测主频，拟合正弦波，外推未来 horizon 秒。
+    返回 (future_timestamps, predicted_values)，时间戳均在最后一个真实点之后。
+    降级策略：拟合失败时用最近 10 点线性外推。
+    """
+    ys_arr = np.array(ys)
+    n = len(ys_arr)
+
+    # ── FFT 找主频 ────────────────────────────────────────────────────────────
+    fft_vals = np.fft.rfft(ys_arr - ys_arr.mean())
+    freqs = np.fft.rfftfreq(n, d=1.0)  # d=1 表示 1 秒采样间隔
+    # 跳过直流分量（index 0）
+    dominant_idx = int(np.argmax(np.abs(fft_vals[1:]))) + 1
+    dominant_freq = freqs[dominant_idx]
+    period = 1.0 / dominant_freq if dominant_freq > 0 else 60.0
+    period = float(np.clip(period, 5.0, 3600.0))  # 限制在合理范围
+
+    # ── 正弦拟合 ──────────────────────────────────────────────────────────────
+    t_rel = np.arange(n, dtype=float)
+    amplitude = (ys_arr.max() - ys_arr.min()) / 2.0
+    offset = float(ys_arr.mean())
+
+    # 最后一个真实数据点的 Unix 时间戳（秒）
+    last_ts = ts[-1]
+
+    try:
+        popt, _ = curve_fit(
+            _sine_model,
+            t_rel,
+            ys_arr,
+            p0=[amplitude, period, 0.0, offset],
+            bounds=(
+                [0,       5.0,    -np.pi, ys_arr.min()],
+                [np.inf,  3600.0,  np.pi, ys_arr.max()],
+            ),
+            maxfev=8000,
+        )
+        t_future = np.arange(n, n + horizon, dtype=float)
+        y_pred = _sine_model(t_future, *popt)
+        # 裁剪到历史数据值域，避免外推飞出合理范围
+        y_pred = np.clip(y_pred, ys_arr.min() * 0.5, ys_arr.max() * 1.5)
+
+        # 未来时间戳：last_ts + 1s, +2s, ..., +horizon s
+        ts_future = [last_ts + i + 1 for i in range(horizon)]
+        logger.debug("正弦拟合成功 period=%.1fs amplitude=%.2f", popt[1], popt[0])
+        return ts_future, y_pred.tolist()
+
+    except Exception as e:
+        logger.warning("正弦拟合失败，降级为线性外推: %s", e)
+        tail = min(10, n)
+        slope = (ys_arr[-1] - ys_arr[-tail]) / tail
+        ts_future = [last_ts + i + 1 for i in range(horizon)]
+        y_pred = [float(ys_arr[-1] + slope * (i + 1)) for i in range(horizon)]
+        return ts_future, y_pred
+
+
+def write_predictions(ts_future: list, y_pred: list, metric_name: str, extra_labels: dict = None):
+    """
+    将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。
+    时间戳为毫秒级 Unix 时间戳，对应未来时间点。
+    """
+    label_str = ""
+    if extra_labels:
+        parts = [f'{k}="{v}"' for k, v in extra_labels.items()]
+        label_str = "{" + ",".join(parts) + "}"
+
+    lines = []
+    for t, y in zip(ts_future, y_pred):
+        ts_ms = int(t * 1000)
+        lines.append(f"{metric_name}{label_str} {y:.4f} {ts_ms}")
+
+    payload = "\n".join(lines)
+    try:
+        resp = requests.post(
+            f"{VM_URL}/api/v1/import/prometheus",
+            data=payload,
+            timeout=10,
+        )
+        resp.raise_for_status()
+    except requests.RequestException as e:
+        logger.error("写入预测数据失败 metric=%s: %s", metric_name, e)
+
+
+def _parse_labels(query: str) -> dict:
+    """从查询表达式中解析标签，如 feed_rate{device_id="fanuc-cnc"} → {"device_id": "fanuc-cnc"}"""
+    labels = {}
+    if "{" not in query:
+        return labels
+    label_part = query[query.index("{") + 1: query.index("}")]
+    for item in label_part.split(","):
+        if "=" in item:
+            k, v = item.split("=", 1)
+            labels[k.strip()] = v.strip().strip('"')
+    return labels
+
+
+def run_once():
+    now_str = datetime.now().strftime("%H:%M:%S")
+    for query, pred_metric in PREDICT_TARGETS:
+        ts, ys = fetch_history(query)
+        if len(ys) < MIN_POINTS:
+            logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
+            continue
+
+        ts_future, y_pred = predict_next(ts, ys, horizon=HORIZON_SECONDS)
+        if not ts_future:
+            continue
+
+        extra_labels = _parse_labels(query)
+        write_predictions(ts_future, y_pred, pred_metric, extra_labels)
+
+        future_time = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
+        logger.info(
+            "[%s] %-40s → %-35s 写入 %d 点，预测至 %s",
+            now_str, query, pred_metric, len(y_pred), future_time,
+        )
+
+
+def main():
+    logger.info(
+        "预测服务启动  VM=%s  预测窗口=%ds  轮询间隔=%ds",
+        VM_URL, HORIZON_SECONDS, POLL_INTERVAL,
+    )
+    while True:
+        run_once()
+        time.sleep(POLL_INTERVAL)
+
+
+if __name__ == "__main__":
+    main()

From 57df20284645347aba7de4e2399640e05b6d0b3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Wed, 20 May 2026 21:13:52 +0800
Subject: [PATCH 19/31] fix

---
 ai/predict_v2.py | 536 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 450 insertions(+), 86 deletions(-)

diff --git a/ai/predict_v2.py b/ai/predict_v2.py
index df5dd97..bc425c8 100755
--- a/ai/predict_v2.py
+++ b/ai/predict_v2.py
@@ -1,56 +1,93 @@
 # -*- coding: utf-8 -*-
 """
-ProtoForge 预测服务 v2
-从 VictoriaMetrics 拉取历史数据，用 FFT + 正弦拟合预测未来值，写回 VM。
-预测值时间戳为未来时间，Grafana 中预测线出现在实测线右侧延伸处。
+ProtoForge 预测服务 v3
+
+修复点：
+1. 解决 HORIZON_SECONDS > POLL_INTERVAL 时，多轮预测窗口重叠导致 Grafana 出现毛刺/竖线问题。
+2. 每轮写入新预测前，删除同一个预测 metric 的旧预测序列，只保留最新一轮预测。
+3. 预测时间戳按整秒写入，避免毫秒时间戳和 Grafana step 不对齐。
+4. 拟合使用真实 timestamp 相对时间，不再假设历史数据严格 1 秒等间隔。
+5. 对历史数据做排序、去重、NaN/Inf 清洗。
 """
 
 import logging
+import math
+import re
 import time
 from datetime import datetime, timedelta
+from typing import Dict, List, Tuple
 
 import numpy as np
 import requests
 from scipy.optimize import curve_fit
 
+
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(message)s",
 )
+
 logger = logging.getLogger(__name__)
 
+
 # ── 配置 ──────────────────────────────────────────────────────────────────────
+
 VM_URL = "http://localhost:8428"
 
-# 要预测的指标列表，每项：(查询表达式, 写回指标名)
 PREDICT_TARGETS = [
-    ('feed_rate{device_id="fanuc-cnc"}',       "feed_rate_predicted"),
-    ('spindle_speed{device_id="fanuc-cnc"}',    "spindle_speed_predicted"),
-    ('spindle_current{device_id="fanuc-cnc"}',  "spindle_current_predicted"),
-    ('vibration_x{device_id="fanuc-cnc"}',      "vibration_x_predicted"),
-    ('vibration_y{device_id="fanuc-cnc"}',      "vibration_y_predicted"),
-    ('vibration_z{device_id="fanuc-cnc"}',      "vibration_z_predicted"),
+    ('feed_rate{device_id="fanuc-cnc"}', "feed_rate_predicted"),
+    ('spindle_speed{device_id="fanuc-cnc"}', "spindle_speed_predicted"),
+    ('spindle_current{device_id="fanuc-cnc"}', "spindle_current_predicted"),
+    ('vibration_x{device_id="fanuc-cnc"}', "vibration_x_predicted"),
+    ('vibration_y{device_id="fanuc-cnc"}', "vibration_y_predicted"),
+    ('vibration_z{device_id="fanuc-cnc"}', "vibration_z_predicted"),
 ]
 
-HISTORY_MINUTES = 30   # 拉取多少分钟历史数据用于拟合
-HORIZON_SECONDS = 120  # 预测未来多少秒
-POLL_INTERVAL   = 30   # 每隔多少秒重新预测一次
-MIN_POINTS      = 120  # 至少需要多少个历史点才开始预测
+HISTORY_MINUTES = 30
+HORIZON_SECONDS = 120
+POLL_INTERVAL = 30
+MIN_POINTS = 120
+QUERY_STEP = "1s"
+
+# 关键修复：每轮写入前删除旧预测，避免 120s 预测窗口和 30s 轮询周期重叠
+CLEAR_OLD_PREDICTIONS = True
+
+# 如果删除旧预测失败，是否跳过本轮写入。
+# 建议 True，避免继续叠加脏数据。
+SKIP_WRITE_IF_CLEAR_FAILED = True
+
+# 给新预测数据加一个稳定标签，方便 Grafana 查询过滤。
+# Grafana 可以查询：feed_rate_predicted{device_id="fanuc-cnc",forecast="latest"}
+EXTRA_PREDICT_LABELS = {
+    "forecast": "latest",
+    "source": "protoforge",
+}
+
+# 正弦周期限制
+MIN_PERIOD_SECONDS = 5.0
+MAX_PERIOD_SECONDS = 3600.0
+
 # ─────────────────────────────────────────────────────────────────────────────
 
 
-def fetch_history(query: str, minutes: int = HISTORY_MINUTES):
-    """从 VictoriaMetrics 拉取历史时序数据，返回 (timestamps, values)。"""
+def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]:
+    """
+    从 VictoriaMetrics 拉取历史时序数据。
+    返回：
+        timestamps: Unix 秒级时间戳
+        values: float 数值
+    """
     now = datetime.now()
     start = now - timedelta(minutes=minutes)
+
     try:
         resp = requests.get(
             f"{VM_URL}/api/v1/query_range",
             params={
                 "query": query,
                 "start": start.timestamp(),
-                "end":   now.timestamp(),
-                "step":  "1s",
+                "end": now.timestamp(),
+                "step": QUERY_STEP,
             },
             timeout=10,
         )
@@ -59,148 +96,475 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES):
         logger.error("拉取数据失败 query=%s: %s", query, e)
         return [], []
 
-    result = resp.json().get("data", {}).get("result", [])
+    try:
+        result = resp.json().get("data", {}).get("result", [])
+    except Exception as e:
+        logger.error("解析 VM 返回失败 query=%s: %s", query, e)
+        return [], []
+
     if not result:
         return [], []
 
-    values = result[0]["values"]
-    ts = [float(v[0]) for v in values]
-    ys = [float(v[1]) for v in values]
+    values = result[0].get("values", [])
+    if not values:
+        return [], []
+
+    ts = []
+    ys = []
+
+    for item in values:
+        if len(item) < 2:
+            continue
+
+        try:
+            t = float(item[0])
+            y = float(item[1])
+        except Exception:
+            continue
+
+        if not math.isfinite(t) or not math.isfinite(y):
+            continue
+
+        ts.append(t)
+        ys.append(y)
+
     return ts, ys
 
 
-def _sine_model(t, A, T, phi, C):
-    return A * np.sin(2 * np.pi / T * t + phi) + C
+def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    清洗历史数据：
+    1. 转换为整秒时间戳
+    2. 排序
+    3. 同一秒多个值时保留最后一个
+    4. 插值补齐中间缺失秒
+    """
+    if not ts or not ys or len(ts) != len(ys):
+        return np.array([]), np.array([])
+
+    data = {}
+
+    for t, y in zip(ts, ys):
+        try:
+            sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(sec) or not math.isfinite(val):
+            continue
+
+        data[sec] = val
+
+    if not data:
+        return np.array([]), np.array([])
+
+    sorted_items = sorted(data.items(), key=lambda x: x[0])
 
+    ts_clean = np.array([x[0] for x in sorted_items], dtype=float)
+    ys_clean = np.array([x[1] for x in sorted_items], dtype=float)
 
-def predict_next(ts: list, ys: list, horizon: int = HORIZON_SECONDS):
+    if len(ts_clean) < 2:
+        return ts_clean, ys_clean
+
+    start_sec = int(ts_clean[0])
+    end_sec = int(ts_clean[-1])
+
+    if end_sec <= start_sec:
+        return ts_clean, ys_clean
+
+    # 统一为 1 秒网格，减少 query_range 缺点、抖动、缺失点对 FFT 的影响
+    ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float)
+    ys_grid = np.interp(ts_grid, ts_clean, ys_clean)
+
+    return ts_grid, ys_grid
+
+
+def _sine_model(t: np.ndarray, A: float, T: float, phi: float, C: float) -> np.ndarray:
+    return A * np.sin(2.0 * np.pi / T * t + phi) + C
+
+
+def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
     """
-    用 FFT 检测主频，拟合正弦波，外推未来 horizon 秒。
-    返回 (future_timestamps, predicted_values)，时间戳均在最后一个真实点之后。
-    降级策略：拟合失败时用最近 10 点线性外推。
+    使用 FFT 估算主周期。
+    ys_arr 默认是 1 秒间隔。
     """
-    ys_arr = np.array(ys)
     n = len(ys_arr)
 
-    # ── FFT 找主频 ────────────────────────────────────────────────────────────
-    fft_vals = np.fft.rfft(ys_arr - ys_arr.mean())
-    freqs = np.fft.rfftfreq(n, d=1.0)  # d=1 表示 1 秒采样间隔
-    # 跳过直流分量（index 0）
-    dominant_idx = int(np.argmax(np.abs(fft_vals[1:]))) + 1
-    dominant_freq = freqs[dominant_idx]
-    period = 1.0 / dominant_freq if dominant_freq > 0 else 60.0
-    period = float(np.clip(period, 5.0, 3600.0))  # 限制在合理范围
+    if n < 4:
+        return 60.0
+
+    centered = ys_arr - np.mean(ys_arr)
+
+    if np.allclose(centered, 0):
+        return 60.0
+
+    fft_vals = np.fft.rfft(centered)
+    freqs = np.fft.rfftfreq(n, d=1.0)
+
+    if len(freqs) <= 1:
+        return 60.0
+
+    # 跳过直流分量 index 0
+    power = np.abs(fft_vals[1:])
+    if len(power) == 0 or np.max(power) <= 0:
+        return 60.0
+
+    dominant_idx = int(np.argmax(power)) + 1
+    dominant_freq = float(freqs[dominant_idx])
+
+    if dominant_freq <= 0:
+        return 60.0
+
+    period = 1.0 / dominant_freq
+    period = float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    return period
 
-    # ── 正弦拟合 ──────────────────────────────────────────────────────────────
-    t_rel = np.arange(n, dtype=float)
-    amplitude = (ys_arr.max() - ys_arr.min()) / 2.0
-    offset = float(ys_arr.mean())
 
-    # 最后一个真实数据点的 Unix 时间戳（秒）
-    last_ts = ts[-1]
+def predict_next(
+    ts: List[float],
+    ys: List[float],
+    horizon: int = HORIZON_SECONDS,
+    start_from_now: bool = True,
+) -> Tuple[List[float], List[float]]:
+    """
+    用 FFT 检测主频，拟合正弦波，外推未来 horizon 秒。
+    返回：
+        future_timestamps: 未来整秒时间戳
+        predicted_values: 预测值
+    """
+    ts_grid, ys_grid = normalize_history(ts, ys)
+
+    if len(ys_grid) < MIN_POINTS:
+        return [], []
+
+    n = len(ys_grid)
+
+    y_min = float(np.min(ys_grid))
+    y_max = float(np.max(ys_grid))
+    y_mean = float(np.mean(ys_grid))
+    y_range = y_max - y_min
+
+    # 数据几乎不波动时，直接使用最后一个值保持
+    if y_range <= 1e-9:
+        base_ts = int(time.time()) if start_from_now else int(ts_grid[-1])
+        base_ts = max(base_ts, int(ts_grid[-1]))
+
+        ts_future = [base_ts + i + 1 for i in range(horizon)]
+        y_pred = [float(ys_grid[-1])] * horizon
+        return ts_future, y_pred
+
+    period = estimate_period_by_fft(ys_grid)
+
+    # 用真实时间戳做相对时间，而不是 np.arange(n)
+    t_fit = ts_grid - ts_grid[0]
+
+    amplitude = y_range / 2.0
+    offset = y_mean
+
+    # 预测起点统一对齐到整秒
+    if start_from_now:
+        base_ts = int(time.time())
+    else:
+        base_ts = int(ts_grid[-1])
+
+    # 避免因为 VM 查询延迟导致预测点落在最后一个真实点之前
+    base_ts = max(base_ts, int(ts_grid[-1]))
+
+    ts_future_arr = np.arange(base_ts + 1, base_ts + 1 + horizon, 1, dtype=float)
+    t_future = ts_future_arr - ts_grid[0]
 
     try:
         popt, _ = curve_fit(
             _sine_model,
-            t_rel,
-            ys_arr,
+            t_fit,
+            ys_grid,
             p0=[amplitude, period, 0.0, offset],
             bounds=(
-                [0,       5.0,    -np.pi, ys_arr.min()],
-                [np.inf,  3600.0,  np.pi, ys_arr.max()],
+                [0.0, MIN_PERIOD_SECONDS, -2.0 * np.pi, y_min - y_range],
+                [np.inf, MAX_PERIOD_SECONDS, 2.0 * np.pi, y_max + y_range],
             ),
-            maxfev=8000,
+            maxfev=12000,
+        )
+
+        y_pred_arr = _sine_model(t_future, *popt)
+
+        # 裁剪到合理范围，避免拟合异常时飞出去
+        margin = y_range * 0.2
+        lower = y_min - margin
+        upper = y_max + margin
+        y_pred_arr = np.clip(y_pred_arr, lower, upper)
+
+        if not np.all(np.isfinite(y_pred_arr)):
+            raise ValueError("预测结果包含 NaN/Inf")
+
+        logger.debug(
+            "正弦拟合成功 period=%.2fs amplitude=%.4f offset=%.4f",
+            popt[1],
+            popt[0],
+            popt[3],
         )
-        t_future = np.arange(n, n + horizon, dtype=float)
-        y_pred = _sine_model(t_future, *popt)
-        # 裁剪到历史数据值域，避免外推飞出合理范围
-        y_pred = np.clip(y_pred, ys_arr.min() * 0.5, ys_arr.max() * 1.5)
 
-        # 未来时间戳：last_ts + 1s, +2s, ..., +horizon s
-        ts_future = [last_ts + i + 1 for i in range(horizon)]
-        logger.debug("正弦拟合成功 period=%.1fs amplitude=%.2f", popt[1], popt[0])
-        return ts_future, y_pred.tolist()
+        return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist()
 
     except Exception as e:
-        logger.warning("正弦拟合失败，降级为线性外推: %s", e)
+        logger.warning("正弦拟合失败，降级为最近值平滑外推: %s", e)
+
+        # 降级策略：用最近 10 个点的均值保持，避免线性外推越走越偏
         tail = min(10, n)
-        slope = (ys_arr[-1] - ys_arr[-tail]) / tail
-        ts_future = [last_ts + i + 1 for i in range(horizon)]
-        y_pred = [float(ys_arr[-1] + slope * (i + 1)) for i in range(horizon)]
+        last_value = float(np.mean(ys_grid[-tail:]))
+
+        ts_future = ts_future_arr.tolist()
+        y_pred = [last_value] * horizon
+
         return ts_future, y_pred
 
 
-def write_predictions(ts_future: list, y_pred: list, metric_name: str, extra_labels: dict = None):
+def prom_escape_label_value(value: str) -> str:
+    """
+    Prometheus exposition label value 转义。
+    """
+    return (
+        str(value)
+        .replace("\\", "\\\\")
+        .replace("\n", "\\n")
+        .replace('"', '\\"')
+    )
+
+
+def build_selector(metric_name: str, labels: Dict[str, str]) -> str:
+    """
+    构造 PromQL selector，用于 delete_series。
+
+    示例：
+        feed_rate_predicted{device_id="fanuc-cnc"}
+    """
+    if not labels:
+        return metric_name
+
+    parts = []
+    for k in sorted(labels.keys()):
+        v = prom_escape_label_value(labels[k])
+        parts.append(f'{k}="{v}"')
+
+    return f'{metric_name}' + "{" + ",".join(parts) + "}"
+
+
+def delete_old_predictions(metric_name: str, base_labels: Dict[str, str]) -> bool:
+    """
+    删除旧预测序列，避免多轮预测窗口重叠。
+
+    注意：
+    这里故意只用 base_labels，比如 device_id。
+    不带 forecast/source 标签，是为了兼容旧版本脚本写入的无 forecast 标签数据。
+    """
+    selector = build_selector(metric_name, base_labels)
+
+    try:
+        resp = requests.post(
+            f"{VM_URL}/api/v1/admin/tsdb/delete_series",
+            params=[("match[]", selector)],
+            timeout=10,
+        )
+
+        if resp.status_code not in (200, 204):
+            logger.error(
+                "删除旧预测数据失败 metric=%s selector=%s status=%s body=%s",
+                metric_name,
+                selector,
+                resp.status_code,
+                resp.text[:500],
+            )
+            return False
+
+        logger.debug("已删除旧预测数据 selector=%s", selector)
+        return True
+
+    except requests.RequestException as e:
+        logger.error("删除旧预测数据异常 metric=%s selector=%s: %s", metric_name, selector, e)
+        return False
+
+
+def write_predictions(
+    ts_future: List[float],
+    y_pred: List[float],
+    metric_name: str,
+    labels: Dict[str, str] = None,
+) -> bool:
     """
     将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。
-    时间戳为毫秒级 Unix 时间戳，对应未来时间点。
+    时间戳为毫秒级 Unix timestamp。
     """
+    if labels is None:
+        labels = {}
+
+    if not ts_future or not y_pred or len(ts_future) != len(y_pred):
+        logger.warning("预测数据为空或长度不一致 metric=%s", metric_name)
+        return False
+
     label_str = ""
-    if extra_labels:
-        parts = [f'{k}="{v}"' for k, v in extra_labels.items()]
+    if labels:
+        parts = []
+        for k in sorted(labels.keys()):
+            v = prom_escape_label_value(labels[k])
+            parts.append(f'{k}="{v}"')
         label_str = "{" + ",".join(parts) + "}"
 
     lines = []
+
     for t, y in zip(ts_future, y_pred):
-        ts_ms = int(t * 1000)
-        lines.append(f"{metric_name}{label_str} {y:.4f} {ts_ms}")
+        try:
+            ts_sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(ts_sec) or not math.isfinite(val):
+            continue
+
+        ts_ms = ts_sec * 1000
+        lines.append(f"{metric_name}{label_str} {val:.6f} {ts_ms}")
+
+    if not lines:
+        logger.warning("没有可写入的预测点 metric=%s", metric_name)
+        return False
+
+    payload = "\n".join(lines) + "\n"
 
-    payload = "\n".join(lines)
     try:
         resp = requests.post(
             f"{VM_URL}/api/v1/import/prometheus",
-            data=payload,
+            data=payload.encode("utf-8"),
+            headers={
+                "Content-Type": "text/plain; version=0.0.4; charset=utf-8",
+            },
             timeout=10,
         )
         resp.raise_for_status()
+        return True
+
     except requests.RequestException as e:
         logger.error("写入预测数据失败 metric=%s: %s", metric_name, e)
+        return False
+
+
+_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*')
 
 
-def _parse_labels(query: str) -> dict:
-    """从查询表达式中解析标签，如 feed_rate{device_id="fanuc-cnc"} → {"device_id": "fanuc-cnc"}"""
+def _parse_labels(query: str) -> Dict[str, str]:
+    """
+    从查询表达式中解析标签。
+
+    示例：
+        feed_rate{device_id="fanuc-cnc"} -> {"device_id": "fanuc-cnc"}
+    """
     labels = {}
-    if "{" not in query:
+
+    if "{" not in query or "}" not in query:
         return labels
-    label_part = query[query.index("{") + 1: query.index("}")]
-    for item in label_part.split(","):
-        if "=" in item:
-            k, v = item.split("=", 1)
-            labels[k.strip()] = v.strip().strip('"')
+
+    try:
+        label_part = query[query.index("{") + 1: query.rindex("}")]
+    except Exception:
+        return labels
+
+    for match in _LABEL_PATTERN.finditer(label_part):
+        key = match.group(1)
+        value = match.group(2)
+        value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\")
+        labels[key] = value
+
     return labels
 
 
+def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]:
+    result = {}
+
+    for d in dicts:
+        if not d:
+            continue
+        result.update(d)
+
+    return result
+
+
 def run_once():
     now_str = datetime.now().strftime("%H:%M:%S")
+
     for query, pred_metric in PREDICT_TARGETS:
         ts, ys = fetch_history(query)
+
         if len(ys) < MIN_POINTS:
             logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
             continue
 
-        ts_future, y_pred = predict_next(ts, ys, horizon=HORIZON_SECONDS)
-        if not ts_future:
+        ts_future, y_pred = predict_next(
+            ts,
+            ys,
+            horizon=HORIZON_SECONDS,
+            start_from_now=True,
+        )
+
+        if not ts_future or not y_pred:
+            logger.warning("[%s] %s 预测结果为空，跳过", now_str, query)
+            continue
+
+        base_labels = _parse_labels(query)
+
+        # 先删除旧预测，再写入新预测。
+        # 删除条件只带 base_labels，兼容老版本无 forecast/source 标签的脏数据。
+        if CLEAR_OLD_PREDICTIONS:
+            clear_ok = delete_old_predictions(pred_metric, base_labels)
+
+            if not clear_ok and SKIP_WRITE_IF_CLEAR_FAILED:
+                logger.error(
+                    "[%s] %s 删除旧预测失败，为避免继续制造重叠数据，本轮跳过写入",
+                    now_str,
+                    pred_metric,
+                )
+                continue
+
+        write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
+
+        ok = write_predictions(
+            ts_future=ts_future,
+            y_pred=y_pred,
+            metric_name=pred_metric,
+            labels=write_labels,
+        )
+
+        if not ok:
             continue
 
-        extra_labels = _parse_labels(query)
-        write_predictions(ts_future, y_pred, pred_metric, extra_labels)
+        future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S")
+        future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
 
-        future_time = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
         logger.info(
-            "[%s] %-40s → %-35s 写入 %d 点，预测至 %s",
-            now_str, query, pred_metric, len(y_pred), future_time,
+            "[%s] %-40s → %-35s 写入 %d 点，预测区间 %s ~ %s",
+            now_str,
+            query,
+            pred_metric,
+            len(y_pred),
+            future_start,
+            future_end,
         )
 
 
 def main():
     logger.info(
-        "预测服务启动  VM=%s  预测窗口=%ds  轮询间隔=%ds",
-        VM_URL, HORIZON_SECONDS, POLL_INTERVAL,
+        "预测服务启动 VM=%s 历史窗口=%dmin 预测窗口=%ds 轮询间隔=%ds 清理旧预测=%s",
+        VM_URL,
+        HISTORY_MINUTES,
+        HORIZON_SECONDS,
+        POLL_INTERVAL,
+        CLEAR_OLD_PREDICTIONS,
     )
+
     while True:
         run_once()
         time.sleep(POLL_INTERVAL)
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From 88aec295671ca112fd422a28acd4d76d43a82f20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Wed, 20 May 2026 21:21:11 +0800
Subject: [PATCH 20/31] fix

---
 ai/predict_v2.py | 263 +++++++++++++++++++++++------------------------
 1 file changed, 128 insertions(+), 135 deletions(-)

diff --git a/ai/predict_v2.py b/ai/predict_v2.py
index bc425c8..f631e12 100755
--- a/ai/predict_v2.py
+++ b/ai/predict_v2.py
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 """
-ProtoForge 预测服务 v3
+ProtoForge 预测服务 v4
 
 修复点：
-1. 解决 HORIZON_SECONDS > POLL_INTERVAL 时，多轮预测窗口重叠导致 Grafana 出现毛刺/竖线问题。
-2. 每轮写入新预测前，删除同一个预测 metric 的旧预测序列，只保留最新一轮预测。
-3. 预测时间戳按整秒写入，避免毫秒时间戳和 Grafana step 不对齐。
-4. 拟合使用真实 timestamp 相对时间，不再假设历史数据严格 1 秒等间隔。
-5. 对历史数据做排序、去重、NaN/Inf 清洗。
+1. 不再使用 VictoriaMetrics delete_series，避免预测历史被整条删除。
+2. 不再每 30 秒写未来 120 秒，避免多轮预测窗口重叠导致 Grafana 出现竖线/毛刺。
+3. 每轮只写未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒的数据。
+4. 使用 forecast="rolling_v2" 新标签，避免和上一版 forecast="latest" 的旧预测数据混在一起。
+5. 使用真实 timestamp 做拟合，不假设采样严格等间隔。
+6. 拟合失败时不再简单写平直线，而是尽量重复最近一个周期的波形。
 """
 
 import logging
@@ -44,29 +45,35 @@
 ]
 
 HISTORY_MINUTES = 30
+
+# 理论预测窗口
 HORIZON_SECONDS = 120
+
+# 轮询间隔
 POLL_INTERVAL = 30
+
+# 实际写入窗口。
+# 关键点：实际写入窗口不要大于轮询间隔，否则不同批次预测会重叠。
+WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
+
 MIN_POINTS = 120
 QUERY_STEP = "1s"
 
-# 关键修复：每轮写入前删除旧预测，避免 120s 预测窗口和 30s 轮询周期重叠
-CLEAR_OLD_PREDICTIONS = True
-
-# 如果删除旧预测失败，是否跳过本轮写入。
-# 建议 True，避免继续叠加脏数据。
-SKIP_WRITE_IF_CLEAR_FAILED = True
+# 不要再清理旧预测，否则历史预测会被整条删除。
+CLEAR_OLD_PREDICTIONS = False
 
-# 给新预测数据加一个稳定标签，方便 Grafana 查询过滤。
-# Grafana 可以查询：feed_rate_predicted{device_id="fanuc-cnc",forecast="latest"}
+# 使用新标签，避免和上一版 forecast="latest" 数据混在一起。
 EXTRA_PREDICT_LABELS = {
-    "forecast": "latest",
+    "forecast": "rolling_v2",
     "source": "protoforge",
 }
 
-# 正弦周期限制
 MIN_PERIOD_SECONDS = 5.0
 MAX_PERIOD_SECONDS = 3600.0
 
+# 进程内记录每条预测序列上次写到哪里，避免本进程运行期间重复写同一时间段
+LAST_WRITTEN_UNTIL: Dict[str, int] = {}
+
 # ─────────────────────────────────────────────────────────────────────────────
 
 
@@ -134,10 +141,10 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa
 def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]:
     """
     清洗历史数据：
-    1. 转换为整秒时间戳
+    1. 时间戳转为整秒
     2. 排序
     3. 同一秒多个值时保留最后一个
-    4. 插值补齐中间缺失秒
+    4. 插值补齐缺失秒
     """
     if not ts or not ys or len(ts) != len(ys):
         return np.array([]), np.array([])
@@ -173,7 +180,6 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.
     if end_sec <= start_sec:
         return ts_clean, ys_clean
 
-    # 统一为 1 秒网格，减少 query_range 缺点、抖动、缺失点对 FFT 的影响
     ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float)
     ys_grid = np.interp(ts_grid, ts_clean, ys_clean)
 
@@ -187,7 +193,7 @@ def _sine_model(t: np.ndarray, A: float, T: float, phi: float, C: float) -> np.n
 def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
     """
     使用 FFT 估算主周期。
-    ys_arr 默认是 1 秒间隔。
+    ys_arr 默认已经是 1 秒间隔。
     """
     n = len(ys_arr)
 
@@ -205,8 +211,8 @@ def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
     if len(freqs) <= 1:
         return 60.0
 
-    # 跳过直流分量 index 0
     power = np.abs(fft_vals[1:])
+
     if len(power) == 0 or np.max(power) <= 0:
         return 60.0
 
@@ -222,59 +228,84 @@ def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
     return period
 
 
+def repeat_last_period(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    ts_future_arr: np.ndarray,
+    period_seconds: float,
+) -> np.ndarray:
+    """
+    拟合失败时的降级策略：
+    不直接写平直线，而是把未来时间映射回最近一个周期的历史波形。
+    """
+    if len(ts_grid) < 2:
+        return np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float)
+
+    period = max(int(round(period_seconds)), 1)
+
+    y_pred = []
+
+    hist_start = float(ts_grid[0])
+    hist_end = float(ts_grid[-1])
+
+    for future_ts in ts_future_arr:
+        mapped_ts = float(future_ts)
+
+        while mapped_ts > hist_end:
+            mapped_ts -= period
+
+        while mapped_ts < hist_start:
+            mapped_ts += period
+
+        val = float(np.interp(mapped_ts, ts_grid, ys_grid))
+        y_pred.append(val)
+
+    return np.array(y_pred, dtype=float)
+
+
 def predict_next(
     ts: List[float],
     ys: List[float],
-    horizon: int = HORIZON_SECONDS,
-    start_from_now: bool = True,
+    horizon: int,
+    base_ts: int,
 ) -> Tuple[List[float], List[float]]:
     """
     用 FFT 检测主频，拟合正弦波，外推未来 horizon 秒。
-    返回：
-        future_timestamps: 未来整秒时间戳
-        predicted_values: 预测值
+
+    base_ts:
+        从 base_ts + 1 开始写预测。
     """
     ts_grid, ys_grid = normalize_history(ts, ys)
 
     if len(ys_grid) < MIN_POINTS:
         return [], []
 
-    n = len(ys_grid)
-
     y_min = float(np.min(ys_grid))
     y_max = float(np.max(ys_grid))
     y_mean = float(np.mean(ys_grid))
     y_range = y_max - y_min
 
-    # 数据几乎不波动时，直接使用最后一个值保持
-    if y_range <= 1e-9:
-        base_ts = int(time.time()) if start_from_now else int(ts_grid[-1])
-        base_ts = max(base_ts, int(ts_grid[-1]))
+    base_ts = max(int(base_ts), int(ts_grid[-1]))
+
+    ts_future_arr = np.arange(
+        base_ts + 1,
+        base_ts + 1 + horizon,
+        1,
+        dtype=float,
+    )
 
-        ts_future = [base_ts + i + 1 for i in range(horizon)]
-        y_pred = [float(ys_grid[-1])] * horizon
-        return ts_future, y_pred
+    if y_range <= 1e-9:
+        y_pred_arr = np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float)
+        return ts_future_arr.tolist(), y_pred_arr.tolist()
 
     period = estimate_period_by_fft(ys_grid)
 
-    # 用真实时间戳做相对时间，而不是 np.arange(n)
     t_fit = ts_grid - ts_grid[0]
+    t_future = ts_future_arr - ts_grid[0]
 
     amplitude = y_range / 2.0
     offset = y_mean
 
-    # 预测起点统一对齐到整秒
-    if start_from_now:
-        base_ts = int(time.time())
-    else:
-        base_ts = int(ts_grid[-1])
-
-    # 避免因为 VM 查询延迟导致预测点落在最后一个真实点之前
-    base_ts = max(base_ts, int(ts_grid[-1]))
-
-    ts_future_arr = np.arange(base_ts + 1, base_ts + 1 + horizon, 1, dtype=float)
-    t_future = ts_future_arr - ts_grid[0]
-
     try:
         popt, _ = curve_fit(
             _sine_model,
@@ -290,7 +321,6 @@ def predict_next(
 
         y_pred_arr = _sine_model(t_future, *popt)
 
-        # 裁剪到合理范围，避免拟合异常时飞出去
         margin = y_range * 0.2
         lower = y_min - margin
         upper = y_max + margin
@@ -309,16 +339,21 @@ def predict_next(
         return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist()
 
     except Exception as e:
-        logger.warning("正弦拟合失败，降级为最近值平滑外推: %s", e)
+        logger.warning("正弦拟合失败，降级为最近周期波形复制: %s", e)
 
-        # 降级策略：用最近 10 个点的均值保持，避免线性外推越走越偏
-        tail = min(10, n)
-        last_value = float(np.mean(ys_grid[-tail:]))
+        y_pred_arr = repeat_last_period(
+            ts_grid=ts_grid,
+            ys_grid=ys_grid,
+            ts_future_arr=ts_future_arr,
+            period_seconds=period,
+        )
 
-        ts_future = ts_future_arr.tolist()
-        y_pred = [last_value] * horizon
+        margin = y_range * 0.2
+        lower = y_min - margin
+        upper = y_max + margin
+        y_pred_arr = np.clip(y_pred_arr, lower, upper)
 
-        return ts_future, y_pred
+        return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist()
 
 
 def prom_escape_label_value(value: str) -> str:
@@ -333,83 +368,34 @@ def prom_escape_label_value(value: str) -> str:
     )
 
 
-def build_selector(metric_name: str, labels: Dict[str, str]) -> str:
-    """
-    构造 PromQL selector，用于 delete_series。
-
-    示例：
-        feed_rate_predicted{device_id="fanuc-cnc"}
-    """
+def labels_to_str(labels: Dict[str, str]) -> str:
     if not labels:
-        return metric_name
+        return ""
 
     parts = []
+
     for k in sorted(labels.keys()):
         v = prom_escape_label_value(labels[k])
         parts.append(f'{k}="{v}"')
 
-    return f'{metric_name}' + "{" + ",".join(parts) + "}"
-
-
-def delete_old_predictions(metric_name: str, base_labels: Dict[str, str]) -> bool:
-    """
-    删除旧预测序列，避免多轮预测窗口重叠。
-
-    注意：
-    这里故意只用 base_labels，比如 device_id。
-    不带 forecast/source 标签，是为了兼容旧版本脚本写入的无 forecast 标签数据。
-    """
-    selector = build_selector(metric_name, base_labels)
-
-    try:
-        resp = requests.post(
-            f"{VM_URL}/api/v1/admin/tsdb/delete_series",
-            params=[("match[]", selector)],
-            timeout=10,
-        )
-
-        if resp.status_code not in (200, 204):
-            logger.error(
-                "删除旧预测数据失败 metric=%s selector=%s status=%s body=%s",
-                metric_name,
-                selector,
-                resp.status_code,
-                resp.text[:500],
-            )
-            return False
-
-        logger.debug("已删除旧预测数据 selector=%s", selector)
-        return True
-
-    except requests.RequestException as e:
-        logger.error("删除旧预测数据异常 metric=%s selector=%s: %s", metric_name, selector, e)
-        return False
+    return "{" + ",".join(parts) + "}"
 
 
 def write_predictions(
     ts_future: List[float],
     y_pred: List[float],
     metric_name: str,
-    labels: Dict[str, str] = None,
+    labels: Dict[str, str],
 ) -> bool:
     """
     将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。
     时间戳为毫秒级 Unix timestamp。
     """
-    if labels is None:
-        labels = {}
-
     if not ts_future or not y_pred or len(ts_future) != len(y_pred):
         logger.warning("预测数据为空或长度不一致 metric=%s", metric_name)
         return False
 
-    label_str = ""
-    if labels:
-        parts = []
-        for k in sorted(labels.keys()):
-            v = prom_escape_label_value(labels[k])
-            parts.append(f'{k}="{v}"')
-        label_str = "{" + ",".join(parts) + "}"
+    label_str = labels_to_str(labels)
 
     lines = []
 
@@ -449,7 +435,9 @@ def write_predictions(
         return False
 
 
-_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*')
+_LABEL_PATTERN = re.compile(
+    r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*'
+)
 
 
 def _parse_labels(query: str) -> Dict[str, str]:
@@ -489,6 +477,13 @@ def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]:
     return result
 
 
+def series_key(metric_name: str, labels: Dict[str, str]) -> str:
+    """
+    构造进程内唯一 key，用于记录上次写到哪个时间点。
+    """
+    return metric_name + labels_to_str(labels)
+
+
 def run_once():
     now_str = datetime.now().strftime("%H:%M:%S")
 
@@ -499,34 +494,28 @@ def run_once():
             logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
             continue
 
+        base_labels = _parse_labels(query)
+        write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
+
+        key = series_key(pred_metric, write_labels)
+
+        now_sec = int(time.time())
+        last_until = LAST_WRITTEN_UNTIL.get(key, 0)
+
+        # 防止同一进程内重复写入已经预测过的时间段
+        base_ts = max(now_sec, last_until)
+
         ts_future, y_pred = predict_next(
-            ts,
-            ys,
-            horizon=HORIZON_SECONDS,
-            start_from_now=True,
+            ts=ts,
+            ys=ys,
+            horizon=WRITE_HORIZON_SECONDS,
+            base_ts=base_ts,
         )
 
         if not ts_future or not y_pred:
             logger.warning("[%s] %s 预测结果为空，跳过", now_str, query)
             continue
 
-        base_labels = _parse_labels(query)
-
-        # 先删除旧预测，再写入新预测。
-        # 删除条件只带 base_labels，兼容老版本无 forecast/source 标签的脏数据。
-        if CLEAR_OLD_PREDICTIONS:
-            clear_ok = delete_old_predictions(pred_metric, base_labels)
-
-            if not clear_ok and SKIP_WRITE_IF_CLEAR_FAILED:
-                logger.error(
-                    "[%s] %s 删除旧预测失败，为避免继续制造重叠数据，本轮跳过写入",
-                    now_str,
-                    pred_metric,
-                )
-                continue
-
-        write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
-
         ok = write_predictions(
             ts_future=ts_future,
             y_pred=y_pred,
@@ -537,26 +526,30 @@ def run_once():
         if not ok:
             continue
 
+        LAST_WRITTEN_UNTIL[key] = int(max(ts_future))
+
         future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S")
         future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
 
         logger.info(
-            "[%s] %-40s → %-35s 写入 %d 点，预测区间 %s ~ %s",
+            "[%s] %-40s → %-35s 写入 %d 点，预测区间 %s ~ %s，标签=%s",
             now_str,
             query,
             pred_metric,
             len(y_pred),
             future_start,
             future_end,
+            labels_to_str(write_labels),
         )
 
 
 def main():
     logger.info(
-        "预测服务启动 VM=%s 历史窗口=%dmin 预测窗口=%ds 轮询间隔=%ds 清理旧预测=%s",
+        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds 清理旧预测=%s",
         VM_URL,
         HISTORY_MINUTES,
         HORIZON_SECONDS,
+        WRITE_HORIZON_SECONDS,
         POLL_INTERVAL,
         CLEAR_OLD_PREDICTIONS,
     )

From 72d5c092018c5caf59f1a9f6ae556e6eff24ecca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Wed, 20 May 2026 21:35:15 +0800
Subject: [PATCH 21/31] fix

---
 ai/predict_v2.py | 368 ++++++++++++++++++++++++-----------------------
 1 file changed, 188 insertions(+), 180 deletions(-)

diff --git a/ai/predict_v2.py b/ai/predict_v2.py
index f631e12..933a34f 100755
--- a/ai/predict_v2.py
+++ b/ai/predict_v2.py
@@ -1,14 +1,14 @@
 # -*- coding: utf-8 -*-
 """
-ProtoForge 预测服务 v4
+ProtoForge 预测服务 v5
 
 修复点：
-1. 不再使用 VictoriaMetrics delete_series，避免预测历史被整条删除。
-2. 不再每 30 秒写未来 120 秒，避免多轮预测窗口重叠导致 Grafana 出现竖线/毛刺。
-3. 每轮只写未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒的数据。
-4. 使用 forecast="rolling_v2" 新标签，避免和上一版 forecast="latest" 的旧预测数据混在一起。
-5. 使用真实 timestamp 做拟合，不假设采样严格等间隔。
-6. 拟合失败时不再简单写平直线，而是尽量重复最近一个周期的波形。
+1. 不再使用“单正弦拟合”作为主预测算法。
+2. 主算法改为：周期模板预测（同相位历史值加权平均）。
+3. 周期估计使用 FFT 粗估 + 自相关细化，比单纯 FFT 更稳。
+4. 若可用完整周期不足，则降级为多谐波回归（而不是单正弦）。
+5. 每轮只写入未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒，避免预测窗口重叠。
+6. 不删除旧预测历史，避免历史预测消失。
 """
 
 import logging
@@ -20,17 +20,13 @@
 
 import numpy as np
 import requests
-from scipy.optimize import curve_fit
-
 
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(message)s",
 )
-
 logger = logging.getLogger(__name__)
 
-
 # ── 配置 ──────────────────────────────────────────────────────────────────────
 
 VM_URL = "http://localhost:8428"
@@ -45,45 +41,36 @@
 ]
 
 HISTORY_MINUTES = 30
-
-# 理论预测窗口
 HORIZON_SECONDS = 120
-
-# 轮询间隔
 POLL_INTERVAL = 30
-
-# 实际写入窗口。
-# 关键点：实际写入窗口不要大于轮询间隔，否则不同批次预测会重叠。
 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
-
 MIN_POINTS = 120
 QUERY_STEP = "1s"
 
-# 不要再清理旧预测，否则历史预测会被整条删除。
-CLEAR_OLD_PREDICTIONS = False
+# 至少要有多少个完整周期，才使用“周期模板预测”
+MIN_FULL_CYCLES_FOR_TEMPLATE = 3
+MAX_CYCLES_FOR_TEMPLATE = 6
+
+# 周期范围
+MIN_PERIOD_SECONDS = 5
+MAX_PERIOD_SECONDS = 3600
+
+# 多谐波回归最高阶数（降级模式）
+MAX_HARMONICS = 4
 
-# 使用新标签，避免和上一版 forecast="latest" 数据混在一起。
 EXTRA_PREDICT_LABELS = {
-    "forecast": "rolling_v2",
+    "forecast": "seasonal_v1",
     "source": "protoforge",
 }
 
-MIN_PERIOD_SECONDS = 5.0
-MAX_PERIOD_SECONDS = 3600.0
-
-# 进程内记录每条预测序列上次写到哪里，避免本进程运行期间重复写同一时间段
+# 进程内记录每条预测序列上次写到哪里，避免本进程运行时重复写
 LAST_WRITTEN_UNTIL: Dict[str, int] = {}
 
 # ─────────────────────────────────────────────────────────────────────────────
 
 
 def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]:
-    """
-    从 VictoriaMetrics 拉取历史时序数据。
-    返回：
-        timestamps: Unix 秒级时间戳
-        values: float 数值
-    """
+    """从 VictoriaMetrics 拉取历史时序数据。"""
     now = datetime.now()
     start = now - timedelta(minutes=minutes)
 
@@ -118,20 +105,16 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa
 
     ts = []
     ys = []
-
     for item in values:
         if len(item) < 2:
             continue
-
         try:
             t = float(item[0])
             y = float(item[1])
         except Exception:
             continue
-
         if not math.isfinite(t) or not math.isfinite(y):
             continue
-
         ts.append(t)
         ys.append(y)
 
@@ -141,33 +124,29 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa
 def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]:
     """
     清洗历史数据：
-    1. 时间戳转为整秒
+    1. 时间戳整秒化
     2. 排序
-    3. 同一秒多个值时保留最后一个
-    4. 插值补齐缺失秒
+    3. 同一秒多个点保留最后一个
+    4. 按 1 秒插值补齐
     """
     if not ts or not ys or len(ts) != len(ys):
         return np.array([]), np.array([])
 
     data = {}
-
     for t, y in zip(ts, ys):
         try:
             sec = int(round(float(t)))
             val = float(y)
         except Exception:
             continue
-
         if not math.isfinite(sec) or not math.isfinite(val):
             continue
-
         data[sec] = val
 
     if not data:
         return np.array([]), np.array([])
 
     sorted_items = sorted(data.items(), key=lambda x: x[0])
-
     ts_clean = np.array([x[0] for x in sorted_items], dtype=float)
     ys_clean = np.array([x[1] for x in sorted_items], dtype=float)
 
@@ -186,22 +165,13 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.
     return ts_grid, ys_grid
 
 
-def _sine_model(t: np.ndarray, A: float, T: float, phi: float, C: float) -> np.ndarray:
-    return A * np.sin(2.0 * np.pi / T * t + phi) + C
-
-
 def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
-    """
-    使用 FFT 估算主周期。
-    ys_arr 默认已经是 1 秒间隔。
-    """
+    """FFT 粗估周期。"""
     n = len(ys_arr)
-
-    if n < 4:
+    if n < 8:
         return 60.0
 
     centered = ys_arr - np.mean(ys_arr)
-
     if np.allclose(centered, 0):
         return 60.0
 
@@ -212,55 +182,139 @@ def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
         return 60.0
 
     power = np.abs(fft_vals[1:])
-
     if len(power) == 0 or np.max(power) <= 0:
         return 60.0
 
     dominant_idx = int(np.argmax(power)) + 1
     dominant_freq = float(freqs[dominant_idx])
-
     if dominant_freq <= 0:
         return 60.0
 
     period = 1.0 / dominant_freq
-    period = float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+    return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float:
+    """
+    用自相关在 init_period 附近细化周期估计。
+    """
+    n = len(ys_arr)
+    if n < 20:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    centered = ys_arr - np.mean(ys_arr)
+    if np.allclose(centered, 0):
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
 
-    return period
+    corr = np.correlate(centered, centered, mode="full")[n - 1:]
 
+    p0 = int(round(init_period))
+    left = max(MIN_PERIOD_SECONDS, int(max(2, p0 * 0.7)))
+    right = min(n // 2, int(max(left + 1, p0 * 1.3)))
 
-def repeat_last_period(
-    ts_grid: np.ndarray,
-    ys_grid: np.ndarray,
-    ts_future_arr: np.ndarray,
-    period_seconds: float,
-) -> np.ndarray:
+    if right <= left:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    search = corr[left:right + 1]
+    if len(search) == 0:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    best_lag = left + int(np.argmax(search))
+    return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def estimate_period(ys_arr: np.ndarray) -> float:
+    """FFT + 自相关 的组合周期估计。"""
+    p_fft = estimate_period_by_fft(ys_arr)
+    p_refined = refine_period_by_autocorr(ys_arr, p_fft)
+    return p_refined
+
+
+def seasonal_template_predict(
+    ys_arr: np.ndarray,
+    horizon: int,
+    period: int,
+    gap: int = 0,
+    max_cycles: int = MAX_CYCLES_FOR_TEMPLATE,
+) -> List[float]:
     """
-    拟合失败时的降级策略：
-    不直接写平直线，而是把未来时间映射回最近一个周期的历史波形。
+    同相位历史值加权平均预测。
+    对未来第 k 个点，取过去多个周期同相位点做加权平均：
+        y[n-1+gap+k] ≈ avg(y[n-1+gap+k-p], y[n-1+gap+k-2p], ...)
     """
-    if len(ts_grid) < 2:
-        return np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float)
+    n = len(ys_arr)
+    preds = []
 
-    period = max(int(round(period_seconds)), 1)
+    for k in range(1, horizon + 1):
+        target_idx = (n - 1) + gap + k
 
-    y_pred = []
+        values = []
+        weights = []
 
-    hist_start = float(ts_grid[0])
-    hist_end = float(ts_grid[-1])
+        # m=1 表示最近一个周期；m 越大越久远
+        for m in range(1, max_cycles + 1):
+            hist_idx = target_idx - m * period
+            if 0 <= hist_idx < n:
+                # 越近权重越大
+                w = 1.0 / m
+                values.append(float(ys_arr[hist_idx]))
+                weights.append(w)
 
-    for future_ts in ts_future_arr:
-        mapped_ts = float(future_ts)
+        if not values:
+            # 万一拿不到，退化为最后一个值
+            preds.append(float(ys_arr[-1]))
+        else:
+            preds.append(float(np.average(values, weights=weights)))
 
-        while mapped_ts > hist_end:
-            mapped_ts -= period
+    return preds
 
-        while mapped_ts < hist_start:
-            mapped_ts += period
 
-        val = float(np.interp(mapped_ts, ts_grid, ys_grid))
-        y_pred.append(val)
+def harmonic_regression_predict(
+    ys_arr: np.ndarray,
+    horizon: int,
+    period: int,
+    gap: int = 0,
+    max_harmonics: int = MAX_HARMONICS,
+) -> List[float]:
+    """
+    多谐波回归（降级模式）：
+    y = c + Σ [a_k sin(2πkt/P) + b_k cos(2πkt/P)]
+    相比单正弦，更能表达非标准正弦波形。
+    """
+    n = len(ys_arr)
+    if n < 10 or period <= 1:
+        return [float(ys_arr[-1])] * horizon
+
+    # 周期太短时，谐波数不能太大
+    K = min(max_harmonics, max(1, period // 4))
+
+    t = np.arange(n, dtype=float)
+    cols = [np.ones(n, dtype=float)]
+
+    for k in range(1, K + 1):
+        angle = 2.0 * np.pi * k * t / period
+        cols.append(np.sin(angle))
+        cols.append(np.cos(angle))
+
+    X = np.column_stack(cols)
+
+    try:
+        coef, _, _, _ = np.linalg.lstsq(X, ys_arr, rcond=None)
+    except Exception:
+        return [float(ys_arr[-1])] * horizon
+
+    t_future = np.arange(n + gap, n + gap + horizon, dtype=float)
+    cols_future = [np.ones(horizon, dtype=float)]
 
-    return np.array(y_pred, dtype=float)
+    for k in range(1, K + 1):
+        angle = 2.0 * np.pi * k * t_future / period
+        cols_future.append(np.sin(angle))
+        cols_future.append(np.cos(angle))
+
+    X_future = np.column_stack(cols_future)
+    y_pred = X_future @ coef
+
+    return y_pred.astype(float).tolist()
 
 
 def predict_next(
@@ -270,96 +324,74 @@ def predict_next(
     base_ts: int,
 ) -> Tuple[List[float], List[float]]:
     """
-    用 FFT 检测主频，拟合正弦波，外推未来 horizon 秒。
-
-    base_ts:
-        从 base_ts + 1 开始写预测。
+    主预测函数：
+    1. 周期估计
+    2. 优先使用周期模板预测
+    3. 周期不够时降级为多谐波回归
     """
     ts_grid, ys_grid = normalize_history(ts, ys)
-
     if len(ys_grid) < MIN_POINTS:
         return [], []
 
     y_min = float(np.min(ys_grid))
     y_max = float(np.max(ys_grid))
-    y_mean = float(np.mean(ys_grid))
     y_range = y_max - y_min
 
-    base_ts = max(int(base_ts), int(ts_grid[-1]))
-
-    ts_future_arr = np.arange(
-        base_ts + 1,
-        base_ts + 1 + horizon,
-        1,
-        dtype=float,
-    )
-
     if y_range <= 1e-9:
-        y_pred_arr = np.full_like(ts_future_arr, float(ys_grid[-1]), dtype=float)
-        return ts_future_arr.tolist(), y_pred_arr.tolist()
+        base_ts = max(int(base_ts), int(ts_grid[-1]))
+        ts_future = [base_ts + i + 1 for i in range(horizon)]
+        y_pred = [float(ys_grid[-1])] * horizon
+        return ts_future, y_pred
 
-    period = estimate_period_by_fft(ys_grid)
+    period_est = estimate_period(ys_grid)
+    period = int(round(period_est))
+    period = max(MIN_PERIOD_SECONDS, min(MAX_PERIOD_SECONDS, period))
 
-    t_fit = ts_grid - ts_grid[0]
-    t_future = ts_future_arr - ts_grid[0]
+    last_real_ts = int(ts_grid[-1])
+    base_ts = max(int(base_ts), last_real_ts)
 
-    amplitude = y_range / 2.0
-    offset = y_mean
+    # 如果当前时间已经超过最后一个真实点，gap 表示中间“空过去”的秒数
+    gap = max(0, base_ts - last_real_ts)
 
-    try:
-        popt, _ = curve_fit(
-            _sine_model,
-            t_fit,
-            ys_grid,
-            p0=[amplitude, period, 0.0, offset],
-            bounds=(
-                [0.0, MIN_PERIOD_SECONDS, -2.0 * np.pi, y_min - y_range],
-                [np.inf, MAX_PERIOD_SECONDS, 2.0 * np.pi, y_max + y_range],
-            ),
-            maxfev=12000,
-        )
+    ts_future = [base_ts + i + 1 for i in range(horizon)]
 
-        y_pred_arr = _sine_model(t_future, *popt)
+    full_cycles = len(ys_grid) // period if period > 0 else 0
 
-        margin = y_range * 0.2
-        lower = y_min - margin
-        upper = y_max + margin
-        y_pred_arr = np.clip(y_pred_arr, lower, upper)
-
-        if not np.all(np.isfinite(y_pred_arr)):
-            raise ValueError("预测结果包含 NaN/Inf")
-
-        logger.debug(
-            "正弦拟合成功 period=%.2fs amplitude=%.4f offset=%.4f",
-            popt[1],
-            popt[0],
-            popt[3],
+    if full_cycles >= MIN_FULL_CYCLES_FOR_TEMPLATE:
+        y_pred = seasonal_template_predict(
+            ys_arr=ys_grid,
+            horizon=horizon,
+            period=period,
+            gap=gap,
+            max_cycles=min(MAX_CYCLES_FOR_TEMPLATE, full_cycles),
         )
-
-        return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist()
-
-    except Exception as e:
-        logger.warning("正弦拟合失败，降级为最近周期波形复制: %s", e)
-
-        y_pred_arr = repeat_last_period(
-            ts_grid=ts_grid,
-            ys_grid=ys_grid,
-            ts_future_arr=ts_future_arr,
-            period_seconds=period,
+        model_name = "seasonal_template"
+    else:
+        y_pred = harmonic_regression_predict(
+            ys_arr=ys_grid,
+            horizon=horizon,
+            period=period,
+            gap=gap,
+            max_harmonics=MAX_HARMONICS,
         )
+        model_name = "harmonic_regression"
+
+    # 合理裁剪，避免偶然外推过大
+    margin = y_range * 0.15
+    lower = y_min - margin
+    upper = y_max + margin
+    y_pred = np.clip(np.array(y_pred, dtype=float), lower, upper).astype(float).tolist()
 
-        margin = y_range * 0.2
-        lower = y_min - margin
-        upper = y_max + margin
-        y_pred_arr = np.clip(y_pred_arr, lower, upper)
+    logger.debug(
+        "predict_next model=%s period=%ss full_cycles=%s gap=%s",
+        model_name, period, full_cycles, gap
+    )
 
-        return ts_future_arr.tolist(), y_pred_arr.astype(float).tolist()
+    return ts_future, y_pred
 
 
 def prom_escape_label_value(value: str) -> str:
-    """
-    Prometheus exposition label value 转义。
-    """
+    """Prometheus label value 转义。"""
     return (
         str(value)
         .replace("\\", "\\\\")
@@ -371,13 +403,10 @@ def prom_escape_label_value(value: str) -> str:
 def labels_to_str(labels: Dict[str, str]) -> str:
     if not labels:
         return ""
-
     parts = []
-
     for k in sorted(labels.keys()):
         v = prom_escape_label_value(labels[k])
         parts.append(f'{k}="{v}"')
-
     return "{" + ",".join(parts) + "}"
 
 
@@ -387,16 +416,12 @@ def write_predictions(
     metric_name: str,
     labels: Dict[str, str],
 ) -> bool:
-    """
-    将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。
-    时间戳为毫秒级 Unix timestamp。
-    """
+    """将预测值以 Prometheus exposition 格式写入 VictoriaMetrics。"""
     if not ts_future or not y_pred or len(ts_future) != len(y_pred):
         logger.warning("预测数据为空或长度不一致 metric=%s", metric_name)
         return False
 
     label_str = labels_to_str(labels)
-
     lines = []
 
     for t, y in zip(ts_future, y_pred):
@@ -422,14 +447,11 @@ def write_predictions(
         resp = requests.post(
             f"{VM_URL}/api/v1/import/prometheus",
             data=payload.encode("utf-8"),
-            headers={
-                "Content-Type": "text/plain; version=0.0.4; charset=utf-8",
-            },
+            headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
             timeout=10,
         )
         resp.raise_for_status()
         return True
-
     except requests.RequestException as e:
         logger.error("写入预测数据失败 metric=%s: %s", metric_name, e)
         return False
@@ -441,12 +463,7 @@ def write_predictions(
 
 
 def _parse_labels(query: str) -> Dict[str, str]:
-    """
-    从查询表达式中解析标签。
-
-    示例：
-        feed_rate{device_id="fanuc-cnc"} -> {"device_id": "fanuc-cnc"}
-    """
+    """从查询表达式中解析标签。"""
     labels = {}
 
     if "{" not in query or "}" not in query:
@@ -468,19 +485,13 @@ def _parse_labels(query: str) -> Dict[str, str]:
 
 def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]:
     result = {}
-
     for d in dicts:
-        if not d:
-            continue
-        result.update(d)
-
+        if d:
+            result.update(d)
     return result
 
 
 def series_key(metric_name: str, labels: Dict[str, str]) -> str:
-    """
-    构造进程内唯一 key，用于记录上次写到哪个时间点。
-    """
     return metric_name + labels_to_str(labels)
 
 
@@ -489,7 +500,6 @@ def run_once():
 
     for query, pred_metric in PREDICT_TARGETS:
         ts, ys = fetch_history(query)
-
         if len(ys) < MIN_POINTS:
             logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
             continue
@@ -502,7 +512,7 @@ def run_once():
         now_sec = int(time.time())
         last_until = LAST_WRITTEN_UNTIL.get(key, 0)
 
-        # 防止同一进程内重复写入已经预测过的时间段
+        # 避免同一进程内写重叠时间段
         base_ts = max(now_sec, last_until)
 
         ts_future, y_pred = predict_next(
@@ -522,7 +532,6 @@ def run_once():
             metric_name=pred_metric,
             labels=write_labels,
         )
-
         if not ok:
             continue
 
@@ -545,13 +554,12 @@ def run_once():
 
 def main():
     logger.info(
-        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds 清理旧预测=%s",
+        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds",
         VM_URL,
         HISTORY_MINUTES,
         HORIZON_SECONDS,
         WRITE_HORIZON_SECONDS,
         POLL_INTERVAL,
-        CLEAR_OLD_PREDICTIONS,
     )
 
     while True:

From c26b9991d4cfac374d56829b47023b043d270aba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Thu, 21 May 2026 09:03:00 +0800
Subject: [PATCH 22/31] feat(ai): support single scene predict

---
 ai/predict_v3_single_scene.py | 1058 +++++++++++++++++++++++++++++++++
 1 file changed, 1058 insertions(+)
 create mode 100644 ai/predict_v3_single_scene.py

diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py
new file mode 100644
index 0000000..23af8c5
--- /dev/null
+++ b/ai/predict_v3_single_scene.py
@@ -0,0 +1,1058 @@
+# -*- coding: utf-8 -*-
+"""
+ProtoForge 预测服务 v6
+
+核心能力：
+1. 周期模板预测：适合 CNC 这类强周期、非标准正弦波形。
+2. 健康基线冻结：检测到异常后，不再用故障数据更新预测模板。
+3. 恢复冷却机制：故障恢复后，需要连续稳定多个周期，才恢复学习。
+4. 预测上下界：写入 predicted_upper / predicted_lower，方便 Grafana 展示预测带。
+5. 异常标记：写入 xxx_anomaly，1 表示异常，0 表示正常。
+6. 不删除历史预测，不使用 delete_series。
+"""
+
+"""
+场景：不考虑物料、不考虑跨程序场景算法预测
+"""
+
+import json
+import logging
+import math
+import os
+import re
+import time
+from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import requests
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ── 基础配置 ──────────────────────────────────────────────────────────────────
+
+VM_URL = "http://localhost:8428"
+
+STATE_FILE = "/tmp/protoforge_predictor_state.json"
+
+HISTORY_MINUTES = 30
+HORIZON_SECONDS = 120
+POLL_INTERVAL = 30
+
+# 实际每轮写入未来多少秒。
+# 不要大于 POLL_INTERVAL，否则多轮预测会重叠。
+WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
+
+QUERY_STEP = "1s"
+MIN_POINTS = 120
+
+MIN_PERIOD_SECONDS = 5
+MAX_PERIOD_SECONDS = 3600
+
+# 至少多少个完整周期才允许构建健康模板
+MIN_FULL_CYCLES_FOR_TEMPLATE = 3
+
+# 构建模板最多使用最近多少个周期
+MAX_CYCLES_FOR_TEMPLATE = 6
+
+# 检测异常使用最近多少秒实际数据
+DETECT_WINDOW_SECONDS = 15
+
+# 恢复后，至少连续正常多少秒才考虑恢复学习
+RECOVERY_MIN_SECONDS = 60
+
+# 健康状态下模板更新速度，越小越保守
+HEALTHY_EMA_ALPHA = 0.15
+
+# 故障恢复后第一次重新学习时的更新速度
+RECOVERY_EMA_ALPHA = 0.35
+
+# 最近窗口里有多少比例的点超过阈值，才认为异常
+OUTSIDE_RATIO_THRESHOLD = 0.60
+
+# 最近窗口里有多少比例的点回到阈值内，才认为恢复正常
+RECOVERY_INSIDE_RATIO_THRESHOLD = 0.80
+
+
+# ── 指标配置 ──────────────────────────────────────────────────────────────────
+# abs_threshold / rel_threshold 需要按指标单位调。
+# feed_rate 单位 mm/min，这里先给 400 和 25%。
+
+PREDICT_TARGETS = [
+    {
+        "query": 'feed_rate{device_id="fanuc-cnc"}',
+        "pred_metric": "feed_rate_predicted",
+        "anomaly_metric": "feed_rate_anomaly",
+        "abs_threshold": 400.0,
+        "rel_threshold": 0.25,
+    },
+    {
+        "query": 'spindle_speed{device_id="fanuc-cnc"}',
+        "pred_metric": "spindle_speed_predicted",
+        "anomaly_metric": "spindle_speed_anomaly",
+        "abs_threshold": 500.0,
+        "rel_threshold": 0.25,
+    },
+    {
+        "query": 'spindle_current{device_id="fanuc-cnc"}',
+        "pred_metric": "spindle_current_predicted",
+        "anomaly_metric": "spindle_current_anomaly",
+        "abs_threshold": 5.0,
+        "rel_threshold": 0.25,
+    },
+    {
+        "query": 'vibration_x{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_x_predicted",
+        "anomaly_metric": "vibration_x_anomaly",
+        "abs_threshold": 1.0,
+        "rel_threshold": 0.30,
+    },
+    {
+        "query": 'vibration_y{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_y_predicted",
+        "anomaly_metric": "vibration_y_anomaly",
+        "abs_threshold": 1.0,
+        "rel_threshold": 0.30,
+    },
+    {
+        "query": 'vibration_z{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_z_predicted",
+        "anomaly_metric": "vibration_z_anomaly",
+        "abs_threshold": 1.0,
+        "rel_threshold": 0.30,
+    },
+]
+
+EXTRA_PREDICT_LABELS = {
+    "forecast": "health_gated_v1",
+    "source": "protoforge",
+}
+
+BASELINE_STATUS_HEALTHY = "healthy"
+BASELINE_STATUS_ANOMALY = "anomaly"
+BASELINE_STATUS_RECOVERING = "recovering"
+BASELINE_STATUS_LEARNING = "learning"
+
+
+# ── 状态结构 ──────────────────────────────────────────────────────────────────
+
+@dataclass
+class BaselineState:
+    period: int
+    template: List[float]
+    status: str
+    clean_seconds: int
+    last_update_ts: int
+    last_seen_ts: int
+    y_min: float
+    y_max: float
+
+
+BASELINE_STATES: Dict[str, BaselineState] = {}
+LAST_WRITTEN_UNTIL: Dict[str, int] = {}
+
+
+# ── VM 读取 ───────────────────────────────────────────────────────────────────
+
+def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]:
+    now = datetime.now()
+    start = now - timedelta(minutes=minutes)
+
+    try:
+        resp = requests.get(
+            f"{VM_URL}/api/v1/query_range",
+            params={
+                "query": query,
+                "start": start.timestamp(),
+                "end": now.timestamp(),
+                "step": QUERY_STEP,
+            },
+            timeout=10,
+        )
+        resp.raise_for_status()
+    except requests.RequestException as e:
+        logger.error("拉取数据失败 query=%s: %s", query, e)
+        return [], []
+
+    try:
+        result = resp.json().get("data", {}).get("result", [])
+    except Exception as e:
+        logger.error("解析 VM 返回失败 query=%s: %s", query, e)
+        return [], []
+
+    if not result:
+        return [], []
+
+    values = result[0].get("values", [])
+    if not values:
+        return [], []
+
+    ts = []
+    ys = []
+
+    for item in values:
+        if len(item) < 2:
+            continue
+
+        try:
+            t = float(item[0])
+            y = float(item[1])
+        except Exception:
+            continue
+
+        if not math.isfinite(t) or not math.isfinite(y):
+            continue
+
+        ts.append(t)
+        ys.append(y)
+
+    return ts, ys
+
+
+def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]:
+    if not ts or not ys or len(ts) != len(ys):
+        return np.array([]), np.array([])
+
+    data = {}
+
+    for t, y in zip(ts, ys):
+        try:
+            sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(sec) or not math.isfinite(val):
+            continue
+
+        data[sec] = val
+
+    if not data:
+        return np.array([]), np.array([])
+
+    sorted_items = sorted(data.items(), key=lambda x: x[0])
+
+    ts_clean = np.array([x[0] for x in sorted_items], dtype=float)
+    ys_clean = np.array([x[1] for x in sorted_items], dtype=float)
+
+    if len(ts_clean) < 2:
+        return ts_clean, ys_clean
+
+    start_sec = int(ts_clean[0])
+    end_sec = int(ts_clean[-1])
+
+    if end_sec <= start_sec:
+        return ts_clean, ys_clean
+
+    ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float)
+    ys_grid = np.interp(ts_grid, ts_clean, ys_clean)
+
+    return ts_grid, ys_grid
+
+
+# ── 周期估计 ──────────────────────────────────────────────────────────────────
+
+def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
+    n = len(ys_arr)
+
+    if n < 8:
+        return 60.0
+
+    centered = ys_arr - np.mean(ys_arr)
+
+    if np.allclose(centered, 0):
+        return 60.0
+
+    fft_vals = np.fft.rfft(centered)
+    freqs = np.fft.rfftfreq(n, d=1.0)
+
+    if len(freqs) <= 1:
+        return 60.0
+
+    power = np.abs(fft_vals[1:])
+
+    if len(power) == 0 or np.max(power) <= 0:
+        return 60.0
+
+    dominant_idx = int(np.argmax(power)) + 1
+    dominant_freq = float(freqs[dominant_idx])
+
+    if dominant_freq <= 0:
+        return 60.0
+
+    period = 1.0 / dominant_freq
+
+    return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float:
+    n = len(ys_arr)
+
+    if n < 20:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    centered = ys_arr - np.mean(ys_arr)
+
+    if np.allclose(centered, 0):
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    corr = np.correlate(centered, centered, mode="full")[n - 1:]
+
+    p0 = int(round(init_period))
+    left = max(MIN_PERIOD_SECONDS, int(max(2, p0 * 0.7)))
+    right = min(n // 2, int(max(left + 1, p0 * 1.3)))
+
+    if right <= left:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    search = corr[left:right + 1]
+
+    if len(search) == 0:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    best_lag = left + int(np.argmax(search))
+
+    return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def estimate_period(ys_arr: np.ndarray) -> int:
+    p_fft = estimate_period_by_fft(ys_arr)
+    p_refined = refine_period_by_autocorr(ys_arr, p_fft)
+
+    period = int(round(p_refined))
+    period = max(MIN_PERIOD_SECONDS, min(MAX_PERIOD_SECONDS, period))
+
+    return int(period)
+
+
+# ── 模板构建与预测 ─────────────────────────────────────────────────────────────
+
+def fill_template_nan(template: np.ndarray) -> np.ndarray:
+    period = len(template)
+
+    if period == 0:
+        return template
+
+    idx = np.arange(period)
+    valid = np.isfinite(template)
+
+    if not np.any(valid):
+        return np.zeros(period, dtype=float)
+
+    if np.all(valid):
+        return template
+
+    x_valid = idx[valid]
+    y_valid = template[valid]
+
+    # 环形插值，处理 phase 0 附近缺口
+    x_ext = np.concatenate([x_valid - period, x_valid, x_valid + period])
+    y_ext = np.concatenate([y_valid, y_valid, y_valid])
+
+    filled = np.interp(idx, x_ext, y_ext)
+
+    return filled.astype(float)
+
+
+def build_phase_template(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    period: int,
+    max_cycles: int = MAX_CYCLES_FOR_TEMPLATE,
+    tail_seconds: Optional[int] = None,
+) -> Optional[np.ndarray]:
+    if period <= 1 or len(ys_grid) < period * MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    max_seconds = period * max_cycles
+
+    if tail_seconds is not None:
+        max_seconds = min(max_seconds, int(tail_seconds))
+
+    max_seconds = max(period * MIN_FULL_CYCLES_FOR_TEMPLATE, max_seconds)
+
+    if len(ys_grid) < max_seconds:
+        start_idx = 0
+    else:
+        start_idx = len(ys_grid) - max_seconds
+
+    ts_tail = ts_grid[start_idx:]
+    ys_tail = ys_grid[start_idx:]
+
+    if len(ys_tail) < period * MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    sums = np.zeros(period, dtype=float)
+    weights = np.zeros(period, dtype=float)
+
+    total = len(ys_tail)
+
+    for i, (t, y) in enumerate(zip(ts_tail, ys_tail)):
+        phase = int(t) % period
+
+        # 越近的数据权重越高
+        recency = (i + 1) / total
+        weight = 0.3 + 0.7 * recency
+
+        sums[phase] += float(y) * weight
+        weights[phase] += weight
+
+    template = np.full(period, np.nan, dtype=float)
+
+    valid = weights > 0
+    template[valid] = sums[valid] / weights[valid]
+
+    template = fill_template_nan(template)
+
+    return template
+
+
+def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
+    old_period = len(old_template)
+
+    if old_period == new_period:
+        return old_template.astype(float)
+
+    if old_period <= 1 or new_period <= 1:
+        return np.full(new_period, float(np.mean(old_template)), dtype=float)
+
+    old_x = np.linspace(0.0, 1.0, old_period, endpoint=False)
+    new_x = np.linspace(0.0, 1.0, new_period, endpoint=False)
+
+    old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0])
+    old_y_ext = np.concatenate([old_template, old_template, old_template])
+
+    return np.interp(new_x, old_x_ext, old_y_ext).astype(float)
+
+
+def merge_template(
+    old_template: np.ndarray,
+    new_template: np.ndarray,
+    alpha: float,
+) -> np.ndarray:
+    alpha = float(np.clip(alpha, 0.0, 1.0))
+
+    if len(old_template) != len(new_template):
+        old_template = resample_template(old_template, len(new_template))
+
+    return ((1.0 - alpha) * old_template + alpha * new_template).astype(float)
+
+
+def predict_by_state(state: BaselineState, ts_list: List[int]) -> np.ndarray:
+    template = np.array(state.template, dtype=float)
+    period = int(state.period)
+
+    if period <= 1 or len(template) != period:
+        return np.zeros(len(ts_list), dtype=float)
+
+    values = []
+
+    for ts in ts_list:
+        phase = int(ts) % period
+        values.append(float(template[phase]))
+
+    return np.array(values, dtype=float)
+
+
+def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray:
+    return np.maximum(abs_threshold, np.abs(pred) * rel_threshold)
+
+
+def calc_bounds(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> Tuple[np.ndarray, np.ndarray]:
+    threshold = calc_threshold(pred, abs_threshold, rel_threshold)
+    lower = pred - threshold
+    upper = pred + threshold
+    return lower, upper
+
+
+# ── 异常检测与状态更新 ────────────────────────────────────────────────────────
+
+def detect_anomaly(
+    state: BaselineState,
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> Tuple[bool, float, float, float]:
+    if len(ys_grid) < DETECT_WINDOW_SECONDS:
+        return False, 0.0, 0.0, 0.0
+
+    ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
+    actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float)
+
+    pred = predict_by_state(state, ts_recent)
+    threshold = calc_threshold(pred, abs_threshold, rel_threshold)
+
+    abs_err = np.abs(actual - pred)
+    outside = abs_err > threshold
+
+    outside_ratio = float(np.mean(outside))
+    mean_abs_err = float(np.mean(abs_err))
+    mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1.0)))
+
+    is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD
+
+    return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err
+
+
+def is_recovered(
+    state: BaselineState,
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> Tuple[bool, float]:
+    if len(ys_grid) < DETECT_WINDOW_SECONDS:
+        return False, 0.0
+
+    ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
+    actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float)
+
+    pred = predict_by_state(state, ts_recent)
+    threshold = calc_threshold(pred, abs_threshold, rel_threshold)
+
+    abs_err = np.abs(actual - pred)
+    inside = abs_err <= threshold
+
+    inside_ratio = float(np.mean(inside))
+
+    return inside_ratio >= RECOVERY_INSIDE_RATIO_THRESHOLD, inside_ratio
+
+
+def create_initial_state(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    now_sec: int,
+) -> Optional[BaselineState]:
+    if len(ys_grid) < MIN_POINTS:
+        return None
+
+    period = estimate_period(ys_grid)
+
+    template = build_phase_template(
+        ts_grid=ts_grid,
+        ys_grid=ys_grid,
+        period=period,
+        max_cycles=MAX_CYCLES_FOR_TEMPLATE,
+        tail_seconds=period * MAX_CYCLES_FOR_TEMPLATE,
+    )
+
+    if template is None:
+        return None
+
+    return BaselineState(
+        period=int(period),
+        template=template.astype(float).tolist(),
+        status=BASELINE_STATUS_HEALTHY,
+        clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE),
+        last_update_ts=now_sec,
+        last_seen_ts=now_sec,
+        y_min=float(np.min(ys_grid)),
+        y_max=float(np.max(ys_grid)),
+    )
+
+
+def maybe_update_state(
+    key: str,
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> Tuple[Optional[BaselineState], bool, float, float, float]:
+    now_sec = int(time.time())
+
+    state = BASELINE_STATES.get(key)
+
+    if state is None:
+        state = create_initial_state(ts_grid, ys_grid, now_sec)
+
+        if state is None:
+            return None, False, 0.0, 0.0, 0.0
+
+        BASELINE_STATES[key] = state
+        logger.info(
+            "初始化健康模板 key=%s period=%ss clean_seconds=%ss",
+            key,
+            state.period,
+            state.clean_seconds,
+        )
+        return state, False, 0.0, 0.0, 0.0
+
+    elapsed = max(1, now_sec - int(state.last_seen_ts))
+    elapsed = min(elapsed, POLL_INTERVAL * 2)
+    state.last_seen_ts = now_sec
+
+    is_anom, outside_ratio, mean_abs_err, mean_rel_err = detect_anomaly(
+        state=state,
+        ts_grid=ts_grid,
+        ys_grid=ys_grid,
+        abs_threshold=abs_threshold,
+        rel_threshold=rel_threshold,
+    )
+
+    if is_anom:
+        state.status = BASELINE_STATUS_ANOMALY
+        state.clean_seconds = 0
+
+        logger.warning(
+            "检测到异常，冻结模板 key=%s outside_ratio=%.2f mean_abs_err=%.2f mean_rel_err=%.2f",
+            key,
+            outside_ratio,
+            mean_abs_err,
+            mean_rel_err,
+        )
+
+        BASELINE_STATES[key] = state
+        return state, True, outside_ratio, mean_abs_err, mean_rel_err
+
+    recovered, inside_ratio = is_recovered(
+        state=state,
+        ts_grid=ts_grid,
+        ys_grid=ys_grid,
+        abs_threshold=abs_threshold,
+        rel_threshold=rel_threshold,
+    )
+
+    if state.status == BASELINE_STATUS_ANOMALY:
+        if recovered:
+            state.status = BASELINE_STATUS_RECOVERING
+            state.clean_seconds = elapsed
+            logger.info(
+                "异常开始恢复 key=%s inside_ratio=%.2f clean_seconds=%ss",
+                key,
+                inside_ratio,
+                state.clean_seconds,
+            )
+        else:
+            state.clean_seconds = 0
+            BASELINE_STATES[key] = state
+            return state, True, outside_ratio, mean_abs_err, mean_rel_err
+
+    elif state.status == BASELINE_STATUS_RECOVERING:
+        if recovered:
+            state.clean_seconds += elapsed
+        else:
+            state.status = BASELINE_STATUS_ANOMALY
+            state.clean_seconds = 0
+            BASELINE_STATES[key] = state
+            return state, True, outside_ratio, mean_abs_err, mean_rel_err
+
+    else:
+        state.status = BASELINE_STATUS_HEALTHY
+        state.clean_seconds += elapsed
+
+    # 故障恢复后，不要立刻学习。
+    # 必须至少连续正常：max(RECOVERY_MIN_SECONDS, 3 个周期)
+    min_clean_for_update = max(
+        RECOVERY_MIN_SECONDS,
+        int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE,
+    )
+
+    if state.clean_seconds < min_clean_for_update:
+        BASELINE_STATES[key] = state
+        return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+    # 只使用最近 clean_seconds 这段连续正常数据来更新模板，避免历史故障污染。
+    new_period = estimate_period(ys_grid)
+    tail_seconds = min(
+        int(state.clean_seconds),
+        int(new_period) * MAX_CYCLES_FOR_TEMPLATE,
+    )
+
+    new_template = build_phase_template(
+        ts_grid=ts_grid,
+        ys_grid=ys_grid,
+        period=new_period,
+        max_cycles=MAX_CYCLES_FOR_TEMPLATE,
+        tail_seconds=tail_seconds,
+    )
+
+    if new_template is None:
+        BASELINE_STATES[key] = state
+        return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+    old_template = np.array(state.template, dtype=float)
+
+    if state.status == BASELINE_STATUS_RECOVERING:
+        alpha = RECOVERY_EMA_ALPHA
+        state.status = BASELINE_STATUS_HEALTHY
+    else:
+        alpha = HEALTHY_EMA_ALPHA
+
+    merged = merge_template(
+        old_template=old_template,
+        new_template=new_template,
+        alpha=alpha,
+    )
+
+    state.period = int(new_period)
+    state.template = merged.astype(float).tolist()
+    state.last_update_ts = now_sec
+    state.y_min = float(np.min(ys_grid[-tail_seconds:]))
+    state.y_max = float(np.max(ys_grid[-tail_seconds:]))
+
+    BASELINE_STATES[key] = state
+
+    logger.info(
+        "更新健康模板 key=%s period=%ss status=%s clean_seconds=%ss alpha=%.2f",
+        key,
+        state.period,
+        state.status,
+        state.clean_seconds,
+        alpha,
+    )
+
+    return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+
+# ── Prometheus 格式写入 ───────────────────────────────────────────────────────
+
+def prom_escape_label_value(value: str) -> str:
+    return (
+        str(value)
+        .replace("\\", "\\\\")
+        .replace("\n", "\\n")
+        .replace('"', '\\"')
+    )
+
+
+def labels_to_str(labels: Dict[str, str]) -> str:
+    if not labels:
+        return ""
+
+    parts = []
+
+    for k in sorted(labels.keys()):
+        v = prom_escape_label_value(labels[k])
+        parts.append(f'{k}="{v}"')
+
+    return "{" + ",".join(parts) + "}"
+
+
+def write_series(
+    metric_name: str,
+    labels: Dict[str, str],
+    ts_list: List[int],
+    values: List[float],
+) -> bool:
+    if not ts_list or not values or len(ts_list) != len(values):
+        return False
+
+    label_str = labels_to_str(labels)
+    lines = []
+
+    for t, y in zip(ts_list, values):
+        try:
+            ts_sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(ts_sec) or not math.isfinite(val):
+            continue
+
+        ts_ms = ts_sec * 1000
+        lines.append(f"{metric_name}{label_str} {val:.6f} {ts_ms}")
+
+    if not lines:
+        return False
+
+    payload = "\n".join(lines) + "\n"
+
+    try:
+        resp = requests.post(
+            f"{VM_URL}/api/v1/import/prometheus",
+            data=payload.encode("utf-8"),
+            headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
+            timeout=10,
+        )
+        resp.raise_for_status()
+        return True
+    except requests.RequestException as e:
+        logger.error("写入数据失败 metric=%s: %s", metric_name, e)
+        return False
+
+
+def write_prediction_bundle(
+    pred_metric: str,
+    anomaly_metric: str,
+    labels: Dict[str, str],
+    ts_future: List[int],
+    pred_values: np.ndarray,
+    lower_values: np.ndarray,
+    upper_values: np.ndarray,
+    is_anomaly: bool,
+    outside_ratio: float,
+    mean_abs_err: float,
+    mean_rel_err: float,
+) -> bool:
+    ok1 = write_series(
+        metric_name=pred_metric,
+        labels=labels,
+        ts_list=ts_future,
+        values=pred_values.astype(float).tolist(),
+    )
+
+    ok2 = write_series(
+        metric_name=f"{pred_metric}_lower",
+        labels=labels,
+        ts_list=ts_future,
+        values=lower_values.astype(float).tolist(),
+    )
+
+    ok3 = write_series(
+        metric_name=f"{pred_metric}_upper",
+        labels=labels,
+        ts_list=ts_future,
+        values=upper_values.astype(float).tolist(),
+    )
+
+    now_sec = int(time.time())
+
+    anomaly_labels = dict(labels)
+    anomaly_labels["type"] = "prediction_deviation"
+
+    ok4 = write_series(
+        metric_name=anomaly_metric,
+        labels=anomaly_labels,
+        ts_list=[now_sec],
+        values=[1.0 if is_anomaly else 0.0],
+    )
+
+    ok5 = write_series(
+        metric_name=f"{anomaly_metric}_outside_ratio",
+        labels=anomaly_labels,
+        ts_list=[now_sec],
+        values=[outside_ratio],
+    )
+
+    ok6 = write_series(
+        metric_name=f"{anomaly_metric}_mean_abs_error",
+        labels=anomaly_labels,
+        ts_list=[now_sec],
+        values=[mean_abs_err],
+    )
+
+    ok7 = write_series(
+        metric_name=f"{anomaly_metric}_mean_rel_error",
+        labels=anomaly_labels,
+        ts_list=[now_sec],
+        values=[mean_rel_err],
+    )
+
+    return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7
+
+
+# ── 标签解析 ──────────────────────────────────────────────────────────────────
+
+_LABEL_PATTERN = re.compile(
+    r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*'
+)
+
+
+def _parse_labels(query: str) -> Dict[str, str]:
+    labels = {}
+
+    if "{" not in query or "}" not in query:
+        return labels
+
+    try:
+        label_part = query[query.index("{") + 1: query.rindex("}")]
+    except Exception:
+        return labels
+
+    for match in _LABEL_PATTERN.finditer(label_part):
+        key = match.group(1)
+        value = match.group(2)
+        value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\")
+        labels[key] = value
+
+    return labels
+
+
+def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]:
+    result = {}
+
+    for d in dicts:
+        if d:
+            result.update(d)
+
+    return result
+
+
+def series_key(metric_name: str, labels: Dict[str, str]) -> str:
+    return metric_name + labels_to_str(labels)
+
+
+# ── 状态持久化 ────────────────────────────────────────────────────────────────
+
+def load_state():
+    global BASELINE_STATES
+
+    if not os.path.exists(STATE_FILE):
+        return
+
+    try:
+        with open(STATE_FILE, "r", encoding="utf-8") as f:
+            raw = json.load(f)
+
+        states = {}
+
+        for key, value in raw.get("baseline_states", {}).items():
+            states[key] = BaselineState(**value)
+
+        BASELINE_STATES = states
+
+        logger.info("已加载预测状态文件 %s，状态数量=%d", STATE_FILE, len(BASELINE_STATES))
+
+    except Exception as e:
+        logger.warning("加载预测状态文件失败，将重新学习: %s", e)
+
+
+def save_state():
+    try:
+        raw = {
+            "baseline_states": {
+                key: asdict(value)
+                for key, value in BASELINE_STATES.items()
+            }
+        }
+
+        tmp_file = STATE_FILE + ".tmp"
+
+        with open(tmp_file, "w", encoding="utf-8") as f:
+            json.dump(raw, f, ensure_ascii=False, indent=2)
+
+        os.replace(tmp_file, STATE_FILE)
+
+    except Exception as e:
+        logger.warning("保存预测状态文件失败: %s", e)
+
+
+# ── 主逻辑 ────────────────────────────────────────────────────────────────────
+
+def run_once():
+    now_str = datetime.now().strftime("%H:%M:%S")
+
+    for target in PREDICT_TARGETS:
+        query = target["query"]
+        pred_metric = target["pred_metric"]
+        anomaly_metric = target["anomaly_metric"]
+        abs_threshold = float(target["abs_threshold"])
+        rel_threshold = float(target["rel_threshold"])
+
+        ts, ys = fetch_history(query)
+
+        if len(ys) < MIN_POINTS:
+            logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
+            continue
+
+        ts_grid, ys_grid = normalize_history(ts, ys)
+
+        if len(ys_grid) < MIN_POINTS:
+            logger.info("[%s] %s 清洗后数据不足（%d 点），跳过", now_str, query, len(ys_grid))
+            continue
+
+        base_labels = _parse_labels(query)
+        write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
+
+        key = series_key(pred_metric, write_labels)
+
+        state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state(
+            key=key,
+            ts_grid=ts_grid,
+            ys_grid=ys_grid,
+            abs_threshold=abs_threshold,
+            rel_threshold=rel_threshold,
+        )
+
+        if state is None:
+            logger.info("[%s] %s 暂无可用健康模板，等待学习", now_str, query)
+            continue
+
+        now_sec = int(time.time())
+        last_until = LAST_WRITTEN_UNTIL.get(key, 0)
+        last_real_ts = int(ts_grid[-1])
+
+        base_ts = max(now_sec, last_until, last_real_ts)
+
+        ts_future = [
+            base_ts + i + 1
+            for i in range(WRITE_HORIZON_SECONDS)
+        ]
+
+        pred_values = predict_by_state(state, ts_future)
+
+        lower_values, upper_values = calc_bounds(
+            pred=pred_values,
+            abs_threshold=abs_threshold,
+            rel_threshold=rel_threshold,
+        )
+
+        ok = write_prediction_bundle(
+            pred_metric=pred_metric,
+            anomaly_metric=anomaly_metric,
+            labels=write_labels,
+            ts_future=ts_future,
+            pred_values=pred_values,
+            lower_values=lower_values,
+            upper_values=upper_values,
+            is_anomaly=is_anomaly,
+            outside_ratio=outside_ratio,
+            mean_abs_err=mean_abs_err,
+            mean_rel_err=mean_rel_err,
+        )
+
+        if not ok:
+            continue
+
+        LAST_WRITTEN_UNTIL[key] = int(max(ts_future))
+
+        future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S")
+        future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
+
+        logger.info(
+            "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss clean=%ss 写入 %d 点，预测区间 %s ~ %s",
+            now_str,
+            query,
+            pred_metric,
+            state.status,
+            is_anomaly,
+            state.period,
+            state.clean_seconds,
+            len(ts_future),
+            future_start,
+            future_end,
+        )
+
+    save_state()
+
+
+def main():
+    load_state()
+
+    logger.info(
+        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s",
+        VM_URL,
+        HISTORY_MINUTES,
+        HORIZON_SECONDS,
+        WRITE_HORIZON_SECONDS,
+        POLL_INTERVAL,
+        STATE_FILE,
+    )
+
+    while True:
+        run_once()
+        time.sleep(POLL_INTERVAL)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 54c4b851a004567078cfec337933aafcbd676b44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Thu, 21 May 2026 13:39:33 +0800
Subject: [PATCH 23/31] fix

---
 ai/predict_v3_single_scene.py | 701 +++++++++++++++++++++++-----------
 1 file changed, 488 insertions(+), 213 deletions(-)
 mode change 100644 => 100755 ai/predict_v3_single_scene.py

diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py
old mode 100644
new mode 100755
index 23af8c5..fc07f4f
--- a/ai/predict_v3_single_scene.py
+++ b/ai/predict_v3_single_scene.py
@@ -1,18 +1,22 @@
 # -*- coding: utf-8 -*-
 """
-ProtoForge 预测服务 v6
-
-核心能力：
-1. 周期模板预测：适合 CNC 这类强周期、非标准正弦波形。
-2. 健康基线冻结：检测到异常后，不再用故障数据更新预测模板。
-3. 恢复冷却机制：故障恢复后，需要连续稳定多个周期，才恢复学习。
-4. 预测上下界：写入 predicted_upper / predicted_lower，方便 Grafana 展示预测带。
-5. 异常标记：写入 xxx_anomaly，1 表示异常，0 表示正常。
-6. 不删除历史预测，不使用 delete_series。
-"""
-
-"""
-场景：不考虑物料、不考虑跨程序场景算法预测
+ProtoForge Predictor v8
+
+功能：
+1. 从 VictoriaMetrics 拉取历史数据。
+2. 对 CNC 周期型指标进行相位对齐预测。
+3. 使用“谷底锚点”对齐周期，减少上升沿/下降沿相位偏差。
+4. 每轮只写入未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒，避免预测窗口重叠。
+5. 检测异常后冻结健康模板，不把故障数据学进去。
+6. 故障恢复后等待稳定一段时间，再恢复模板更新。
+7. 写入：
+   - xxx_predicted
+   - xxx_predicted_upper
+   - xxx_predicted_lower
+   - xxx_anomaly
+   - xxx_anomaly_outside_ratio
+   - xxx_anomaly_mean_abs_error
+   - xxx_anomaly_mean_rel_error
 """
 
 import json
@@ -29,6 +33,10 @@
 import requests
 
 
+# =============================================================================
+# 日志配置
+# =============================================================================
+
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(message)s",
@@ -37,18 +45,19 @@
 logger = logging.getLogger(__name__)
 
 
-# ── 基础配置 ──────────────────────────────────────────────────────────────────
+# =============================================================================
+# 基础配置
+# =============================================================================
 
 VM_URL = "http://localhost:8428"
 
-STATE_FILE = "/tmp/protoforge_predictor_state.json"
+STATE_FILE = "/tmp/protoforge_predictor_state_v8.json"
 
 HISTORY_MINUTES = 30
 HORIZON_SECONDS = 120
 POLL_INTERVAL = 30
 
-# 实际每轮写入未来多少秒。
-# 不要大于 POLL_INTERVAL，否则多轮预测会重叠。
+# 实际写入窗口不要大于轮询间隔，否则多轮预测会重叠。
 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
 
 QUERY_STEP = "1s"
@@ -57,34 +66,25 @@
 MIN_PERIOD_SECONDS = 5
 MAX_PERIOD_SECONDS = 3600
 
-# 至少多少个完整周期才允许构建健康模板
 MIN_FULL_CYCLES_FOR_TEMPLATE = 3
-
-# 构建模板最多使用最近多少个周期
 MAX_CYCLES_FOR_TEMPLATE = 6
 
-# 检测异常使用最近多少秒实际数据
 DETECT_WINDOW_SECONDS = 15
-
-# 恢复后，至少连续正常多少秒才考虑恢复学习
 RECOVERY_MIN_SECONDS = 60
 
-# 健康状态下模板更新速度，越小越保守
-HEALTHY_EMA_ALPHA = 0.15
-
-# 故障恢复后第一次重新学习时的更新速度
-RECOVERY_EMA_ALPHA = 0.35
+HEALTHY_EMA_ALPHA = 0.12
+RECOVERY_EMA_ALPHA = 0.30
 
-# 最近窗口里有多少比例的点超过阈值，才认为异常
 OUTSIDE_RATIO_THRESHOLD = 0.60
-
-# 最近窗口里有多少比例的点回到阈值内，才认为恢复正常
 RECOVERY_INSIDE_RATIO_THRESHOLD = 0.80
 
+PHASE_SEARCH_RATIO = 0.15
+VALLEY_QUANTILE = 45
+
 
-# ── 指标配置 ──────────────────────────────────────────────────────────────────
-# abs_threshold / rel_threshold 需要按指标单位调。
-# feed_rate 单位 mm/min，这里先给 400 和 25%。
+# =============================================================================
+# 预测指标配置
+# =============================================================================
 
 PREDICT_TARGETS = [
     {
@@ -132,21 +132,23 @@
 ]
 
 EXTRA_PREDICT_LABELS = {
-    "forecast": "health_gated_v1",
+    "forecast": "phase_aligned_health_v8",
     "source": "protoforge",
 }
 
 BASELINE_STATUS_HEALTHY = "healthy"
 BASELINE_STATUS_ANOMALY = "anomaly"
 BASELINE_STATUS_RECOVERING = "recovering"
-BASELINE_STATUS_LEARNING = "learning"
 
 
-# ── 状态结构 ──────────────────────────────────────────────────────────────────
+# =============================================================================
+# 状态结构
+# =============================================================================
 
 @dataclass
 class BaselineState:
     period: int
+    phase_origin_ts: int
     template: List[float]
     status: str
     clean_seconds: int
@@ -160,7 +162,9 @@ class BaselineState:
 LAST_WRITTEN_UNTIL: Dict[str, int] = {}
 
 
-# ── VM 读取 ───────────────────────────────────────────────────────────────────
+# =============================================================================
+# VictoriaMetrics 读取
+# =============================================================================
 
 def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]:
     now = datetime.now()
@@ -258,7 +262,25 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.
     return ts_grid, ys_grid
 
 
-# ── 周期估计 ──────────────────────────────────────────────────────────────────
+# =============================================================================
+# 周期估计
+# =============================================================================
+
+def moving_average(arr: np.ndarray, window: int) -> np.ndarray:
+    if window <= 1 or len(arr) < window:
+        return arr.astype(float)
+
+    window = int(window)
+
+    if window % 2 == 0:
+        window += 1
+
+    kernel = np.ones(window, dtype=float) / window
+    pad = window // 2
+    padded = np.pad(arr.astype(float), (pad, pad), mode="edge")
+
+    return np.convolve(padded, kernel, mode="valid")
+
 
 def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
     n = len(ys_arr)
@@ -307,7 +329,7 @@ def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float:
     corr = np.correlate(centered, centered, mode="full")[n - 1:]
 
     p0 = int(round(init_period))
-    left = max(MIN_PERIOD_SECONDS, int(max(2, p0 * 0.7)))
+    left = max(int(MIN_PERIOD_SECONDS), int(max(2, p0 * 0.7)))
     right = min(n // 2, int(max(left + 1, p0 * 1.3)))
 
     if right <= left:
@@ -323,96 +345,252 @@ def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float:
     return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
 
 
-def estimate_period(ys_arr: np.ndarray) -> int:
+def estimate_period_rough(ys_arr: np.ndarray) -> int:
     p_fft = estimate_period_by_fft(ys_arr)
     p_refined = refine_period_by_autocorr(ys_arr, p_fft)
 
     period = int(round(p_refined))
-    period = max(MIN_PERIOD_SECONDS, min(MAX_PERIOD_SECONDS, period))
+    period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period))
 
     return int(period)
 
 
-# ── 模板构建与预测 ─────────────────────────────────────────────────────────────
+# =============================================================================
+# 谷底锚点检测
+# =============================================================================
 
-def fill_template_nan(template: np.ndarray) -> np.ndarray:
-    period = len(template)
+def find_valley_indices(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    expected_period: int,
+) -> List[int]:
+    n = len(ys_grid)
 
-    if period == 0:
-        return template
+    if n < max(10, expected_period * 2):
+        return []
+
+    period = max(3, int(expected_period))
+
+    smooth_window = max(3, int(round(period * 0.08)))
+    smooth_window = min(smooth_window, 21)
+
+    ys_smooth = moving_average(ys_grid, smooth_window)
+    threshold = float(np.percentile(ys_smooth, VALLEY_QUANTILE))
+
+    candidates = []
+
+    for i in range(1, n - 1):
+        if (
+            ys_smooth[i] <= ys_smooth[i - 1]
+            and ys_smooth[i] < ys_smooth[i + 1]
+            and ys_smooth[i] <= threshold
+        ):
+            candidates.append(i)
+
+    if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        candidates = []
+
+        for i in range(1, n - 1):
+            if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]:
+                candidates.append(i)
+
+    if not candidates:
+        return []
 
-    idx = np.arange(period)
-    valid = np.isfinite(template)
+    min_distance = max(2, int(round(period * 0.55)))
+    selected = []
 
-    if not np.any(valid):
-        return np.zeros(period, dtype=float)
+    for idx in candidates:
+        if not selected:
+            selected.append(idx)
+            continue
+
+        if idx - selected[-1] >= min_distance:
+            selected.append(idx)
+            continue
+
+        if ys_smooth[idx] < ys_smooth[selected[-1]]:
+            selected[-1] = idx
 
-    if np.all(valid):
-        return template
+    if len(selected) < 2:
+        return selected
 
-    x_valid = idx[valid]
-    y_valid = template[valid]
+    cleaned = [selected[0]]
+
+    for idx in selected[1:]:
+        diff = int(ts_grid[idx] - ts_grid[cleaned[-1]])
+
+        if int(period * 0.55) <= diff <= int(period * 1.60):
+            cleaned.append(idx)
+            continue
 
-    # 环形插值，处理 phase 0 附近缺口
-    x_ext = np.concatenate([x_valid - period, x_valid, x_valid + period])
-    y_ext = np.concatenate([y_valid, y_valid, y_valid])
+        if diff < int(period * 0.55):
+            if ys_smooth[idx] < ys_smooth[cleaned[-1]]:
+                cleaned[-1] = idx
+            continue
 
-    filled = np.interp(idx, x_ext, y_ext)
+        cleaned.append(idx)
 
-    return filled.astype(float)
+    return cleaned
 
 
-def build_phase_template(
+def detect_period_and_valleys(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+) -> Tuple[int, List[int]]:
+    rough = estimate_period_rough(ys_grid)
+    valleys = find_valley_indices(ts_grid, ys_grid, rough)
+
+    if len(valleys) >= 3:
+        diffs = np.diff(ts_grid[valleys])
+        good = diffs[(diffs >= rough * 0.55) & (diffs <= rough * 1.60)]
+
+        if len(good) > 0:
+            period = int(round(float(np.median(good))))
+        else:
+            period = rough
+    else:
+        period = rough
+
+    period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period))
+
+    return int(period), valleys
+
+
+# =============================================================================
+# 相位对齐模板构建
+# =============================================================================
+
+def build_template_from_valleys(
     ts_grid: np.ndarray,
     ys_grid: np.ndarray,
     period: int,
+    valleys: List[int],
     max_cycles: int = MAX_CYCLES_FOR_TEMPLATE,
-    tail_seconds: Optional[int] = None,
 ) -> Optional[np.ndarray]:
-    if period <= 1 or len(ys_grid) < period * MIN_FULL_CYCLES_FOR_TEMPLATE:
+    if period <= 1 or len(valleys) < MIN_FULL_CYCLES_FOR_TEMPLATE + 1:
         return None
 
-    max_seconds = period * max_cycles
+    pairs = []
+
+    for a, b in zip(valleys[:-1], valleys[1:]):
+        cycle_len = float(ts_grid[b] - ts_grid[a])
 
-    if tail_seconds is not None:
-        max_seconds = min(max_seconds, int(tail_seconds))
+        if period * 0.55 <= cycle_len <= period * 1.60:
+            pairs.append((a, b, cycle_len))
 
-    max_seconds = max(period * MIN_FULL_CYCLES_FOR_TEMPLATE, max_seconds)
+    if len(pairs) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    pairs = pairs[-max_cycles:]
+
+    phase_grid = np.arange(period, dtype=float)
+    segments = []
+    weights = []
+
+    for idx, (a, b, cycle_len) in enumerate(pairs):
+        seg_ts = ts_grid[a:b + 1]
+        seg_y = ys_grid[a:b + 1]
+
+        if len(seg_y) < 3:
+            continue
 
-    if len(ys_grid) < max_seconds:
-        start_idx = 0
+        x_old = (seg_ts - seg_ts[0]) / cycle_len * period
+        seg = np.interp(phase_grid, x_old, seg_y)
+
+        segments.append(seg.astype(float))
+
+        weight = 0.5 + 0.5 * ((idx + 1) / len(pairs))
+        weights.append(weight)
+
+    if len(segments) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    arr = np.vstack(segments)
+    w_arr = np.array(weights, dtype=float)
+
+    template = np.average(arr, axis=0, weights=w_arr)
+
+    return template.astype(float)
+
+
+def build_current_baseline(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    tail_seconds: Optional[int] = None,
+) -> Optional[Tuple[int, int, np.ndarray]]:
+    if len(ys_grid) < MIN_POINTS:
+        return None
+
+    if tail_seconds is not None and tail_seconds > 0:
+        cutoff = ts_grid[-1] - int(tail_seconds)
+        mask = ts_grid >= cutoff
+        ts_use = ts_grid[mask]
+        ys_use = ys_grid[mask]
     else:
-        start_idx = len(ys_grid) - max_seconds
+        ts_use = ts_grid
+        ys_use = ys_grid
+
+    if len(ys_use) < MIN_POINTS:
+        return None
 
-    ts_tail = ts_grid[start_idx:]
-    ys_tail = ys_grid[start_idx:]
+    period, valleys = detect_period_and_valleys(ts_use, ys_use)
 
-    if len(ys_tail) < period * MIN_FULL_CYCLES_FOR_TEMPLATE:
+    template = build_template_from_valleys(
+        ts_grid=ts_use,
+        ys_grid=ys_use,
+        period=period,
+        valleys=valleys,
+    )
+
+    if template is None or len(valleys) == 0:
         return None
 
-    sums = np.zeros(period, dtype=float)
-    weights = np.zeros(period, dtype=float)
+    phase_origin_ts = int(round(float(ts_use[valleys[-1]])))
 
-    total = len(ys_tail)
+    return int(period), phase_origin_ts, template
 
-    for i, (t, y) in enumerate(zip(ts_tail, ys_tail)):
-        phase = int(t) % period
 
-        # 越近的数据权重越高
-        recency = (i + 1) / total
-        weight = 0.3 + 0.7 * recency
+# =============================================================================
+# 模板预测
+# =============================================================================
 
-        sums[phase] += float(y) * weight
-        weights[phase] += weight
+def circular_template_value(template: np.ndarray, phase: float) -> float:
+    period = len(template)
 
-    template = np.full(period, np.nan, dtype=float)
+    if period == 0:
+        return 0.0
+
+    phase = float(phase) % period
 
-    valid = weights > 0
-    template[valid] = sums[valid] / weights[valid]
+    i0 = int(math.floor(phase)) % period
+    i1 = (i0 + 1) % period
 
-    template = fill_template_nan(template)
+    frac = phase - math.floor(phase)
 
-    return template
+    return float((1.0 - frac) * template[i0] + frac * template[i1])
+
+
+def predict_with_origin(
+    state: BaselineState,
+    ts_list: List[int],
+    phase_origin_ts: Optional[int] = None,
+) -> np.ndarray:
+    template = np.array(state.template, dtype=float)
+    period = int(state.period)
+
+    if period <= 1 or len(template) != period:
+        return np.zeros(len(ts_list), dtype=float)
+
+    origin = int(state.phase_origin_ts if phase_origin_ts is None else phase_origin_ts)
+
+    values = []
+
+    for ts in ts_list:
+        phase = (int(ts) - origin) % period
+        values.append(circular_template_value(template, phase))
+
+    return np.array(values, dtype=float)
 
 
 def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
@@ -433,6 +611,38 @@ def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
     return np.interp(new_x, old_x_ext, old_y_ext).astype(float)
 
 
+def align_new_template_to_old(
+    old_template: np.ndarray,
+    new_template: np.ndarray,
+) -> np.ndarray:
+    if len(old_template) != len(new_template):
+        old_template = resample_template(old_template, len(new_template))
+
+    period = len(new_template)
+
+    if period <= 2:
+        return new_template.astype(float)
+
+    max_shift = max(1, int(round(period * 0.10)))
+
+    old_norm = old_template - np.mean(old_template)
+
+    best_score = None
+    best_template = new_template
+
+    for shift in range(-max_shift, max_shift + 1):
+        shifted = np.roll(new_template, shift)
+        shifted_norm = shifted - np.mean(shifted)
+
+        score = float(np.dot(old_norm, shifted_norm))
+
+        if best_score is None or score > best_score:
+            best_score = score
+            best_template = shifted
+
+    return best_template.astype(float)
+
+
 def merge_template(
     old_template: np.ndarray,
     new_template: np.ndarray,
@@ -443,37 +653,64 @@ def merge_template(
     if len(old_template) != len(new_template):
         old_template = resample_template(old_template, len(new_template))
 
-    return ((1.0 - alpha) * old_template + alpha * new_template).astype(float)
-
-
-def predict_by_state(state: BaselineState, ts_list: List[int]) -> np.ndarray:
-    template = np.array(state.template, dtype=float)
-    period = int(state.period)
-
-    if period <= 1 or len(template) != period:
-        return np.zeros(len(ts_list), dtype=float)
+    new_template = align_new_template_to_old(old_template, new_template)
 
-    values = []
+    merged = (1.0 - alpha) * old_template + alpha * new_template
 
-    for ts in ts_list:
-        phase = int(ts) % period
-        values.append(float(template[phase]))
+    return merged.astype(float)
 
-    return np.array(values, dtype=float)
 
+# =============================================================================
+# 异常检测
+# =============================================================================
 
-def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray:
+def calc_threshold(
+    pred: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> np.ndarray:
     return np.maximum(abs_threshold, np.abs(pred) * rel_threshold)
 
 
-def calc_bounds(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> Tuple[np.ndarray, np.ndarray]:
+def calc_bounds(
+    pred: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> Tuple[np.ndarray, np.ndarray]:
     threshold = calc_threshold(pred, abs_threshold, rel_threshold)
+
     lower = pred - threshold
     upper = pred + threshold
+
     return lower, upper
 
 
-# ── 异常检测与状态更新 ────────────────────────────────────────────────────────
+def find_best_phase_origin_for_recent(
+    state: BaselineState,
+    ts_recent: List[int],
+    actual: np.ndarray,
+) -> Tuple[int, np.ndarray, float]:
+    period = int(state.period)
+    base_origin = int(state.phase_origin_ts)
+
+    max_shift = max(1, int(round(period * PHASE_SEARCH_RATIO)))
+
+    best_origin = base_origin
+    best_pred = predict_with_origin(state, ts_recent, base_origin)
+    best_mae = float(np.mean(np.abs(actual - best_pred)))
+
+    for shift in range(-max_shift, max_shift + 1):
+        origin = base_origin + shift
+        pred = predict_with_origin(state, ts_recent, origin)
+        mae = float(np.mean(np.abs(actual - pred)))
+
+        if mae < best_mae:
+            best_mae = mae
+            best_origin = origin
+            best_pred = pred
+
+    return best_origin, best_pred, best_mae
+
 
 def detect_anomaly(
     state: BaselineState,
@@ -481,14 +718,19 @@ def detect_anomaly(
     ys_grid: np.ndarray,
     abs_threshold: float,
     rel_threshold: float,
-) -> Tuple[bool, float, float, float]:
+) -> Tuple[bool, float, float, float, int]:
     if len(ys_grid) < DETECT_WINDOW_SECONDS:
-        return False, 0.0, 0.0, 0.0
+        return False, 0.0, 0.0, 0.0, int(state.phase_origin_ts)
 
     ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
     actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float)
 
-    pred = predict_by_state(state, ts_recent)
+    best_origin, pred, _ = find_best_phase_origin_for_recent(
+        state=state,
+        ts_recent=ts_recent,
+        actual=actual,
+    )
+
     threshold = calc_threshold(pred, abs_threshold, rel_threshold)
 
     abs_err = np.abs(actual - pred)
@@ -500,56 +742,28 @@ def detect_anomaly(
 
     is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD
 
-    return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err
-
-
-def is_recovered(
-    state: BaselineState,
-    ts_grid: np.ndarray,
-    ys_grid: np.ndarray,
-    abs_threshold: float,
-    rel_threshold: float,
-) -> Tuple[bool, float]:
-    if len(ys_grid) < DETECT_WINDOW_SECONDS:
-        return False, 0.0
-
-    ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
-    actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float)
-
-    pred = predict_by_state(state, ts_recent)
-    threshold = calc_threshold(pred, abs_threshold, rel_threshold)
-
-    abs_err = np.abs(actual - pred)
-    inside = abs_err <= threshold
-
-    inside_ratio = float(np.mean(inside))
+    return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err, int(best_origin)
 
-    return inside_ratio >= RECOVERY_INSIDE_RATIO_THRESHOLD, inside_ratio
 
+# =============================================================================
+# 健康基线状态管理
+# =============================================================================
 
 def create_initial_state(
     ts_grid: np.ndarray,
     ys_grid: np.ndarray,
     now_sec: int,
 ) -> Optional[BaselineState]:
-    if len(ys_grid) < MIN_POINTS:
-        return None
-
-    period = estimate_period(ys_grid)
-
-    template = build_phase_template(
-        ts_grid=ts_grid,
-        ys_grid=ys_grid,
-        period=period,
-        max_cycles=MAX_CYCLES_FOR_TEMPLATE,
-        tail_seconds=period * MAX_CYCLES_FOR_TEMPLATE,
-    )
+    baseline = build_current_baseline(ts_grid, ys_grid)
 
-    if template is None:
+    if baseline is None:
         return None
 
+    period, phase_origin_ts, template = baseline
+
     return BaselineState(
         period=int(period),
+        phase_origin_ts=int(phase_origin_ts),
         template=template.astype(float).tolist(),
         status=BASELINE_STATUS_HEALTHY,
         clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE),
@@ -578,19 +792,23 @@ def maybe_update_state(
             return None, False, 0.0, 0.0, 0.0
 
         BASELINE_STATES[key] = state
+
         logger.info(
-            "初始化健康模板 key=%s period=%ss clean_seconds=%ss",
+            "初始化健康模板 key=%s period=%ss origin=%s clean=%ss",
             key,
             state.period,
+            datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
             state.clean_seconds,
         )
+
         return state, False, 0.0, 0.0, 0.0
 
     elapsed = max(1, now_sec - int(state.last_seen_ts))
     elapsed = min(elapsed, POLL_INTERVAL * 2)
+
     state.last_seen_ts = now_sec
 
-    is_anom, outside_ratio, mean_abs_err, mean_rel_err = detect_anomaly(
+    is_anom, outside_ratio, mean_abs_err, mean_rel_err, best_origin = detect_anomaly(
         state=state,
         ts_grid=ts_grid,
         ys_grid=ys_grid,
@@ -602,6 +820,8 @@ def maybe_update_state(
         state.status = BASELINE_STATUS_ANOMALY
         state.clean_seconds = 0
 
+        BASELINE_STATES[key] = state
+
         logger.warning(
             "检测到异常，冻结模板 key=%s outside_ratio=%.2f mean_abs_err=%.2f mean_rel_err=%.2f",
             key,
@@ -610,47 +830,39 @@ def maybe_update_state(
             mean_rel_err,
         )
 
-        BASELINE_STATES[key] = state
         return state, True, outside_ratio, mean_abs_err, mean_rel_err
 
-    recovered, inside_ratio = is_recovered(
-        state=state,
-        ts_grid=ts_grid,
-        ys_grid=ys_grid,
-        abs_threshold=abs_threshold,
-        rel_threshold=rel_threshold,
-    )
+    old_origin = int(state.phase_origin_ts)
+    state.phase_origin_ts = int(best_origin)
+
+    if abs(state.phase_origin_ts - old_origin) >= 1:
+        logger.debug(
+            "相位校正 key=%s origin %s -> %s",
+            key,
+            datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"),
+            datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
+        )
 
     if state.status == BASELINE_STATUS_ANOMALY:
-        if recovered:
-            state.status = BASELINE_STATUS_RECOVERING
-            state.clean_seconds = elapsed
-            logger.info(
-                "异常开始恢复 key=%s inside_ratio=%.2f clean_seconds=%ss",
-                key,
-                inside_ratio,
-                state.clean_seconds,
-            )
-        else:
-            state.clean_seconds = 0
-            BASELINE_STATES[key] = state
-            return state, True, outside_ratio, mean_abs_err, mean_rel_err
+        state.status = BASELINE_STATUS_RECOVERING
+        state.clean_seconds = elapsed
 
-    elif state.status == BASELINE_STATUS_RECOVERING:
-        if recovered:
-            state.clean_seconds += elapsed
-        else:
-            state.status = BASELINE_STATUS_ANOMALY
-            state.clean_seconds = 0
-            BASELINE_STATES[key] = state
-            return state, True, outside_ratio, mean_abs_err, mean_rel_err
+        BASELINE_STATES[key] = state
 
+        logger.info(
+            "异常开始恢复 key=%s clean_seconds=%ss",
+            key,
+            state.clean_seconds,
+        )
+
+        return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+    if state.status == BASELINE_STATUS_RECOVERING:
+        state.clean_seconds += elapsed
     else:
         state.status = BASELINE_STATUS_HEALTHY
         state.clean_seconds += elapsed
 
-    # 故障恢复后，不要立刻学习。
-    # 必须至少连续正常：max(RECOVERY_MIN_SECONDS, 3 个周期)
     min_clean_for_update = max(
         RECOVERY_MIN_SECONDS,
         int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE,
@@ -660,30 +872,26 @@ def maybe_update_state(
         BASELINE_STATES[key] = state
         return state, False, outside_ratio, mean_abs_err, mean_rel_err
 
-    # 只使用最近 clean_seconds 这段连续正常数据来更新模板，避免历史故障污染。
-    new_period = estimate_period(ys_grid)
     tail_seconds = min(
         int(state.clean_seconds),
-        int(new_period) * MAX_CYCLES_FOR_TEMPLATE,
+        int(state.period) * MAX_CYCLES_FOR_TEMPLATE,
     )
 
-    new_template = build_phase_template(
+    baseline = build_current_baseline(
         ts_grid=ts_grid,
         ys_grid=ys_grid,
-        period=new_period,
-        max_cycles=MAX_CYCLES_FOR_TEMPLATE,
         tail_seconds=tail_seconds,
     )
 
-    if new_template is None:
+    if baseline is None:
         BASELINE_STATES[key] = state
         return state, False, outside_ratio, mean_abs_err, mean_rel_err
 
+    new_period, new_origin, new_template = baseline
     old_template = np.array(state.template, dtype=float)
 
     if state.status == BASELINE_STATUS_RECOVERING:
         alpha = RECOVERY_EMA_ALPHA
-        state.status = BASELINE_STATUS_HEALTHY
     else:
         alpha = HEALTHY_EMA_ALPHA
 
@@ -694,18 +902,25 @@ def maybe_update_state(
     )
 
     state.period = int(new_period)
+    state.phase_origin_ts = int(new_origin)
     state.template = merged.astype(float).tolist()
+    state.status = BASELINE_STATUS_HEALTHY
     state.last_update_ts = now_sec
-    state.y_min = float(np.min(ys_grid[-tail_seconds:]))
-    state.y_max = float(np.max(ys_grid[-tail_seconds:]))
+
+    if tail_seconds > 0 and len(ys_grid) >= tail_seconds:
+        state.y_min = float(np.min(ys_grid[-tail_seconds:]))
+        state.y_max = float(np.max(ys_grid[-tail_seconds:]))
+    else:
+        state.y_min = float(np.min(ys_grid))
+        state.y_max = float(np.max(ys_grid))
 
     BASELINE_STATES[key] = state
 
     logger.info(
-        "更新健康模板 key=%s period=%ss status=%s clean_seconds=%ss alpha=%.2f",
+        "更新健康模板 key=%s period=%ss origin=%s clean=%ss alpha=%.2f",
         key,
         state.period,
-        state.status,
+        datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
         state.clean_seconds,
         alpha,
     )
@@ -713,7 +928,9 @@ def maybe_update_state(
     return state, False, outside_ratio, mean_abs_err, mean_rel_err
 
 
-# ── Prometheus 格式写入 ───────────────────────────────────────────────────────
+# =============================================================================
+# Prometheus Exposition 写入
+# =============================================================================
 
 def prom_escape_label_value(value: str) -> str:
     return (
@@ -731,8 +948,7 @@ def labels_to_str(labels: Dict[str, str]) -> str:
     parts = []
 
     for k in sorted(labels.keys()):
-        v = prom_escape_label_value(labels[k])
-        parts.append(f'{k}="{v}"')
+        parts.append(f'{k}="{prom_escape_label_value(labels[k])}"')
 
     return "{" + ",".join(parts) + "}"
 
@@ -771,11 +987,14 @@ def write_series(
         resp = requests.post(
             f"{VM_URL}/api/v1/import/prometheus",
             data=payload.encode("utf-8"),
-            headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
+            headers={
+                "Content-Type": "text/plain; version=0.0.4; charset=utf-8",
+            },
             timeout=10,
         )
         resp.raise_for_status()
         return True
+
     except requests.RequestException as e:
         logger.error("写入数据失败 metric=%s: %s", metric_name, e)
         return False
@@ -851,28 +1070,37 @@ def write_prediction_bundle(
     return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7
 
 
-# ── 标签解析 ──────────────────────────────────────────────────────────────────
+# =============================================================================
+# 标签解析
+# =============================================================================
 
 _LABEL_PATTERN = re.compile(
     r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*'
 )
 
 
-def _parse_labels(query: str) -> Dict[str, str]:
+def parse_labels_from_query(query: str) -> Dict[str, str]:
     labels = {}
 
     if "{" not in query or "}" not in query:
         return labels
 
     try:
-        label_part = query[query.index("{") + 1: query.rindex("}")]
+        label_part = query[query.index("{") + 1:query.rindex("}")]
     except Exception:
         return labels
 
     for match in _LABEL_PATTERN.finditer(label_part):
         key = match.group(1)
         value = match.group(2)
-        value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\")
+
+        value = (
+            value
+            .replace('\\"', '"')
+            .replace("\\n", "\n")
+            .replace("\\\\", "\\")
+        )
+
         labels[key] = value
 
     return labels
@@ -892,9 +1120,11 @@ def series_key(metric_name: str, labels: Dict[str, str]) -> str:
     return metric_name + labels_to_str(labels)
 
 
-# ── 状态持久化 ────────────────────────────────────────────────────────────────
+# =============================================================================
+# 状态持久化
+# =============================================================================
 
-def load_state():
+def load_state() -> None:
     global BASELINE_STATES
 
     if not os.path.exists(STATE_FILE):
@@ -907,17 +1137,36 @@ def load_state():
         states = {}
 
         for key, value in raw.get("baseline_states", {}).items():
+            required_fields = {
+                "period",
+                "phase_origin_ts",
+                "template",
+                "status",
+                "clean_seconds",
+                "last_update_ts",
+                "last_seen_ts",
+                "y_min",
+                "y_max",
+            }
+
+            if not required_fields.issubset(set(value.keys())):
+                continue
+
             states[key] = BaselineState(**value)
 
         BASELINE_STATES = states
 
-        logger.info("已加载预测状态文件 %s，状态数量=%d", STATE_FILE, len(BASELINE_STATES))
+        logger.info(
+            "已加载预测状态文件 %s，状态数量=%d",
+            STATE_FILE,
+            len(BASELINE_STATES),
+        )
 
     except Exception as e:
         logger.warning("加载预测状态文件失败，将重新学习: %s", e)
 
 
-def save_state():
+def save_state() -> None:
     try:
         raw = {
             "baseline_states": {
@@ -937,9 +1186,11 @@ def save_state():
         logger.warning("保存预测状态文件失败: %s", e)
 
 
-# ── 主逻辑 ────────────────────────────────────────────────────────────────────
+# =============================================================================
+# 主流程
+# =============================================================================
 
-def run_once():
+def run_once() -> None:
     now_str = datetime.now().strftime("%H:%M:%S")
 
     for target in PREDICT_TARGETS:
@@ -952,16 +1203,26 @@ def run_once():
         ts, ys = fetch_history(query)
 
         if len(ys) < MIN_POINTS:
-            logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
+            logger.info(
+                "[%s] %s 数据不足（%d 点），跳过",
+                now_str,
+                query,
+                len(ys),
+            )
             continue
 
         ts_grid, ys_grid = normalize_history(ts, ys)
 
         if len(ys_grid) < MIN_POINTS:
-            logger.info("[%s] %s 清洗后数据不足（%d 点），跳过", now_str, query, len(ys_grid))
+            logger.info(
+                "[%s] %s 清洗后数据不足（%d 点），跳过",
+                now_str,
+                query,
+                len(ys_grid),
+            )
             continue
 
-        base_labels = _parse_labels(query)
+        base_labels = parse_labels_from_query(query)
         write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
 
         key = series_key(pred_metric, write_labels)
@@ -975,7 +1236,11 @@ def run_once():
         )
 
         if state is None:
-            logger.info("[%s] %s 暂无可用健康模板，等待学习", now_str, query)
+            logger.info(
+                "[%s] %s 暂无可用健康模板，等待学习",
+                now_str,
+                query,
+            )
             continue
 
         now_sec = int(time.time())
@@ -989,7 +1254,7 @@ def run_once():
             for i in range(WRITE_HORIZON_SECONDS)
         ]
 
-        pred_values = predict_by_state(state, ts_future)
+        pred_values = predict_with_origin(state, ts_future)
 
         lower_values, upper_values = calc_bounds(
             pred=pred_values,
@@ -1012,21 +1277,28 @@ def run_once():
         )
 
         if not ok:
+            logger.error(
+                "[%s] %s 写入预测数据失败",
+                now_str,
+                query,
+            )
             continue
 
         LAST_WRITTEN_UNTIL[key] = int(max(ts_future))
 
         future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S")
         future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
+        origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S")
 
         logger.info(
-            "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss clean=%ss 写入 %d 点，预测区间 %s ~ %s",
+            "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss origin=%s clean=%ss 写入 %d 点，预测区间 %s ~ %s",
             now_str,
             query,
             pred_metric,
             state.status,
             is_anomaly,
             state.period,
+            origin_str,
             state.clean_seconds,
             len(ts_future),
             future_start,
@@ -1036,17 +1308,18 @@ def run_once():
     save_state()
 
 
-def main():
+def main() -> None:
     load_state()
 
     logger.info(
-        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s",
+        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s",
         VM_URL,
         HISTORY_MINUTES,
         HORIZON_SECONDS,
         WRITE_HORIZON_SECONDS,
         POLL_INTERVAL,
         STATE_FILE,
+        EXTRA_PREDICT_LABELS["forecast"],
     )
 
     while True:
@@ -1055,4 +1328,6 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
+
+    
\ No newline at end of file

From 76e536eff9beb9e3f9db7453bbeb6f1c7844c198 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Thu, 21 May 2026 13:55:08 +0800
Subject: [PATCH 24/31] fix

---
 ai/predict_v3_single_scene.py | 316 ++++++++++++----------------------
 1 file changed, 110 insertions(+), 206 deletions(-)

diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py
index fc07f4f..2cde8b8 100755
--- a/ai/predict_v3_single_scene.py
+++ b/ai/predict_v3_single_scene.py
@@ -1,22 +1,12 @@
 # -*- coding: utf-8 -*-
 """
-ProtoForge Predictor v8
-
-功能：
-1. 从 VictoriaMetrics 拉取历史数据。
-2. 对 CNC 周期型指标进行相位对齐预测。
-3. 使用“谷底锚点”对齐周期，减少上升沿/下降沿相位偏差。
-4. 每轮只写入未来 min(HORIZON_SECONDS, POLL_INTERVAL) 秒，避免预测窗口重叠。
-5. 检测异常后冻结健康模板，不把故障数据学进去。
-6. 故障恢复后等待稳定一段时间，再恢复模板更新。
-7. 写入：
-   - xxx_predicted
-   - xxx_predicted_upper
-   - xxx_predicted_lower
-   - xxx_anomaly
-   - xxx_anomaly_outside_ratio
-   - xxx_anomaly_mean_abs_error
-   - xxx_anomaly_mean_rel_error
+ProtoForge Predictor v9
+
+修复重点：
+1. 预测时间轴改为锚定最后一个真实数据点 last_real_ts，而不是锚定 time.time()。
+2. 不再使用 LAST_WRITTEN_UNTIL 把预测不断推向更远未来，避免 Grafana 里预测线相对真实线出现延迟/错位。
+3. 如果真实数据时间戳没有推进，则跳过本轮预测写入，避免重复写同一段未来时间造成毛刺。
+4. 保留：相位对齐、健康模板冻结、故障期不学习、恢复后再学习、预测上下界、异常指标。
 """
 
 import json
@@ -50,14 +40,13 @@
 # =============================================================================
 
 VM_URL = "http://localhost:8428"
-
-STATE_FILE = "/tmp/protoforge_predictor_state_v8.json"
+STATE_FILE = "/tmp/protoforge_predictor_state_v9.json"
 
 HISTORY_MINUTES = 30
 HORIZON_SECONDS = 120
 POLL_INTERVAL = 30
 
-# 实际写入窗口不要大于轮询间隔，否则多轮预测会重叠。
+# 实际每轮写入的预测长度。不要大于 POLL_INTERVAL，否则容易出现预测窗口重叠。
 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
 
 QUERY_STEP = "1s"
@@ -76,14 +65,23 @@
 RECOVERY_EMA_ALPHA = 0.30
 
 OUTSIDE_RATIO_THRESHOLD = 0.60
-RECOVERY_INSIDE_RATIO_THRESHOLD = 0.80
-
 PHASE_SEARCH_RATIO = 0.15
 VALLEY_QUANTILE = 45
 
+# 关键修复：预测时间轴锚定真实数据最后一个点。
+# True：预测从 last_real_ts + 1 开始，适合 Grafana 与真实曲线对齐展示。
+# False：预测从当前系统时间 + 1 开始，适合只看纯未来预测，但容易与有采集延迟的真实数据错位。
+ALIGN_PREDICTION_TO_LAST_REAL_TS = True
+
+# 如果 last_real_ts 距离当前系统时间太久，说明采集链路可能断了，跳过预测，避免用陈旧数据继续画未来线。
+MAX_DATA_LAG_SECONDS = 180
+
+# 真实数据至少推进多少秒，才写入新预测，避免同一段未来时间被反复写入。
+MIN_REAL_ADVANCE_SECONDS = 1
+
 
 # =============================================================================
-# 预测指标配置
+# 指标配置
 # =============================================================================
 
 PREDICT_TARGETS = [
@@ -132,7 +130,7 @@
 ]
 
 EXTRA_PREDICT_LABELS = {
-    "forecast": "phase_aligned_health_v8",
+    "forecast": "phase_aligned_health_v9",
     "source": "protoforge",
 }
 
@@ -159,7 +157,10 @@ class BaselineState:
 
 
 BASELINE_STATES: Dict[str, BaselineState] = {}
-LAST_WRITTEN_UNTIL: Dict[str, int] = {}
+
+# 记录每条序列最后一次使用的真实数据时间戳，而不是预测写到哪里。
+# 这样不会把预测不断推向更远的未来。
+LAST_REAL_TS_WRITTEN: Dict[str, int] = {}
 
 
 # =============================================================================
@@ -243,7 +244,6 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.
         return np.array([]), np.array([])
 
     sorted_items = sorted(data.items(), key=lambda x: x[0])
-
     ts_clean = np.array([x[0] for x in sorted_items], dtype=float)
     ys_clean = np.array([x[1] for x in sorted_items], dtype=float)
 
@@ -263,7 +263,7 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.
 
 
 # =============================================================================
-# 周期估计
+# 周期估计与谷底检测
 # =============================================================================
 
 def moving_average(arr: np.ndarray, window: int) -> np.ndarray:
@@ -355,10 +355,6 @@ def estimate_period_rough(ys_arr: np.ndarray) -> int:
     return int(period)
 
 
-# =============================================================================
-# 谷底锚点检测
-# =============================================================================
-
 def find_valley_indices(
     ts_grid: np.ndarray,
     ys_grid: np.ndarray,
@@ -370,7 +366,6 @@ def find_valley_indices(
         return []
 
     period = max(3, int(expected_period))
-
     smooth_window = max(3, int(round(period * 0.08)))
     smooth_window = min(smooth_window, 21)
 
@@ -389,7 +384,6 @@ def find_valley_indices(
 
     if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE:
         candidates = []
-
         for i in range(1, n - 1):
             if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]:
                 candidates.append(i)
@@ -458,7 +452,7 @@ def detect_period_and_valleys(
 
 
 # =============================================================================
-# 相位对齐模板构建
+# 相位对齐模板
 # =============================================================================
 
 def build_template_from_valleys(
@@ -552,7 +546,7 @@ def build_current_baseline(
 
 
 # =============================================================================
-# 模板预测
+# 预测与模板合并
 # =============================================================================
 
 def circular_template_value(template: np.ndarray, phase: float) -> float:
@@ -562,10 +556,8 @@ def circular_template_value(template: np.ndarray, phase: float) -> float:
         return 0.0
 
     phase = float(phase) % period
-
     i0 = int(math.floor(phase)) % period
     i1 = (i0 + 1) % period
-
     frac = phase - math.floor(phase)
 
     return float((1.0 - frac) * template[i0] + frac * template[i1])
@@ -583,7 +575,6 @@ def predict_with_origin(
         return np.zeros(len(ts_list), dtype=float)
 
     origin = int(state.phase_origin_ts if phase_origin_ts is None else phase_origin_ts)
-
     values = []
 
     for ts in ts_list:
@@ -611,10 +602,7 @@ def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
     return np.interp(new_x, old_x_ext, old_y_ext).astype(float)
 
 
-def align_new_template_to_old(
-    old_template: np.ndarray,
-    new_template: np.ndarray,
-) -> np.ndarray:
+def align_new_template_to_old(old_template: np.ndarray, new_template: np.ndarray) -> np.ndarray:
     if len(old_template) != len(new_template):
         old_template = resample_template(old_template, len(new_template))
 
@@ -624,7 +612,6 @@ def align_new_template_to_old(
         return new_template.astype(float)
 
     max_shift = max(1, int(round(period * 0.10)))
-
     old_norm = old_template - np.mean(old_template)
 
     best_score = None
@@ -633,7 +620,6 @@ def align_new_template_to_old(
     for shift in range(-max_shift, max_shift + 1):
         shifted = np.roll(new_template, shift)
         shifted_norm = shifted - np.mean(shifted)
-
         score = float(np.dot(old_norm, shifted_norm))
 
         if best_score is None or score > best_score:
@@ -643,18 +629,13 @@ def align_new_template_to_old(
     return best_template.astype(float)
 
 
-def merge_template(
-    old_template: np.ndarray,
-    new_template: np.ndarray,
-    alpha: float,
-) -> np.ndarray:
+def merge_template(old_template: np.ndarray, new_template: np.ndarray, alpha: float) -> np.ndarray:
     alpha = float(np.clip(alpha, 0.0, 1.0))
 
     if len(old_template) != len(new_template):
         old_template = resample_template(old_template, len(new_template))
 
     new_template = align_new_template_to_old(old_template, new_template)
-
     merged = (1.0 - alpha) * old_template + alpha * new_template
 
     return merged.astype(float)
@@ -664,11 +645,7 @@ def merge_template(
 # 异常检测
 # =============================================================================
 
-def calc_threshold(
-    pred: np.ndarray,
-    abs_threshold: float,
-    rel_threshold: float,
-) -> np.ndarray:
+def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray:
     return np.maximum(abs_threshold, np.abs(pred) * rel_threshold)
 
 
@@ -678,11 +655,7 @@ def calc_bounds(
     rel_threshold: float,
 ) -> Tuple[np.ndarray, np.ndarray]:
     threshold = calc_threshold(pred, abs_threshold, rel_threshold)
-
-    lower = pred - threshold
-    upper = pred + threshold
-
-    return lower, upper
+    return pred - threshold, pred + threshold
 
 
 def find_best_phase_origin_for_recent(
@@ -692,7 +665,6 @@ def find_best_phase_origin_for_recent(
 ) -> Tuple[int, np.ndarray, float]:
     period = int(state.period)
     base_origin = int(state.phase_origin_ts)
-
     max_shift = max(1, int(round(period * PHASE_SEARCH_RATIO)))
 
     best_origin = base_origin
@@ -732,14 +704,12 @@ def detect_anomaly(
     )
 
     threshold = calc_threshold(pred, abs_threshold, rel_threshold)
-
     abs_err = np.abs(actual - pred)
     outside = abs_err > threshold
 
     outside_ratio = float(np.mean(outside))
     mean_abs_err = float(np.mean(abs_err))
     mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1.0)))
-
     is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD
 
     return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err, int(best_origin)
@@ -749,11 +719,7 @@ def detect_anomaly(
 # 健康基线状态管理
 # =============================================================================
 
-def create_initial_state(
-    ts_grid: np.ndarray,
-    ys_grid: np.ndarray,
-    now_sec: int,
-) -> Optional[BaselineState]:
+def create_initial_state(ts_grid: np.ndarray, ys_grid: np.ndarray, now_sec: int) -> Optional[BaselineState]:
     baseline = build_current_baseline(ts_grid, ys_grid)
 
     if baseline is None:
@@ -782,7 +748,6 @@ def maybe_update_state(
     rel_threshold: float,
 ) -> Tuple[Optional[BaselineState], bool, float, float, float]:
     now_sec = int(time.time())
-
     state = BASELINE_STATES.get(key)
 
     if state is None:
@@ -805,7 +770,6 @@ def maybe_update_state(
 
     elapsed = max(1, now_sec - int(state.last_seen_ts))
     elapsed = min(elapsed, POLL_INTERVAL * 2)
-
     state.last_seen_ts = now_sec
 
     is_anom, outside_ratio, mean_abs_err, mean_rel_err, best_origin = detect_anomaly(
@@ -819,7 +783,6 @@ def maybe_update_state(
     if is_anom:
         state.status = BASELINE_STATUS_ANOMALY
         state.clean_seconds = 0
-
         BASELINE_STATES[key] = state
 
         logger.warning(
@@ -846,15 +809,9 @@ def maybe_update_state(
     if state.status == BASELINE_STATUS_ANOMALY:
         state.status = BASELINE_STATUS_RECOVERING
         state.clean_seconds = elapsed
-
         BASELINE_STATES[key] = state
 
-        logger.info(
-            "异常开始恢复 key=%s clean_seconds=%ss",
-            key,
-            state.clean_seconds,
-        )
-
+        logger.info("异常开始恢复 key=%s clean_seconds=%ss", key, state.clean_seconds)
         return state, False, outside_ratio, mean_abs_err, mean_rel_err
 
     if state.status == BASELINE_STATUS_RECOVERING:
@@ -877,11 +834,7 @@ def maybe_update_state(
         int(state.period) * MAX_CYCLES_FOR_TEMPLATE,
     )
 
-    baseline = build_current_baseline(
-        ts_grid=ts_grid,
-        ys_grid=ys_grid,
-        tail_seconds=tail_seconds,
-    )
+    baseline = build_current_baseline(ts_grid=ts_grid, ys_grid=ys_grid, tail_seconds=tail_seconds)
 
     if baseline is None:
         BASELINE_STATES[key] = state
@@ -889,17 +842,9 @@ def maybe_update_state(
 
     new_period, new_origin, new_template = baseline
     old_template = np.array(state.template, dtype=float)
+    alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA
 
-    if state.status == BASELINE_STATUS_RECOVERING:
-        alpha = RECOVERY_EMA_ALPHA
-    else:
-        alpha = HEALTHY_EMA_ALPHA
-
-    merged = merge_template(
-        old_template=old_template,
-        new_template=new_template,
-        alpha=alpha,
-    )
+    merged = merge_template(old_template=old_template, new_template=new_template, alpha=alpha)
 
     state.period = int(new_period)
     state.phase_origin_ts = int(new_origin)
@@ -933,12 +878,7 @@ def maybe_update_state(
 # =============================================================================
 
 def prom_escape_label_value(value: str) -> str:
-    return (
-        str(value)
-        .replace("\\", "\\\\")
-        .replace("\n", "\\n")
-        .replace('"', '\\"')
-    )
+    return str(value).replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
 
 
 def labels_to_str(labels: Dict[str, str]) -> str:
@@ -975,8 +915,7 @@ def write_series(
         if not math.isfinite(ts_sec) or not math.isfinite(val):
             continue
 
-        ts_ms = ts_sec * 1000
-        lines.append(f"{metric_name}{label_str} {val:.6f} {ts_ms}")
+        lines.append(f"{metric_name}{label_str} {val:.6f} {ts_sec * 1000}")
 
     if not lines:
         return False
@@ -987,9 +926,7 @@ def write_series(
         resp = requests.post(
             f"{VM_URL}/api/v1/import/prometheus",
             data=payload.encode("utf-8"),
-            headers={
-                "Content-Type": "text/plain; version=0.0.4; charset=utf-8",
-            },
+            headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
             timeout=10,
         )
         resp.raise_for_status()
@@ -1012,60 +949,19 @@ def write_prediction_bundle(
     outside_ratio: float,
     mean_abs_err: float,
     mean_rel_err: float,
+    event_ts: int,
 ) -> bool:
-    ok1 = write_series(
-        metric_name=pred_metric,
-        labels=labels,
-        ts_list=ts_future,
-        values=pred_values.astype(float).tolist(),
-    )
-
-    ok2 = write_series(
-        metric_name=f"{pred_metric}_lower",
-        labels=labels,
-        ts_list=ts_future,
-        values=lower_values.astype(float).tolist(),
-    )
-
-    ok3 = write_series(
-        metric_name=f"{pred_metric}_upper",
-        labels=labels,
-        ts_list=ts_future,
-        values=upper_values.astype(float).tolist(),
-    )
-
-    now_sec = int(time.time())
+    ok1 = write_series(pred_metric, labels, ts_future, pred_values.astype(float).tolist())
+    ok2 = write_series(f"{pred_metric}_lower", labels, ts_future, lower_values.astype(float).tolist())
+    ok3 = write_series(f"{pred_metric}_upper", labels, ts_future, upper_values.astype(float).tolist())
 
     anomaly_labels = dict(labels)
     anomaly_labels["type"] = "prediction_deviation"
 
-    ok4 = write_series(
-        metric_name=anomaly_metric,
-        labels=anomaly_labels,
-        ts_list=[now_sec],
-        values=[1.0 if is_anomaly else 0.0],
-    )
-
-    ok5 = write_series(
-        metric_name=f"{anomaly_metric}_outside_ratio",
-        labels=anomaly_labels,
-        ts_list=[now_sec],
-        values=[outside_ratio],
-    )
-
-    ok6 = write_series(
-        metric_name=f"{anomaly_metric}_mean_abs_error",
-        labels=anomaly_labels,
-        ts_list=[now_sec],
-        values=[mean_abs_err],
-    )
-
-    ok7 = write_series(
-        metric_name=f"{anomaly_metric}_mean_rel_error",
-        labels=anomaly_labels,
-        ts_list=[now_sec],
-        values=[mean_rel_err],
-    )
+    ok4 = write_series(anomaly_metric, anomaly_labels, [event_ts], [1.0 if is_anomaly else 0.0])
+    ok5 = write_series(f"{anomaly_metric}_outside_ratio", anomaly_labels, [event_ts], [outside_ratio])
+    ok6 = write_series(f"{anomaly_metric}_mean_abs_error", anomaly_labels, [event_ts], [mean_abs_err])
+    ok7 = write_series(f"{anomaly_metric}_mean_rel_error", anomaly_labels, [event_ts], [mean_rel_err])
 
     return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7
 
@@ -1074,9 +970,7 @@ def write_prediction_bundle(
 # 标签解析
 # =============================================================================
 
-_LABEL_PATTERN = re.compile(
-    r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*'
-)
+_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*')
 
 
 def parse_labels_from_query(query: str) -> Dict[str, str]:
@@ -1093,14 +987,7 @@ def parse_labels_from_query(query: str) -> Dict[str, str]:
     for match in _LABEL_PATTERN.finditer(label_part):
         key = match.group(1)
         value = match.group(2)
-
-        value = (
-            value
-            .replace('\\"', '"')
-            .replace("\\n", "\n")
-            .replace("\\\\", "\\")
-        )
-
+        value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\")
         labels[key] = value
 
     return labels
@@ -1155,12 +1042,7 @@ def load_state() -> None:
             states[key] = BaselineState(**value)
 
         BASELINE_STATES = states
-
-        logger.info(
-            "已加载预测状态文件 %s，状态数量=%d",
-            STATE_FILE,
-            len(BASELINE_STATES),
-        )
+        logger.info("已加载预测状态文件 %s，状态数量=%d", STATE_FILE, len(BASELINE_STATES))
 
     except Exception as e:
         logger.warning("加载预测状态文件失败，将重新学习: %s", e)
@@ -1186,6 +1068,44 @@ def save_state() -> None:
         logger.warning("保存预测状态文件失败: %s", e)
 
 
+# =============================================================================
+# 时间轴选择
+# =============================================================================
+
+def build_prediction_timestamps(key: str, last_real_ts: int, now_sec: int) -> Optional[List[int]]:
+    data_lag = now_sec - last_real_ts
+
+    if data_lag > MAX_DATA_LAG_SECONDS:
+        logger.warning(
+            "真实数据延迟过大，跳过预测 key=%s data_lag=%ss max=%ss",
+            key,
+            data_lag,
+            MAX_DATA_LAG_SECONDS,
+        )
+        return None
+
+    last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key)
+
+    if last_written_real_ts is not None:
+        advance = last_real_ts - int(last_written_real_ts)
+
+        if advance < MIN_REAL_ADVANCE_SECONDS:
+            logger.info(
+                "真实数据时间戳未推进，跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s",
+                key,
+                last_real_ts,
+                last_written_real_ts,
+            )
+            return None
+
+    if ALIGN_PREDICTION_TO_LAST_REAL_TS:
+        base_ts = last_real_ts
+    else:
+        base_ts = now_sec
+
+    return [base_ts + i + 1 for i in range(WRITE_HORIZON_SECONDS)]
+
+
 # =============================================================================
 # 主流程
 # =============================================================================
@@ -1203,28 +1123,17 @@ def run_once() -> None:
         ts, ys = fetch_history(query)
 
         if len(ys) < MIN_POINTS:
-            logger.info(
-                "[%s] %s 数据不足（%d 点），跳过",
-                now_str,
-                query,
-                len(ys),
-            )
+            logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
             continue
 
         ts_grid, ys_grid = normalize_history(ts, ys)
 
         if len(ys_grid) < MIN_POINTS:
-            logger.info(
-                "[%s] %s 清洗后数据不足（%d 点），跳过",
-                now_str,
-                query,
-                len(ys_grid),
-            )
+            logger.info("[%s] %s 清洗后数据不足（%d 点），跳过", now_str, query, len(ys_grid))
             continue
 
         base_labels = parse_labels_from_query(query)
         write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
-
         key = series_key(pred_metric, write_labels)
 
         state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state(
@@ -1236,26 +1145,23 @@ def run_once() -> None:
         )
 
         if state is None:
-            logger.info(
-                "[%s] %s 暂无可用健康模板，等待学习",
-                now_str,
-                query,
-            )
+            logger.info("[%s] %s 暂无可用健康模板，等待学习", now_str, query)
             continue
 
         now_sec = int(time.time())
-        last_until = LAST_WRITTEN_UNTIL.get(key, 0)
         last_real_ts = int(ts_grid[-1])
+        data_lag = now_sec - last_real_ts
 
-        base_ts = max(now_sec, last_until, last_real_ts)
+        ts_future = build_prediction_timestamps(
+            key=key,
+            last_real_ts=last_real_ts,
+            now_sec=now_sec,
+        )
 
-        ts_future = [
-            base_ts + i + 1
-            for i in range(WRITE_HORIZON_SECONDS)
-        ]
+        if not ts_future:
+            continue
 
         pred_values = predict_with_origin(state, ts_future)
-
         lower_values, upper_values = calc_bounds(
             pred=pred_values,
             abs_threshold=abs_threshold,
@@ -1274,24 +1180,22 @@ def run_once() -> None:
             outside_ratio=outside_ratio,
             mean_abs_err=mean_abs_err,
             mean_rel_err=mean_rel_err,
+            event_ts=last_real_ts,
         )
 
         if not ok:
-            logger.error(
-                "[%s] %s 写入预测数据失败",
-                now_str,
-                query,
-            )
+            logger.error("[%s] %s 写入预测数据失败", now_str, query)
             continue
 
-        LAST_WRITTEN_UNTIL[key] = int(max(ts_future))
+        LAST_REAL_TS_WRITTEN[key] = last_real_ts
 
         future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S")
         future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
+        last_real_str = datetime.fromtimestamp(last_real_ts).strftime("%H:%M:%S")
         origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S")
 
         logger.info(
-            "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss origin=%s clean=%ss 写入 %d 点，预测区间 %s ~ %s",
+            "[%s] %-40s → %-35s status=%s anomaly=%s period=%ss origin=%s last_real=%s lag=%ss 写入 %d 点，预测区间 %s ~ %s",
             now_str,
             query,
             pred_metric,
@@ -1299,7 +1203,8 @@ def run_once() -> None:
             is_anomaly,
             state.period,
             origin_str,
-            state.clean_seconds,
+            last_real_str,
+            data_lag,
             len(ts_future),
             future_start,
             future_end,
@@ -1312,7 +1217,7 @@ def main() -> None:
     load_state()
 
     logger.info(
-        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s",
+        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s align_to_last_real=%s",
         VM_URL,
         HISTORY_MINUTES,
         HORIZON_SECONDS,
@@ -1320,6 +1225,7 @@ def main() -> None:
         POLL_INTERVAL,
         STATE_FILE,
         EXTRA_PREDICT_LABELS["forecast"],
+        ALIGN_PREDICTION_TO_LAST_REAL_TS,
     )
 
     while True:
@@ -1329,5 +1235,3 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
-
-    
\ No newline at end of file

From f5e7b2a27320066b5eb9157f297d6651adcd36c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Thu, 21 May 2026 14:05:54 +0800
Subject: [PATCH 25/31] fix

---
 ai/predict_v3_single_scene.py | 520 +++++++++++++++++++++++++---------
 1 file changed, 385 insertions(+), 135 deletions(-)

diff --git a/ai/predict_v3_single_scene.py b/ai/predict_v3_single_scene.py
index 2cde8b8..d212d2d 100755
--- a/ai/predict_v3_single_scene.py
+++ b/ai/predict_v3_single_scene.py
@@ -1,12 +1,14 @@
 # -*- coding: utf-8 -*-
 """
-ProtoForge Predictor v9
+ProtoForge Predictor v10
 
 修复重点：
-1. 预测时间轴改为锚定最后一个真实数据点 last_real_ts，而不是锚定 time.time()。
-2. 不再使用 LAST_WRITTEN_UNTIL 把预测不断推向更远未来，避免 Grafana 里预测线相对真实线出现延迟/错位。
-3. 如果真实数据时间戳没有推进，则跳过本轮预测写入，避免重复写同一段未来时间造成毛刺。
-4. 保留：相位对齐、健康模板冻结、故障期不学习、恢复后再学习、预测上下界、异常指标。
+1. 修复 lag=0 但预测线仍然相位漂移的问题。
+2. 在谷底相位对齐基础上，增加 phase-lock 相位锁定。
+3. 每轮使用最近 1~2 个周期真实数据，搜索最佳 period + phase_origin。
+4. 预测起点仍然锚定最后一个真实点 last_real_ts，避免写入延迟。
+5. 保留健康模板冻结逻辑：异常期间不学习故障数据。
+6. 保留预测上下界和异常指标。
 """
 
 import json
@@ -40,13 +42,12 @@
 # =============================================================================
 
 VM_URL = "http://localhost:8428"
-STATE_FILE = "/tmp/protoforge_predictor_state_v9.json"
+STATE_FILE = "/tmp/protoforge_predictor_state_v10.json"
 
 HISTORY_MINUTES = 30
 HORIZON_SECONDS = 120
 POLL_INTERVAL = 30
 
-# 实际每轮写入的预测长度。不要大于 POLL_INTERVAL，否则容易出现预测窗口重叠。
 WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
 
 QUERY_STEP = "1s"
@@ -58,26 +59,29 @@
 MIN_FULL_CYCLES_FOR_TEMPLATE = 3
 MAX_CYCLES_FOR_TEMPLATE = 6
 
-DETECT_WINDOW_SECONDS = 15
+DETECT_WINDOW_SECONDS = 20
 RECOVERY_MIN_SECONDS = 60
 
-HEALTHY_EMA_ALPHA = 0.12
-RECOVERY_EMA_ALPHA = 0.30
+HEALTHY_EMA_ALPHA = 0.10
+RECOVERY_EMA_ALPHA = 0.25
 
 OUTSIDE_RATIO_THRESHOLD = 0.60
-PHASE_SEARCH_RATIO = 0.15
+
 VALLEY_QUANTILE = 45
 
-# 关键修复：预测时间轴锚定真实数据最后一个点。
-# True：预测从 last_real_ts + 1 开始，适合 Grafana 与真实曲线对齐展示。
-# False：预测从当前系统时间 + 1 开始，适合只看纯未来预测，但容易与有采集延迟的真实数据错位。
-ALIGN_PREDICTION_TO_LAST_REAL_TS = True
+# phase-lock 配置
+PHASE_LOCK_MIN_WINDOW_SECONDS = 45
+PHASE_LOCK_MAX_WINDOW_SECONDS = 180
+PHASE_LOCK_PERIOD_SEARCH_RATIO = 0.12
+PHASE_LOCK_ORIGIN_SEARCH_RATIO = 0.35
+PHASE_LOCK_PERIOD_STEP = 1
+PHASE_LOCK_ORIGIN_STEP = 1
 
-# 如果 last_real_ts 距离当前系统时间太久，说明采集链路可能断了，跳过预测，避免用陈旧数据继续画未来线。
+# 真实数据延迟超过这个值，就不继续预测
 MAX_DATA_LAG_SECONDS = 180
 
-# 真实数据至少推进多少秒，才写入新预测，避免同一段未来时间被反复写入。
-MIN_REAL_ADVANCE_SECONDS = 1
+# 预测锚定最后一个真实点
+ALIGN_PREDICTION_TO_LAST_REAL_TS = True
 
 
 # =============================================================================
@@ -130,7 +134,7 @@
 ]
 
 EXTRA_PREDICT_LABELS = {
-    "forecast": "phase_aligned_health_v9",
+    "forecast": "phase_locked_health_v10",
     "source": "protoforge",
 }
 
@@ -157,9 +161,6 @@ class BaselineState:
 
 
 BASELINE_STATES: Dict[str, BaselineState] = {}
-
-# 记录每条序列最后一次使用的真实数据时间戳，而不是预测写到哪里。
-# 这样不会把预测不断推向更远的未来。
 LAST_REAL_TS_WRITTEN: Dict[str, int] = {}
 
 
@@ -197,8 +198,6 @@ def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[floa
         return [], []
 
     values = result[0].get("values", [])
-    if not values:
-        return [], []
 
     ts = []
     ys = []
@@ -244,6 +243,7 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.
         return np.array([]), np.array([])
 
     sorted_items = sorted(data.items(), key=lambda x: x[0])
+
     ts_clean = np.array([x[0] for x in sorted_items], dtype=float)
     ys_clean = np.array([x[1] for x in sorted_items], dtype=float)
 
@@ -263,7 +263,7 @@ def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.
 
 
 # =============================================================================
-# 周期估计与谷底检测
+# 周期估计
 # =============================================================================
 
 def moving_average(arr: np.ndarray, window: int) -> np.ndarray:
@@ -355,6 +355,10 @@ def estimate_period_rough(ys_arr: np.ndarray) -> int:
     return int(period)
 
 
+# =============================================================================
+# 谷底检测与模板构建
+# =============================================================================
+
 def find_valley_indices(
     ts_grid: np.ndarray,
     ys_grid: np.ndarray,
@@ -366,6 +370,7 @@ def find_valley_indices(
         return []
 
     period = max(3, int(expected_period))
+
     smooth_window = max(3, int(round(period * 0.08)))
     smooth_window = min(smooth_window, 21)
 
@@ -384,6 +389,7 @@ def find_valley_indices(
 
     if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE:
         candidates = []
+
         for i in range(1, n - 1):
             if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]:
                 candidates.append(i)
@@ -451,10 +457,6 @@ def detect_period_and_valleys(
     return int(period), valleys
 
 
-# =============================================================================
-# 相位对齐模板
-# =============================================================================
-
 def build_template_from_valleys(
     ts_grid: np.ndarray,
     ys_grid: np.ndarray,
@@ -546,7 +548,7 @@ def build_current_baseline(
 
 
 # =============================================================================
-# 预测与模板合并
+# 模板预测与重采样
 # =============================================================================
 
 def circular_template_value(template: np.ndarray, phase: float) -> float:
@@ -556,6 +558,7 @@ def circular_template_value(template: np.ndarray, phase: float) -> float:
         return 0.0
 
     phase = float(phase) % period
+
     i0 = int(math.floor(phase)) % period
     i1 = (i0 + 1) % period
     frac = phase - math.floor(phase)
@@ -563,46 +566,77 @@ def circular_template_value(template: np.ndarray, phase: float) -> float:
     return float((1.0 - frac) * template[i0] + frac * template[i1])
 
 
-def predict_with_origin(
-    state: BaselineState,
+def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
+    old_period = len(old_template)
+
+    if old_period == new_period:
+        return old_template.astype(float)
+
+    if old_period <= 1 or new_period <= 1:
+        return np.full(new_period, float(np.mean(old_template)), dtype=float)
+
+    old_x = np.linspace(0.0, 1.0, old_period, endpoint=False)
+    new_x = np.linspace(0.0, 1.0, new_period, endpoint=False)
+
+    old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0])
+    old_y_ext = np.concatenate([old_template, old_template, old_template])
+
+    return np.interp(new_x, old_x_ext, old_y_ext).astype(float)
+
+
+def predict_template_values(
+    template: np.ndarray,
+    period: int,
+    phase_origin_ts: int,
     ts_list: List[int],
-    phase_origin_ts: Optional[int] = None,
 ) -> np.ndarray:
-    template = np.array(state.template, dtype=float)
-    period = int(state.period)
-
-    if period <= 1 or len(template) != period:
+    if period <= 1:
         return np.zeros(len(ts_list), dtype=float)
 
-    origin = int(state.phase_origin_ts if phase_origin_ts is None else phase_origin_ts)
+    if len(template) != period:
+        template = resample_template(template, period)
+
     values = []
 
     for ts in ts_list:
-        phase = (int(ts) - origin) % period
+        phase = (int(ts) - int(phase_origin_ts)) % period
         values.append(circular_template_value(template, phase))
 
     return np.array(values, dtype=float)
 
 
-def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
-    old_period = len(old_template)
+def predict_with_state(state: BaselineState, ts_list: List[int]) -> np.ndarray:
+    template = np.array(state.template, dtype=float)
 
-    if old_period == new_period:
-        return old_template.astype(float)
+    return predict_template_values(
+        template=template,
+        period=int(state.period),
+        phase_origin_ts=int(state.phase_origin_ts),
+        ts_list=ts_list,
+    )
 
-    if old_period <= 1 or new_period <= 1:
-        return np.full(new_period, float(np.mean(old_template)), dtype=float)
 
-    old_x = np.linspace(0.0, 1.0, old_period, endpoint=False)
-    new_x = np.linspace(0.0, 1.0, new_period, endpoint=False)
+def normalize_origin_near(origin: int, period: int, near_ts: int) -> int:
+    if period <= 1:
+        return origin
 
-    old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0])
-    old_y_ext = np.concatenate([old_template, old_template, old_template])
+    origin = int(origin)
+    period = int(period)
+    near_ts = int(near_ts)
 
-    return np.interp(new_x, old_x_ext, old_y_ext).astype(float)
+    while origin + period <= near_ts:
+        origin += period
 
+    while origin > near_ts:
+        origin -= period
 
-def align_new_template_to_old(old_template: np.ndarray, new_template: np.ndarray) -> np.ndarray:
+    return origin
+
+
+def align_new_template_to_old(
+    old_template: np.ndarray,
+    new_template: np.ndarray,
+) -> np.ndarray:
     if len(old_template) != len(new_template):
         old_template = resample_template(old_template, len(new_template))
 
@@ -629,23 +663,117 @@ def align_new_template_to_old(old_template: np.ndarray, new_template: np.ndarray
     return best_template.astype(float)
 
 
-def merge_template(old_template: np.ndarray, new_template: np.ndarray, alpha: float) -> np.ndarray:
+def merge_template(
+    old_template: np.ndarray,
+    new_template: np.ndarray,
+    alpha: float,
+) -> np.ndarray:
     alpha = float(np.clip(alpha, 0.0, 1.0))
 
     if len(old_template) != len(new_template):
         old_template = resample_template(old_template, len(new_template))
 
     new_template = align_new_template_to_old(old_template, new_template)
+
     merged = (1.0 - alpha) * old_template + alpha * new_template
 
     return merged.astype(float)
 
 
+# =============================================================================
+# Phase Lock
+# =============================================================================
+
+def phase_lock_recent(
+    state: BaselineState,
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+) -> Tuple[int, int, np.ndarray, float]:
+    base_period = int(state.period)
+    base_origin = int(state.phase_origin_ts)
+    base_template = np.array(state.template, dtype=float)
+
+    if base_period <= 1 or len(base_template) <= 1:
+        ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
+        pred = predict_with_state(state, ts_recent)
+        actual = ys_grid[-len(ts_recent):].astype(float)
+        mae = float(np.mean(np.abs(actual - pred))) if len(actual) else 0.0
+        return base_period, base_origin, pred, mae
+
+    window_seconds = max(
+        PHASE_LOCK_MIN_WINDOW_SECONDS,
+        min(PHASE_LOCK_MAX_WINDOW_SECONDS, int(base_period * 2)),
+    )
+
+    cutoff = ts_grid[-1] - window_seconds
+    mask = ts_grid >= cutoff
+
+    ts_recent_arr = ts_grid[mask].astype(int)
+    actual = ys_grid[mask].astype(float)
+
+    if len(ts_recent_arr) < max(10, DETECT_WINDOW_SECONDS):
+        ts_recent_arr = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int)
+        actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float)
+
+    ts_recent = ts_recent_arr.tolist()
+    last_ts = int(ts_recent[-1])
+
+    p_min = max(int(MIN_PERIOD_SECONDS), int(round(base_period * (1.0 - PHASE_LOCK_PERIOD_SEARCH_RATIO))))
+    p_max = min(int(MAX_PERIOD_SECONDS), int(round(base_period * (1.0 + PHASE_LOCK_PERIOD_SEARCH_RATIO))))
+
+    if p_max < p_min:
+        p_min = p_max = base_period
+
+    best_period = base_period
+    best_origin = normalize_origin_near(base_origin, base_period, last_ts)
+    best_template = resample_template(base_template, best_period)
+    best_pred = predict_template_values(best_template, best_period, best_origin, ts_recent)
+    best_mae = float(np.mean(np.abs(actual - best_pred)))
+
+    for period in range(p_min, p_max + 1, PHASE_LOCK_PERIOD_STEP):
+        template = resample_template(base_template, period)
+        center_origin = normalize_origin_near(base_origin, period, last_ts)
+
+        origin_shift = max(2, int(round(period * PHASE_LOCK_ORIGIN_SEARCH_RATIO)))
+
+        for shift in range(-origin_shift, origin_shift + 1, PHASE_LOCK_ORIGIN_STEP):
+            origin = center_origin + shift
+
+            pred = predict_template_values(
+                template=template,
+                period=period,
+                phase_origin_ts=origin,
+                ts_list=ts_recent,
+            )
+
+            mae = float(np.mean(np.abs(actual - pred)))
+
+            # 轻微惩罚周期变化，避免过拟合抖动
+            penalty = abs(period - base_period) * 0.5
+            score = mae + penalty
+
+            best_score = best_mae + abs(best_period - base_period) * 0.5
+
+            if score < best_score:
+                best_period = period
+                best_origin = origin
+                best_pred = pred
+                best_mae = mae
+
+    best_origin = normalize_origin_near(best_origin, best_period, last_ts)
+
+    return int(best_period), int(best_origin), best_pred, float(best_mae)
+
+
 # =============================================================================
 # 异常检测
 # =============================================================================
 
-def calc_threshold(pred: np.ndarray, abs_threshold: float, rel_threshold: float) -> np.ndarray:
+def calc_threshold(
+    pred: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> np.ndarray:
     return np.maximum(abs_threshold, np.abs(pred) * rel_threshold)
 
 
@@ -655,33 +783,8 @@ def calc_bounds(
     rel_threshold: float,
 ) -> Tuple[np.ndarray, np.ndarray]:
     threshold = calc_threshold(pred, abs_threshold, rel_threshold)
-    return pred - threshold, pred + threshold
-
-
-def find_best_phase_origin_for_recent(
-    state: BaselineState,
-    ts_recent: List[int],
-    actual: np.ndarray,
-) -> Tuple[int, np.ndarray, float]:
-    period = int(state.period)
-    base_origin = int(state.phase_origin_ts)
-    max_shift = max(1, int(round(period * PHASE_SEARCH_RATIO)))
-
-    best_origin = base_origin
-    best_pred = predict_with_origin(state, ts_recent, base_origin)
-    best_mae = float(np.mean(np.abs(actual - best_pred)))
-
-    for shift in range(-max_shift, max_shift + 1):
-        origin = base_origin + shift
-        pred = predict_with_origin(state, ts_recent, origin)
-        mae = float(np.mean(np.abs(actual - pred)))
 
-        if mae < best_mae:
-            best_mae = mae
-            best_origin = origin
-            best_pred = pred
-
-    return best_origin, best_pred, best_mae
+    return pred - threshold, pred + threshold
 
 
 def detect_anomaly(
@@ -690,36 +793,50 @@ def detect_anomaly(
     ys_grid: np.ndarray,
     abs_threshold: float,
     rel_threshold: float,
-) -> Tuple[bool, float, float, float, int]:
-    if len(ys_grid) < DETECT_WINDOW_SECONDS:
-        return False, 0.0, 0.0, 0.0, int(state.phase_origin_ts)
-
-    ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
-    actual = ys_grid[-DETECT_WINDOW_SECONDS:].astype(float)
-
-    best_origin, pred, _ = find_best_phase_origin_for_recent(
+) -> Tuple[bool, float, float, float, int, int]:
+    best_period, best_origin, pred_recent, _ = phase_lock_recent(
         state=state,
-        ts_recent=ts_recent,
-        actual=actual,
+        ts_grid=ts_grid,
+        ys_grid=ys_grid,
     )
 
-    threshold = calc_threshold(pred, abs_threshold, rel_threshold)
-    abs_err = np.abs(actual - pred)
+    recent_len = len(pred_recent)
+
+    if recent_len <= 0:
+        return False, 0.0, 0.0, 0.0, best_period, best_origin
+
+    actual = ys_grid[-recent_len:].astype(float)
+
+    threshold = calc_threshold(pred_recent, abs_threshold, rel_threshold)
+
+    abs_err = np.abs(actual - pred_recent)
     outside = abs_err > threshold
 
     outside_ratio = float(np.mean(outside))
     mean_abs_err = float(np.mean(abs_err))
-    mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1.0)))
+    mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred_recent), 1.0)))
+
     is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD
 
-    return is_anomaly, outside_ratio, mean_abs_err, mean_rel_err, int(best_origin)
+    return (
+        is_anomaly,
+        outside_ratio,
+        mean_abs_err,
+        mean_rel_err,
+        int(best_period),
+        int(best_origin),
+    )
 
 
 # =============================================================================
 # 健康基线状态管理
 # =============================================================================
 
-def create_initial_state(ts_grid: np.ndarray, ys_grid: np.ndarray, now_sec: int) -> Optional[BaselineState]:
+def create_initial_state(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    now_sec: int,
+) -> Optional[BaselineState]:
     baseline = build_current_baseline(ts_grid, ys_grid)
 
     if baseline is None:
@@ -740,6 +857,26 @@ def create_initial_state(ts_grid: np.ndarray, ys_grid: np.ndarray, now_sec: int)
     )
 
 
+def apply_phase_lock_to_state(
+    state: BaselineState,
+    best_period: int,
+    best_origin: int,
+) -> None:
+    best_period = int(best_period)
+
+    if best_period <= 1:
+        return
+
+    template = np.array(state.template, dtype=float)
+
+    if len(template) != best_period:
+        template = resample_template(template, best_period)
+
+    state.period = best_period
+    state.phase_origin_ts = int(best_origin)
+    state.template = template.astype(float).tolist()
+
+
 def maybe_update_state(
     key: str,
     ts_grid: np.ndarray,
@@ -772,7 +909,14 @@ def maybe_update_state(
     elapsed = min(elapsed, POLL_INTERVAL * 2)
     state.last_seen_ts = now_sec
 
-    is_anom, outside_ratio, mean_abs_err, mean_rel_err, best_origin = detect_anomaly(
+    (
+        is_anomaly,
+        outside_ratio,
+        mean_abs_err,
+        mean_rel_err,
+        best_period,
+        best_origin,
+    ) = detect_anomaly(
         state=state,
         ts_grid=ts_grid,
         ys_grid=ys_grid,
@@ -780,9 +924,10 @@ def maybe_update_state(
         rel_threshold=rel_threshold,
     )
 
-    if is_anom:
+    if is_anomaly:
         state.status = BASELINE_STATUS_ANOMALY
         state.clean_seconds = 0
+
         BASELINE_STATES[key] = state
 
         logger.warning(
@@ -795,13 +940,17 @@ def maybe_update_state(
 
         return state, True, outside_ratio, mean_abs_err, mean_rel_err
 
+    old_period = int(state.period)
     old_origin = int(state.phase_origin_ts)
-    state.phase_origin_ts = int(best_origin)
 
-    if abs(state.phase_origin_ts - old_origin) >= 1:
-        logger.debug(
-            "相位校正 key=%s origin %s -> %s",
+    apply_phase_lock_to_state(state, best_period, best_origin)
+
+    if old_period != state.period or old_origin != state.phase_origin_ts:
+        logger.info(
+            "phase-lock key=%s period %s -> %s origin %s -> %s",
             key,
+            old_period,
+            state.period,
             datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"),
             datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
         )
@@ -809,9 +958,15 @@ def maybe_update_state(
     if state.status == BASELINE_STATUS_ANOMALY:
         state.status = BASELINE_STATUS_RECOVERING
         state.clean_seconds = elapsed
+
         BASELINE_STATES[key] = state
 
-        logger.info("异常开始恢复 key=%s clean_seconds=%ss", key, state.clean_seconds)
+        logger.info(
+            "异常开始恢复 key=%s clean_seconds=%ss",
+            key,
+            state.clean_seconds,
+        )
+
         return state, False, outside_ratio, mean_abs_err, mean_rel_err
 
     if state.status == BASELINE_STATUS_RECOVERING:
@@ -834,17 +989,27 @@ def maybe_update_state(
         int(state.period) * MAX_CYCLES_FOR_TEMPLATE,
     )
 
-    baseline = build_current_baseline(ts_grid=ts_grid, ys_grid=ys_grid, tail_seconds=tail_seconds)
+    baseline = build_current_baseline(
+        ts_grid=ts_grid,
+        ys_grid=ys_grid,
+        tail_seconds=tail_seconds,
+    )
 
     if baseline is None:
         BASELINE_STATES[key] = state
         return state, False, outside_ratio, mean_abs_err, mean_rel_err
 
     new_period, new_origin, new_template = baseline
+
     old_template = np.array(state.template, dtype=float)
+
     alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA
 
-    merged = merge_template(old_template=old_template, new_template=new_template, alpha=alpha)
+    merged = merge_template(
+        old_template=old_template,
+        new_template=new_template,
+        alpha=alpha,
+    )
 
     state.period = int(new_period)
     state.phase_origin_ts = int(new_origin)
@@ -878,7 +1043,12 @@ def maybe_update_state(
 # =============================================================================
 
 def prom_escape_label_value(value: str) -> str:
-    return str(value).replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
+    return (
+        str(value)
+        .replace("\\", "\\\\")
+        .replace("\n", "\\n")
+        .replace('"', '\\"')
+    )
 
 
 def labels_to_str(labels: Dict[str, str]) -> str:
@@ -926,7 +1096,9 @@ def write_series(
         resp = requests.post(
             f"{VM_URL}/api/v1/import/prometheus",
             data=payload.encode("utf-8"),
-            headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
+            headers={
+                "Content-Type": "text/plain; version=0.0.4; charset=utf-8",
+            },
             timeout=10,
         )
         resp.raise_for_status()
@@ -951,17 +1123,57 @@ def write_prediction_bundle(
     mean_rel_err: float,
     event_ts: int,
 ) -> bool:
-    ok1 = write_series(pred_metric, labels, ts_future, pred_values.astype(float).tolist())
-    ok2 = write_series(f"{pred_metric}_lower", labels, ts_future, lower_values.astype(float).tolist())
-    ok3 = write_series(f"{pred_metric}_upper", labels, ts_future, upper_values.astype(float).tolist())
+    ok1 = write_series(
+        metric_name=pred_metric,
+        labels=labels,
+        ts_list=ts_future,
+        values=pred_values.astype(float).tolist(),
+    )
+
+    ok2 = write_series(
+        metric_name=f"{pred_metric}_lower",
+        labels=labels,
+        ts_list=ts_future,
+        values=lower_values.astype(float).tolist(),
+    )
+
+    ok3 = write_series(
+        metric_name=f"{pred_metric}_upper",
+        labels=labels,
+        ts_list=ts_future,
+        values=upper_values.astype(float).tolist(),
+    )
 
     anomaly_labels = dict(labels)
     anomaly_labels["type"] = "prediction_deviation"
 
-    ok4 = write_series(anomaly_metric, anomaly_labels, [event_ts], [1.0 if is_anomaly else 0.0])
-    ok5 = write_series(f"{anomaly_metric}_outside_ratio", anomaly_labels, [event_ts], [outside_ratio])
-    ok6 = write_series(f"{anomaly_metric}_mean_abs_error", anomaly_labels, [event_ts], [mean_abs_err])
-    ok7 = write_series(f"{anomaly_metric}_mean_rel_error", anomaly_labels, [event_ts], [mean_rel_err])
+    ok4 = write_series(
+        metric_name=anomaly_metric,
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[1.0 if is_anomaly else 0.0],
+    )
+
+    ok5 = write_series(
+        metric_name=f"{anomaly_metric}_outside_ratio",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[outside_ratio],
+    )
+
+    ok6 = write_series(
+        metric_name=f"{anomaly_metric}_mean_abs_error",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[mean_abs_err],
+    )
+
+    ok7 = write_series(
+        metric_name=f"{anomaly_metric}_mean_rel_error",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[mean_rel_err],
+    )
 
     return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7
 
@@ -970,7 +1182,9 @@ def write_prediction_bundle(
 # 标签解析
 # =============================================================================
 
-_LABEL_PATTERN = re.compile(r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*')
+_LABEL_PATTERN = re.compile(
+    r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*'
+)
 
 
 def parse_labels_from_query(query: str) -> Dict[str, str]:
@@ -987,7 +1201,14 @@ def parse_labels_from_query(query: str) -> Dict[str, str]:
     for match in _LABEL_PATTERN.finditer(label_part):
         key = match.group(1)
         value = match.group(2)
-        value = value.replace('\\"', '"').replace("\\n", "\n").replace("\\\\", "\\")
+
+        value = (
+            value
+            .replace('\\"', '"')
+            .replace("\\n", "\n")
+            .replace("\\\\", "\\")
+        )
+
         labels[key] = value
 
     return labels
@@ -1042,7 +1263,12 @@ def load_state() -> None:
             states[key] = BaselineState(**value)
 
         BASELINE_STATES = states
-        logger.info("已加载预测状态文件 %s，状态数量=%d", STATE_FILE, len(BASELINE_STATES))
+
+        logger.info(
+            "已加载预测状态文件 %s，状态数量=%d",
+            STATE_FILE,
+            len(BASELINE_STATES),
+        )
 
     except Exception as e:
         logger.warning("加载预测状态文件失败，将重新学习: %s", e)
@@ -1069,10 +1295,14 @@ def save_state() -> None:
 
 
 # =============================================================================
-# 时间轴选择
+# 时间轴
 # =============================================================================
 
-def build_prediction_timestamps(key: str, last_real_ts: int, now_sec: int) -> Optional[List[int]]:
+def build_prediction_timestamps(
+    key: str,
+    last_real_ts: int,
+    now_sec: int,
+) -> Optional[List[int]]:
     data_lag = now_sec - last_real_ts
 
     if data_lag > MAX_DATA_LAG_SECONDS:
@@ -1086,24 +1316,24 @@ def build_prediction_timestamps(key: str, last_real_ts: int, now_sec: int) -> Op
 
     last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key)
 
-    if last_written_real_ts is not None:
-        advance = last_real_ts - int(last_written_real_ts)
-
-        if advance < MIN_REAL_ADVANCE_SECONDS:
-            logger.info(
-                "真实数据时间戳未推进，跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s",
-                key,
-                last_real_ts,
-                last_written_real_ts,
-            )
-            return None
+    if last_written_real_ts is not None and last_real_ts <= int(last_written_real_ts):
+        logger.info(
+            "真实数据时间戳未推进，跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s",
+            key,
+            last_real_ts,
+            last_written_real_ts,
+        )
+        return None
 
     if ALIGN_PREDICTION_TO_LAST_REAL_TS:
         base_ts = last_real_ts
     else:
         base_ts = now_sec
 
-    return [base_ts + i + 1 for i in range(WRITE_HORIZON_SECONDS)]
+    return [
+        base_ts + i + 1
+        for i in range(WRITE_HORIZON_SECONDS)
+    ]
 
 
 # =============================================================================
@@ -1123,17 +1353,28 @@ def run_once() -> None:
         ts, ys = fetch_history(query)
 
         if len(ys) < MIN_POINTS:
-            logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
+            logger.info(
+                "[%s] %s 数据不足（%d 点），跳过",
+                now_str,
+                query,
+                len(ys),
+            )
             continue
 
         ts_grid, ys_grid = normalize_history(ts, ys)
 
         if len(ys_grid) < MIN_POINTS:
-            logger.info("[%s] %s 清洗后数据不足（%d 点），跳过", now_str, query, len(ys_grid))
+            logger.info(
+                "[%s] %s 清洗后数据不足（%d 点），跳过",
+                now_str,
+                query,
+                len(ys_grid),
+            )
             continue
 
         base_labels = parse_labels_from_query(query)
         write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
+
         key = series_key(pred_metric, write_labels)
 
         state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state(
@@ -1145,7 +1386,11 @@ def run_once() -> None:
         )
 
         if state is None:
-            logger.info("[%s] %s 暂无可用健康模板，等待学习", now_str, query)
+            logger.info(
+                "[%s] %s 暂无可用健康模板，等待学习",
+                now_str,
+                query,
+            )
             continue
 
         now_sec = int(time.time())
@@ -1161,7 +1406,8 @@ def run_once() -> None:
         if not ts_future:
             continue
 
-        pred_values = predict_with_origin(state, ts_future)
+        pred_values = predict_with_state(state, ts_future)
+
         lower_values, upper_values = calc_bounds(
             pred=pred_values,
             abs_threshold=abs_threshold,
@@ -1184,7 +1430,11 @@ def run_once() -> None:
         )
 
         if not ok:
-            logger.error("[%s] %s 写入预测数据失败", now_str, query)
+            logger.error(
+                "[%s] %s 写入预测数据失败",
+                now_str,
+                query,
+            )
             continue
 
         LAST_REAL_TS_WRITTEN[key] = last_real_ts
@@ -1234,4 +1484,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From 79e9f9b080e3f5fc4a284ab1422d3a54aacaff0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Thu, 21 May 2026 21:23:40 +0800
Subject: [PATCH 26/31] feat(pridict_v4): update pridict v4 version

---
 ai/pridict_v4.py | 1604 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1604 insertions(+)
 create mode 100644 ai/pridict_v4.py

diff --git a/ai/pridict_v4.py b/ai/pridict_v4.py
new file mode 100644
index 0000000..774ad3a
--- /dev/null
+++ b/ai/pridict_v4.py
@@ -0,0 +1,1604 @@
+# -*- coding: utf-8 -*-
+"""
+ProtoForge Predictor v11
+
+核心能力：
+1. feed_rate / spindle_speed / spindle_current 使用 phase-lock 点预测。
+2. vibration_x / vibration_y / vibration_z 使用 phase-band 预测带。
+3. vibration 类指标不再追求单点完全贴合，而是输出：
+   - xxx_predicted        中位数预测线
+   - xxx_predicted_upper  正常上边界
+   - xxx_predicted_lower  正常下边界
+4. 预测起点锚定最后一个真实点 last_real_ts，避免时间错位。
+5. 异常期间冻结健康模板，不学习故障数据。
+6. 故障恢复后等待稳定，再恢复模板学习。
+"""
+
+import json
+import logging
+import math
+import os
+import re
+import time
+from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import requests
+
+
+# =============================================================================
+# 日志配置
+# =============================================================================
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# 基础配置
+# =============================================================================
+
+VM_URL = "http://localhost:8428"
+STATE_FILE = "/tmp/protoforge_predictor_state_v11.json"
+
+HISTORY_MINUTES = 30
+HORIZON_SECONDS = 120
+POLL_INTERVAL = 30
+
+WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
+
+QUERY_STEP = "1s"
+MIN_POINTS = 120
+
+MIN_PERIOD_SECONDS = 5
+MAX_PERIOD_SECONDS = 3600
+
+MIN_FULL_CYCLES_FOR_TEMPLATE = 3
+MAX_CYCLES_FOR_TEMPLATE = 8
+
+DETECT_WINDOW_SECONDS = 20
+RECOVERY_MIN_SECONDS = 60
+
+HEALTHY_EMA_ALPHA = 0.10
+RECOVERY_EMA_ALPHA = 0.25
+
+OUTSIDE_RATIO_THRESHOLD = 0.60
+
+VALLEY_QUANTILE = 45
+
+MAX_DATA_LAG_SECONDS = 180
+
+PHASE_LOCK_MIN_WINDOW_SECONDS = 45
+PHASE_LOCK_MAX_WINDOW_SECONDS = 180
+PHASE_LOCK_PERIOD_SEARCH_RATIO = 0.12
+PHASE_LOCK_ORIGIN_SEARCH_RATIO = 0.35
+PHASE_LOCK_PERIOD_STEP = 1
+PHASE_LOCK_ORIGIN_STEP = 1
+
+
+# =============================================================================
+# 指标配置
+# =============================================================================
+
+PREDICT_TARGETS = [
+    {
+        "query": 'feed_rate{device_id="fanuc-cnc"}',
+        "pred_metric": "feed_rate_predicted",
+        "anomaly_metric": "feed_rate_anomaly",
+        "strategy": "phase_point",
+        "abs_threshold": 400.0,
+        "rel_threshold": 0.25,
+        "smooth_window": 1,
+    },
+    {
+        "query": 'spindle_speed{device_id="fanuc-cnc"}',
+        "pred_metric": "spindle_speed_predicted",
+        "anomaly_metric": "spindle_speed_anomaly",
+        "strategy": "phase_point",
+        "abs_threshold": 500.0,
+        "rel_threshold": 0.25,
+        "smooth_window": 1,
+    },
+    {
+        "query": 'spindle_current{device_id="fanuc-cnc"}',
+        "pred_metric": "spindle_current_predicted",
+        "anomaly_metric": "spindle_current_anomaly",
+        "strategy": "phase_point",
+        "abs_threshold": 5.0,
+        "rel_threshold": 0.25,
+        "smooth_window": 1,
+    },
+    {
+        "query": 'vibration_x{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_x_predicted",
+        "anomaly_metric": "vibration_x_anomaly",
+        "strategy": "phase_band",
+        "abs_threshold": 0.12,
+        "rel_threshold": 0.35,
+        "smooth_window": 5,
+        "band_low_q": 10,
+        "band_high_q": 90,
+        "band_pad_abs": 0.06,
+    },
+    {
+        "query": 'vibration_y{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_y_predicted",
+        "anomaly_metric": "vibration_y_anomaly",
+        "strategy": "phase_band",
+        "abs_threshold": 0.12,
+        "rel_threshold": 0.35,
+        "smooth_window": 5,
+        "band_low_q": 10,
+        "band_high_q": 90,
+        "band_pad_abs": 0.06,
+    },
+    {
+        "query": 'vibration_z{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_z_predicted",
+        "anomaly_metric": "vibration_z_anomaly",
+        "strategy": "phase_band",
+        "abs_threshold": 0.12,
+        "rel_threshold": 0.35,
+        "smooth_window": 5,
+        "band_low_q": 10,
+        "band_high_q": 90,
+        "band_pad_abs": 0.06,
+    },
+]
+
+EXTRA_PREDICT_LABELS = {
+    "forecast": "phase_band_health_v11",
+    "source": "protoforge",
+}
+
+BASELINE_STATUS_HEALTHY = "healthy"
+BASELINE_STATUS_ANOMALY = "anomaly"
+BASELINE_STATUS_RECOVERING = "recovering"
+
+
+# =============================================================================
+# 状态结构
+# =============================================================================
+
+@dataclass
+class BaselineState:
+    period: int
+    phase_origin_ts: int
+    template: List[float]
+    lower_template: List[float]
+    upper_template: List[float]
+    strategy: str
+    status: str
+    clean_seconds: int
+    last_update_ts: int
+    last_seen_ts: int
+    y_min: float
+    y_max: float
+
+
+BASELINE_STATES: Dict[str, BaselineState] = {}
+LAST_REAL_TS_WRITTEN: Dict[str, int] = {}
+
+
+# =============================================================================
+# VictoriaMetrics 读取
+# =============================================================================
+
+def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]:
+    now = datetime.now()
+    start = now - timedelta(minutes=minutes)
+
+    try:
+        resp = requests.get(
+            f"{VM_URL}/api/v1/query_range",
+            params={
+                "query": query,
+                "start": start.timestamp(),
+                "end": now.timestamp(),
+                "step": QUERY_STEP,
+            },
+            timeout=10,
+        )
+        resp.raise_for_status()
+    except requests.RequestException as e:
+        logger.error("拉取数据失败 query=%s: %s", query, e)
+        return [], []
+
+    try:
+        result = resp.json().get("data", {}).get("result", [])
+    except Exception as e:
+        logger.error("解析 VM 返回失败 query=%s: %s", query, e)
+        return [], []
+
+    if not result:
+        return [], []
+
+    values = result[0].get("values", [])
+
+    ts = []
+    ys = []
+
+    for item in values:
+        if len(item) < 2:
+            continue
+
+        try:
+            t = float(item[0])
+            y = float(item[1])
+        except Exception:
+            continue
+
+        if not math.isfinite(t) or not math.isfinite(y):
+            continue
+
+        ts.append(t)
+        ys.append(y)
+
+    return ts, ys
+
+
+def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]:
+    if not ts or not ys or len(ts) != len(ys):
+        return np.array([]), np.array([])
+
+    data = {}
+
+    for t, y in zip(ts, ys):
+        try:
+            sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(sec) or not math.isfinite(val):
+            continue
+
+        data[sec] = val
+
+    if not data:
+        return np.array([]), np.array([])
+
+    sorted_items = sorted(data.items(), key=lambda x: x[0])
+
+    ts_clean = np.array([x[0] for x in sorted_items], dtype=float)
+    ys_clean = np.array([x[1] for x in sorted_items], dtype=float)
+
+    if len(ts_clean) < 2:
+        return ts_clean, ys_clean
+
+    start_sec = int(ts_clean[0])
+    end_sec = int(ts_clean[-1])
+
+    if end_sec <= start_sec:
+        return ts_clean, ys_clean
+
+    ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float)
+    ys_grid = np.interp(ts_grid, ts_clean, ys_clean)
+
+    return ts_grid, ys_grid
+
+
+# =============================================================================
+# 平滑与预处理
+# =============================================================================
+
+def rolling_median(arr: np.ndarray, window: int) -> np.ndarray:
+    if window <= 1 or len(arr) < window:
+        return arr.astype(float)
+
+    if window % 2 == 0:
+        window += 1
+
+    pad = window // 2
+    padded = np.pad(arr.astype(float), (pad, pad), mode="edge")
+
+    result = []
+
+    for i in range(len(arr)):
+        result.append(float(np.median(padded[i:i + window])))
+
+    return np.array(result, dtype=float)
+
+
+def moving_average(arr: np.ndarray, window: int) -> np.ndarray:
+    if window <= 1 or len(arr) < window:
+        return arr.astype(float)
+
+    if window % 2 == 0:
+        window += 1
+
+    kernel = np.ones(window, dtype=float) / window
+    pad = window // 2
+    padded = np.pad(arr.astype(float), (pad, pad), mode="edge")
+
+    return np.convolve(padded, kernel, mode="valid")
+
+
+def preprocess_values(ys_grid: np.ndarray, target: Dict) -> np.ndarray:
+    strategy = target.get("strategy", "phase_point")
+    smooth_window = int(target.get("smooth_window", 1))
+
+    if strategy == "phase_band":
+        return rolling_median(ys_grid, smooth_window)
+
+    if smooth_window > 1:
+        return moving_average(ys_grid, smooth_window)
+
+    return ys_grid.astype(float)
+
+
+# =============================================================================
+# 周期估计
+# =============================================================================
+
+def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
+    n = len(ys_arr)
+
+    if n < 8:
+        return 60.0
+
+    centered = ys_arr - np.mean(ys_arr)
+
+    if np.allclose(centered, 0):
+        return 60.0
+
+    fft_vals = np.fft.rfft(centered)
+    freqs = np.fft.rfftfreq(n, d=1.0)
+
+    if len(freqs) <= 1:
+        return 60.0
+
+    power = np.abs(fft_vals[1:])
+
+    if len(power) == 0 or np.max(power) <= 0:
+        return 60.0
+
+    dominant_idx = int(np.argmax(power)) + 1
+    dominant_freq = float(freqs[dominant_idx])
+
+    if dominant_freq <= 0:
+        return 60.0
+
+    period = 1.0 / dominant_freq
+
+    return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float:
+    n = len(ys_arr)
+
+    if n < 20:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    centered = ys_arr - np.mean(ys_arr)
+
+    if np.allclose(centered, 0):
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    corr = np.correlate(centered, centered, mode="full")[n - 1:]
+
+    p0 = int(round(init_period))
+    left = max(int(MIN_PERIOD_SECONDS), int(max(2, p0 * 0.7)))
+    right = min(n // 2, int(max(left + 1, p0 * 1.3)))
+
+    if right <= left:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    search = corr[left:right + 1]
+
+    if len(search) == 0:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    best_lag = left + int(np.argmax(search))
+
+    return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def estimate_period_rough(ys_arr: np.ndarray) -> int:
+    p_fft = estimate_period_by_fft(ys_arr)
+    p_refined = refine_period_by_autocorr(ys_arr, p_fft)
+
+    period = int(round(p_refined))
+    period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period))
+
+    return int(period)
+
+
+# =============================================================================
+# 谷底检测
+# =============================================================================
+
+def find_valley_indices(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    expected_period: int,
+) -> List[int]:
+    n = len(ys_grid)
+
+    if n < max(10, expected_period * 2):
+        return []
+
+    period = max(3, int(expected_period))
+    smooth_window = max(3, int(round(period * 0.08)))
+    smooth_window = min(smooth_window, 21)
+
+    ys_smooth = moving_average(ys_grid, smooth_window)
+    threshold = float(np.percentile(ys_smooth, VALLEY_QUANTILE))
+
+    candidates = []
+
+    for i in range(1, n - 1):
+        if (
+            ys_smooth[i] <= ys_smooth[i - 1]
+            and ys_smooth[i] < ys_smooth[i + 1]
+            and ys_smooth[i] <= threshold
+        ):
+            candidates.append(i)
+
+    if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        candidates = []
+
+        for i in range(1, n - 1):
+            if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]:
+                candidates.append(i)
+
+    if not candidates:
+        return []
+
+    min_distance = max(2, int(round(period * 0.55)))
+    selected = []
+
+    for idx in candidates:
+        if not selected:
+            selected.append(idx)
+            continue
+
+        if idx - selected[-1] >= min_distance:
+            selected.append(idx)
+            continue
+
+        if ys_smooth[idx] < ys_smooth[selected[-1]]:
+            selected[-1] = idx
+
+    if len(selected) < 2:
+        return selected
+
+    cleaned = [selected[0]]
+
+    for idx in selected[1:]:
+        diff = int(ts_grid[idx] - ts_grid[cleaned[-1]])
+
+        if int(period * 0.55) <= diff <= int(period * 1.60):
+            cleaned.append(idx)
+            continue
+
+        if diff < int(period * 0.55):
+            if ys_smooth[idx] < ys_smooth[cleaned[-1]]:
+                cleaned[-1] = idx
+            continue
+
+        cleaned.append(idx)
+
+    return cleaned
+
+
+def detect_period_and_valleys(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+) -> Tuple[int, List[int]]:
+    rough = estimate_period_rough(ys_grid)
+    valleys = find_valley_indices(ts_grid, ys_grid, rough)
+
+    if len(valleys) >= 3:
+        diffs = np.diff(ts_grid[valleys])
+        good = diffs[(diffs >= rough * 0.55) & (diffs <= rough * 1.60)]
+
+        if len(good) > 0:
+            period = int(round(float(np.median(good))))
+        else:
+            period = rough
+    else:
+        period = rough
+
+    period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period))
+
+    return int(period), valleys
+
+
+# =============================================================================
+# 模板构建
+# =============================================================================
+
+def build_templates_from_valleys(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    period: int,
+    valleys: List[int],
+    target: Dict,
+) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
+    if period <= 1 or len(valleys) < MIN_FULL_CYCLES_FOR_TEMPLATE + 1:
+        return None
+
+    strategy = target.get("strategy", "phase_point")
+    low_q = float(target.get("band_low_q", 10))
+    high_q = float(target.get("band_high_q", 90))
+
+    pairs = []
+
+    for a, b in zip(valleys[:-1], valleys[1:]):
+        cycle_len = float(ts_grid[b] - ts_grid[a])
+
+        if period * 0.55 <= cycle_len <= period * 1.60:
+            pairs.append((a, b, cycle_len))
+
+    if len(pairs) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    pairs = pairs[-MAX_CYCLES_FOR_TEMPLATE:]
+
+    phase_grid = np.arange(period, dtype=float)
+    segments = []
+    weights = []
+
+    for idx, (a, b, cycle_len) in enumerate(pairs):
+        seg_ts = ts_grid[a:b + 1]
+        seg_y = ys_grid[a:b + 1]
+
+        if len(seg_y) < 3:
+            continue
+
+        x_old = (seg_ts - seg_ts[0]) / cycle_len * period
+        seg = np.interp(phase_grid, x_old, seg_y)
+
+        segments.append(seg.astype(float))
+        weights.append(0.5 + 0.5 * ((idx + 1) / len(pairs)))
+
+    if len(segments) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    arr = np.vstack(segments)
+    w_arr = np.array(weights, dtype=float)
+
+    if strategy == "phase_band":
+        mid_template = np.percentile(arr, 50, axis=0)
+        lower_template = np.percentile(arr, low_q, axis=0)
+        upper_template = np.percentile(arr, high_q, axis=0)
+    else:
+        mid_template = np.average(arr, axis=0, weights=w_arr)
+        lower_template = mid_template.copy()
+        upper_template = mid_template.copy()
+
+    return (
+        mid_template.astype(float),
+        lower_template.astype(float),
+        upper_template.astype(float),
+    )
+
+
+def build_current_baseline(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    target: Dict,
+    tail_seconds: Optional[int] = None,
+) -> Optional[Tuple[int, int, np.ndarray, np.ndarray, np.ndarray]]:
+    if len(ys_grid) < MIN_POINTS:
+        return None
+
+    if tail_seconds is not None and tail_seconds > 0:
+        cutoff = ts_grid[-1] - int(tail_seconds)
+        mask = ts_grid >= cutoff
+        ts_use = ts_grid[mask]
+        ys_use = ys_grid[mask]
+    else:
+        ts_use = ts_grid
+        ys_use = ys_grid
+
+    if len(ys_use) < MIN_POINTS:
+        return None
+
+    period, valleys = detect_period_and_valleys(ts_use, ys_use)
+
+    templates = build_templates_from_valleys(
+        ts_grid=ts_use,
+        ys_grid=ys_use,
+        period=period,
+        valleys=valleys,
+        target=target,
+    )
+
+    if templates is None or len(valleys) == 0:
+        return None
+
+    template, lower_template, upper_template = templates
+    phase_origin_ts = int(round(float(ts_use[valleys[-1]])))
+
+    return int(period), phase_origin_ts, template, lower_template, upper_template
+
+
+# =============================================================================
+# 模板预测
+# =============================================================================
+
+def circular_template_value(template: np.ndarray, phase: float) -> float:
+    period = len(template)
+
+    if period == 0:
+        return 0.0
+
+    phase = float(phase) % period
+    i0 = int(math.floor(phase)) % period
+    i1 = (i0 + 1) % period
+    frac = phase - math.floor(phase)
+
+    return float((1.0 - frac) * template[i0] + frac * template[i1])
+
+
+def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
+    old_period = len(old_template)
+
+    if old_period == new_period:
+        return old_template.astype(float)
+
+    if old_period <= 1 or new_period <= 1:
+        return np.full(new_period, float(np.mean(old_template)), dtype=float)
+
+    old_x = np.linspace(0.0, 1.0, old_period, endpoint=False)
+    new_x = np.linspace(0.0, 1.0, new_period, endpoint=False)
+
+    old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0])
+    old_y_ext = np.concatenate([old_template, old_template, old_template])
+
+    return np.interp(new_x, old_x_ext, old_y_ext).astype(float)
+
+
+def predict_template_values(
+    template: np.ndarray,
+    period: int,
+    phase_origin_ts: int,
+    ts_list: List[int],
+) -> np.ndarray:
+    if period <= 1:
+        return np.zeros(len(ts_list), dtype=float)
+
+    if len(template) != period:
+        template = resample_template(template, period)
+
+    values = []
+
+    for ts in ts_list:
+        phase = (int(ts) - int(phase_origin_ts)) % period
+        values.append(circular_template_value(template, phase))
+
+    return np.array(values, dtype=float)
+
+
+def predict_state_bundle(
+    state: BaselineState,
+    ts_list: List[int],
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    period = int(state.period)
+    origin = int(state.phase_origin_ts)
+
+    mid = predict_template_values(
+        template=np.array(state.template, dtype=float),
+        period=period,
+        phase_origin_ts=origin,
+        ts_list=ts_list,
+    )
+
+    lower = predict_template_values(
+        template=np.array(state.lower_template, dtype=float),
+        period=period,
+        phase_origin_ts=origin,
+        ts_list=ts_list,
+    )
+
+    upper = predict_template_values(
+        template=np.array(state.upper_template, dtype=float),
+        period=period,
+        phase_origin_ts=origin,
+        ts_list=ts_list,
+    )
+
+    return mid, lower, upper
+
+
+def normalize_origin_near(origin: int, period: int, near_ts: int) -> int:
+    if period <= 1:
+        return origin
+
+    origin = int(origin)
+    period = int(period)
+    near_ts = int(near_ts)
+
+    while origin + period <= near_ts:
+        origin += period
+
+    while origin > near_ts:
+        origin -= period
+
+    return origin
+
+
+def merge_template(
+    old_template: np.ndarray,
+    new_template: np.ndarray,
+    alpha: float,
+) -> np.ndarray:
+    alpha = float(np.clip(alpha, 0.0, 1.0))
+
+    if len(old_template) != len(new_template):
+        old_template = resample_template(old_template, len(new_template))
+
+    merged = (1.0 - alpha) * old_template + alpha * new_template
+
+    return merged.astype(float)
+
+
+# =============================================================================
+# Phase Lock
+# =============================================================================
+
+def phase_lock_recent(
+    state: BaselineState,
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+) -> Tuple[int, int, np.ndarray, float]:
+    base_period = int(state.period)
+    base_origin = int(state.phase_origin_ts)
+    base_template = np.array(state.template, dtype=float)
+
+    if base_period <= 1 or len(base_template) <= 1:
+        ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
+        pred = predict_template_values(base_template, base_period, base_origin, ts_recent)
+        actual = ys_model[-len(ts_recent):].astype(float)
+        mae = float(np.mean(np.abs(actual - pred))) if len(actual) else 0.0
+        return base_period, base_origin, pred, mae
+
+    window_seconds = max(
+        PHASE_LOCK_MIN_WINDOW_SECONDS,
+        min(PHASE_LOCK_MAX_WINDOW_SECONDS, int(base_period * 2)),
+    )
+
+    cutoff = ts_grid[-1] - window_seconds
+    mask = ts_grid >= cutoff
+
+    ts_recent_arr = ts_grid[mask].astype(int)
+    actual = ys_model[mask].astype(float)
+
+    if len(ts_recent_arr) < max(10, DETECT_WINDOW_SECONDS):
+        ts_recent_arr = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int)
+        actual = ys_model[-DETECT_WINDOW_SECONDS:].astype(float)
+
+    ts_recent = ts_recent_arr.tolist()
+    last_ts = int(ts_recent[-1])
+
+    p_min = max(
+        int(MIN_PERIOD_SECONDS),
+        int(round(base_period * (1.0 - PHASE_LOCK_PERIOD_SEARCH_RATIO))),
+    )
+    p_max = min(
+        int(MAX_PERIOD_SECONDS),
+        int(round(base_period * (1.0 + PHASE_LOCK_PERIOD_SEARCH_RATIO))),
+    )
+
+    best_period = base_period
+    best_origin = normalize_origin_near(base_origin, base_period, last_ts)
+    best_template = resample_template(base_template, best_period)
+
+    best_pred = predict_template_values(
+        template=best_template,
+        period=best_period,
+        phase_origin_ts=best_origin,
+        ts_list=ts_recent,
+    )
+
+    best_mae = float(np.mean(np.abs(actual - best_pred)))
+
+    for period in range(p_min, p_max + 1, PHASE_LOCK_PERIOD_STEP):
+        template = resample_template(base_template, period)
+        center_origin = normalize_origin_near(base_origin, period, last_ts)
+        origin_shift = max(2, int(round(period * PHASE_LOCK_ORIGIN_SEARCH_RATIO)))
+
+        for shift in range(-origin_shift, origin_shift + 1, PHASE_LOCK_ORIGIN_STEP):
+            origin = center_origin + shift
+
+            pred = predict_template_values(
+                template=template,
+                period=period,
+                phase_origin_ts=origin,
+                ts_list=ts_recent,
+            )
+
+            mae = float(np.mean(np.abs(actual - pred)))
+            penalty = abs(period - base_period) * 0.5
+            score = mae + penalty
+
+            best_score = best_mae + abs(best_period - base_period) * 0.5
+
+            if score < best_score:
+                best_period = period
+                best_origin = origin
+                best_pred = pred
+                best_mae = mae
+
+    best_origin = normalize_origin_near(best_origin, best_period, last_ts)
+
+    return int(best_period), int(best_origin), best_pred, float(best_mae)
+
+
+# =============================================================================
+# 异常检测
+# =============================================================================
+
+def calc_point_bounds(
+    pred: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> Tuple[np.ndarray, np.ndarray]:
+    threshold = np.maximum(abs_threshold, np.abs(pred) * rel_threshold)
+    return pred - threshold, pred + threshold
+
+
+def calc_final_bounds(
+    state: BaselineState,
+    pred: np.ndarray,
+    lower_raw: np.ndarray,
+    upper_raw: np.ndarray,
+    target: Dict,
+) -> Tuple[np.ndarray, np.ndarray]:
+    strategy = target.get("strategy", "phase_point")
+    abs_threshold = float(target.get("abs_threshold", 1.0))
+    rel_threshold = float(target.get("rel_threshold", 0.25))
+
+    if strategy == "phase_band":
+        pad_abs = float(target.get("band_pad_abs", abs_threshold))
+        dynamic_pad = np.maximum(pad_abs, np.abs(pred) * rel_threshold * 0.20)
+        lower = lower_raw - dynamic_pad
+        upper = upper_raw + dynamic_pad
+        return lower, upper
+
+    return calc_point_bounds(pred, abs_threshold, rel_threshold)
+
+
+def detect_anomaly(
+    state: BaselineState,
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+    target: Dict,
+) -> Tuple[bool, float, float, float, int, int]:
+    best_period, best_origin, pred_recent, _ = phase_lock_recent(
+        state=state,
+        ts_grid=ts_grid,
+        ys_model=ys_model,
+    )
+
+    recent_len = len(pred_recent)
+
+    if recent_len <= 0:
+        return False, 0.0, 0.0, 0.0, best_period, best_origin
+
+    actual = ys_model[-recent_len:].astype(float)
+
+    tmp_state = BaselineState(
+        period=best_period,
+        phase_origin_ts=best_origin,
+        template=state.template,
+        lower_template=state.lower_template,
+        upper_template=state.upper_template,
+        strategy=state.strategy,
+        status=state.status,
+        clean_seconds=state.clean_seconds,
+        last_update_ts=state.last_update_ts,
+        last_seen_ts=state.last_seen_ts,
+        y_min=state.y_min,
+        y_max=state.y_max,
+    )
+
+    recent_ts = ts_grid[-recent_len:].astype(int).tolist()
+    pred, lower_raw, upper_raw = predict_state_bundle(tmp_state, recent_ts)
+
+    lower, upper = calc_final_bounds(
+        state=tmp_state,
+        pred=pred,
+        lower_raw=lower_raw,
+        upper_raw=upper_raw,
+        target=target,
+    )
+
+    outside = (actual < lower) | (actual > upper)
+    abs_err = np.abs(actual - pred)
+
+    outside_ratio = float(np.mean(outside))
+    mean_abs_err = float(np.mean(abs_err))
+    mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1e-6)))
+
+    is_anomaly = outside_ratio >= OUTSIDE_RATIO_THRESHOLD
+
+    return (
+        is_anomaly,
+        outside_ratio,
+        mean_abs_err,
+        mean_rel_err,
+        int(best_period),
+        int(best_origin),
+    )
+
+
+# =============================================================================
+# 状态管理
+# =============================================================================
+
+def create_initial_state(
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+    target: Dict,
+    now_sec: int,
+) -> Optional[BaselineState]:
+    baseline = build_current_baseline(
+        ts_grid=ts_grid,
+        ys_grid=ys_model,
+        target=target,
+    )
+
+    if baseline is None:
+        return None
+
+    period, phase_origin_ts, template, lower_template, upper_template = baseline
+
+    return BaselineState(
+        period=int(period),
+        phase_origin_ts=int(phase_origin_ts),
+        template=template.astype(float).tolist(),
+        lower_template=lower_template.astype(float).tolist(),
+        upper_template=upper_template.astype(float).tolist(),
+        strategy=str(target.get("strategy", "phase_point")),
+        status=BASELINE_STATUS_HEALTHY,
+        clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE),
+        last_update_ts=now_sec,
+        last_seen_ts=now_sec,
+        y_min=float(np.min(ys_model)),
+        y_max=float(np.max(ys_model)),
+    )
+
+
+def apply_phase_lock_to_state(
+    state: BaselineState,
+    best_period: int,
+    best_origin: int,
+) -> None:
+    best_period = int(best_period)
+
+    if best_period <= 1:
+        return
+
+    if len(state.template) != best_period:
+        state.template = resample_template(
+            np.array(state.template, dtype=float),
+            best_period,
+        ).astype(float).tolist()
+
+    if len(state.lower_template) != best_period:
+        state.lower_template = resample_template(
+            np.array(state.lower_template, dtype=float),
+            best_period,
+        ).astype(float).tolist()
+
+    if len(state.upper_template) != best_period:
+        state.upper_template = resample_template(
+            np.array(state.upper_template, dtype=float),
+            best_period,
+        ).astype(float).tolist()
+
+    state.period = best_period
+    state.phase_origin_ts = int(best_origin)
+
+
+def maybe_update_state(
+    key: str,
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+    target: Dict,
+) -> Tuple[Optional[BaselineState], bool, float, float, float]:
+    now_sec = int(time.time())
+    state = BASELINE_STATES.get(key)
+
+    if state is None:
+        state = create_initial_state(
+            ts_grid=ts_grid,
+            ys_model=ys_model,
+            target=target,
+            now_sec=now_sec,
+        )
+
+        if state is None:
+            return None, False, 0.0, 0.0, 0.0
+
+        BASELINE_STATES[key] = state
+
+        logger.info(
+            "初始化健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss",
+            key,
+            state.strategy,
+            state.period,
+            datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
+            state.clean_seconds,
+        )
+
+        return state, False, 0.0, 0.0, 0.0
+
+    elapsed = max(1, now_sec - int(state.last_seen_ts))
+    elapsed = min(elapsed, POLL_INTERVAL * 2)
+    state.last_seen_ts = now_sec
+
+    (
+        is_anomaly,
+        outside_ratio,
+        mean_abs_err,
+        mean_rel_err,
+        best_period,
+        best_origin,
+    ) = detect_anomaly(
+        state=state,
+        ts_grid=ts_grid,
+        ys_model=ys_model,
+        target=target,
+    )
+
+    if is_anomaly:
+        state.status = BASELINE_STATUS_ANOMALY
+        state.clean_seconds = 0
+        BASELINE_STATES[key] = state
+
+        logger.warning(
+            "检测到异常，冻结模板 key=%s outside_ratio=%.2f mean_abs_err=%.4f mean_rel_err=%.4f",
+            key,
+            outside_ratio,
+            mean_abs_err,
+            mean_rel_err,
+        )
+
+        return state, True, outside_ratio, mean_abs_err, mean_rel_err
+
+    old_period = int(state.period)
+    old_origin = int(state.phase_origin_ts)
+
+    apply_phase_lock_to_state(state, best_period, best_origin)
+
+    if old_period != state.period or old_origin != state.phase_origin_ts:
+        logger.info(
+            "phase-lock key=%s period %s -> %s origin %s -> %s",
+            key,
+            old_period,
+            state.period,
+            datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"),
+            datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
+        )
+
+    if state.status == BASELINE_STATUS_ANOMALY:
+        state.status = BASELINE_STATUS_RECOVERING
+        state.clean_seconds = elapsed
+        BASELINE_STATES[key] = state
+
+        logger.info(
+            "异常开始恢复 key=%s clean_seconds=%ss",
+            key,
+            state.clean_seconds,
+        )
+
+        return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+    if state.status == BASELINE_STATUS_RECOVERING:
+        state.clean_seconds += elapsed
+    else:
+        state.status = BASELINE_STATUS_HEALTHY
+        state.clean_seconds += elapsed
+
+    min_clean_for_update = max(
+        RECOVERY_MIN_SECONDS,
+        int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE,
+    )
+
+    if state.clean_seconds < min_clean_for_update:
+        BASELINE_STATES[key] = state
+        return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+    tail_seconds = min(
+        int(state.clean_seconds),
+        int(state.period) * MAX_CYCLES_FOR_TEMPLATE,
+    )
+
+    baseline = build_current_baseline(
+        ts_grid=ts_grid,
+        ys_grid=ys_model,
+        target=target,
+        tail_seconds=tail_seconds,
+    )
+
+    if baseline is None:
+        BASELINE_STATES[key] = state
+        return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+    new_period, new_origin, new_template, new_lower_template, new_upper_template = baseline
+
+    alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA
+
+    state.template = merge_template(
+        np.array(state.template, dtype=float),
+        new_template,
+        alpha,
+    ).astype(float).tolist()
+
+    state.lower_template = merge_template(
+        np.array(state.lower_template, dtype=float),
+        new_lower_template,
+        alpha,
+    ).astype(float).tolist()
+
+    state.upper_template = merge_template(
+        np.array(state.upper_template, dtype=float),
+        new_upper_template,
+        alpha,
+    ).astype(float).tolist()
+
+    state.period = int(new_period)
+    state.phase_origin_ts = int(new_origin)
+    state.status = BASELINE_STATUS_HEALTHY
+    state.last_update_ts = now_sec
+
+    if tail_seconds > 0 and len(ys_model) >= tail_seconds:
+        state.y_min = float(np.min(ys_model[-tail_seconds:]))
+        state.y_max = float(np.max(ys_model[-tail_seconds:]))
+    else:
+        state.y_min = float(np.min(ys_model))
+        state.y_max = float(np.max(ys_model))
+
+    BASELINE_STATES[key] = state
+
+    logger.info(
+        "更新健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss alpha=%.2f",
+        key,
+        state.strategy,
+        state.period,
+        datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
+        state.clean_seconds,
+        alpha,
+    )
+
+    return state, False, outside_ratio, mean_abs_err, mean_rel_err
+
+
+# =============================================================================
+# Prometheus 写入
+# =============================================================================
+
+def prom_escape_label_value(value: str) -> str:
+    return (
+        str(value)
+        .replace("\\", "\\\\")
+        .replace("\n", "\\n")
+        .replace('"', '\\"')
+    )
+
+
+def labels_to_str(labels: Dict[str, str]) -> str:
+    if not labels:
+        return ""
+
+    parts = []
+
+    for k in sorted(labels.keys()):
+        parts.append(f'{k}="{prom_escape_label_value(labels[k])}"')
+
+    return "{" + ",".join(parts) + "}"
+
+
+def write_series(
+    metric_name: str,
+    labels: Dict[str, str],
+    ts_list: List[int],
+    values: List[float],
+) -> bool:
+    if not ts_list or not values or len(ts_list) != len(values):
+        return False
+
+    label_str = labels_to_str(labels)
+    lines = []
+
+    for t, y in zip(ts_list, values):
+        try:
+            ts_sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(ts_sec) or not math.isfinite(val):
+            continue
+
+        lines.append(f"{metric_name}{label_str} {val:.6f} {ts_sec * 1000}")
+
+    if not lines:
+        return False
+
+    payload = "\n".join(lines) + "\n"
+
+    try:
+        resp = requests.post(
+            f"{VM_URL}/api/v1/import/prometheus",
+            data=payload.encode("utf-8"),
+            headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
+            timeout=10,
+        )
+        resp.raise_for_status()
+        return True
+
+    except requests.RequestException as e:
+        logger.error("写入数据失败 metric=%s: %s", metric_name, e)
+        return False
+
+
+def write_prediction_bundle(
+    pred_metric: str,
+    anomaly_metric: str,
+    labels: Dict[str, str],
+    ts_future: List[int],
+    pred_values: np.ndarray,
+    lower_values: np.ndarray,
+    upper_values: np.ndarray,
+    is_anomaly: bool,
+    outside_ratio: float,
+    mean_abs_err: float,
+    mean_rel_err: float,
+    event_ts: int,
+) -> bool:
+    ok1 = write_series(
+        metric_name=pred_metric,
+        labels=labels,
+        ts_list=ts_future,
+        values=pred_values.astype(float).tolist(),
+    )
+
+    ok2 = write_series(
+        metric_name=f"{pred_metric}_lower",
+        labels=labels,
+        ts_list=ts_future,
+        values=lower_values.astype(float).tolist(),
+    )
+
+    ok3 = write_series(
+        metric_name=f"{pred_metric}_upper",
+        labels=labels,
+        ts_list=ts_future,
+        values=upper_values.astype(float).tolist(),
+    )
+
+    anomaly_labels = dict(labels)
+    anomaly_labels["type"] = "prediction_deviation"
+
+    ok4 = write_series(
+        metric_name=anomaly_metric,
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[1.0 if is_anomaly else 0.0],
+    )
+
+    ok5 = write_series(
+        metric_name=f"{anomaly_metric}_outside_ratio",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[outside_ratio],
+    )
+
+    ok6 = write_series(
+        metric_name=f"{anomaly_metric}_mean_abs_error",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[mean_abs_err],
+    )
+
+    ok7 = write_series(
+        metric_name=f"{anomaly_metric}_mean_rel_error",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[mean_rel_err],
+    )
+
+    return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7
+
+
+# =============================================================================
+# 标签解析
+# =============================================================================
+
+_LABEL_PATTERN = re.compile(
+    r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*'
+)
+
+
+def parse_labels_from_query(query: str) -> Dict[str, str]:
+    labels = {}
+
+    if "{" not in query or "}" not in query:
+        return labels
+
+    try:
+        label_part = query[query.index("{") + 1:query.rindex("}")]
+    except Exception:
+        return labels
+
+    for match in _LABEL_PATTERN.finditer(label_part):
+        key = match.group(1)
+        value = match.group(2)
+
+        value = (
+            value
+            .replace('\\"', '"')
+            .replace("\\n", "\n")
+            .replace("\\\\", "\\")
+        )
+
+        labels[key] = value
+
+    return labels
+
+
+def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]:
+    result = {}
+
+    for d in dicts:
+        if d:
+            result.update(d)
+
+    return result
+
+
+def series_key(metric_name: str, labels: Dict[str, str]) -> str:
+    return metric_name + labels_to_str(labels)
+
+
+# =============================================================================
+# 状态持久化
+# =============================================================================
+
+def load_state() -> None:
+    global BASELINE_STATES
+
+    if not os.path.exists(STATE_FILE):
+        return
+
+    try:
+        with open(STATE_FILE, "r", encoding="utf-8") as f:
+            raw = json.load(f)
+
+        states = {}
+
+        for key, value in raw.get("baseline_states", {}).items():
+            required_fields = {
+                "period",
+                "phase_origin_ts",
+                "template",
+                "lower_template",
+                "upper_template",
+                "strategy",
+                "status",
+                "clean_seconds",
+                "last_update_ts",
+                "last_seen_ts",
+                "y_min",
+                "y_max",
+            }
+
+            if not required_fields.issubset(set(value.keys())):
+                continue
+
+            states[key] = BaselineState(**value)
+
+        BASELINE_STATES = states
+
+        logger.info(
+            "已加载预测状态文件 %s，状态数量=%d",
+            STATE_FILE,
+            len(BASELINE_STATES),
+        )
+
+    except Exception as e:
+        logger.warning("加载预测状态文件失败，将重新学习: %s", e)
+
+
+def save_state() -> None:
+    try:
+        raw = {
+            "baseline_states": {
+                key: asdict(value)
+                for key, value in BASELINE_STATES.items()
+            }
+        }
+
+        tmp_file = STATE_FILE + ".tmp"
+
+        with open(tmp_file, "w", encoding="utf-8") as f:
+            json.dump(raw, f, ensure_ascii=False, indent=2)
+
+        os.replace(tmp_file, STATE_FILE)
+
+    except Exception as e:
+        logger.warning("保存预测状态文件失败: %s", e)
+
+
+# =============================================================================
+# 时间轴
+# =============================================================================
+
+def build_prediction_timestamps(
+    key: str,
+    last_real_ts: int,
+    now_sec: int,
+) -> Optional[List[int]]:
+    data_lag = now_sec - last_real_ts
+
+    if data_lag > MAX_DATA_LAG_SECONDS:
+        logger.warning(
+            "真实数据延迟过大，跳过预测 key=%s data_lag=%ss max=%ss",
+            key,
+            data_lag,
+            MAX_DATA_LAG_SECONDS,
+        )
+        return None
+
+    last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key)
+
+    if last_written_real_ts is not None and last_real_ts <= int(last_written_real_ts):
+        logger.info(
+            "真实数据时间戳未推进，跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s",
+            key,
+            last_real_ts,
+            last_written_real_ts,
+        )
+        return None
+
+    base_ts = last_real_ts
+
+    return [
+        base_ts + i + 1
+        for i in range(WRITE_HORIZON_SECONDS)
+    ]
+
+
+# =============================================================================
+# 主流程
+# =============================================================================
+
+def run_once() -> None:
+    now_str = datetime.now().strftime("%H:%M:%S")
+
+    for target in PREDICT_TARGETS:
+        query = target["query"]
+        pred_metric = target["pred_metric"]
+        anomaly_metric = target["anomaly_metric"]
+
+        ts, ys = fetch_history(query)
+
+        if len(ys) < MIN_POINTS:
+            logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
+            continue
+
+        ts_grid, ys_grid_raw = normalize_history(ts, ys)
+
+        if len(ys_grid_raw) < MIN_POINTS:
+            logger.info("[%s] %s 清洗后数据不足（%d 点），跳过", now_str, query, len(ys_grid_raw))
+            continue
+
+        ys_grid_model = preprocess_values(ys_grid_raw, target)
+
+        base_labels = parse_labels_from_query(query)
+        write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
+
+        key = series_key(pred_metric, write_labels)
+
+        state, is_anomaly, outside_ratio, mean_abs_err, mean_rel_err = maybe_update_state(
+            key=key,
+            ts_grid=ts_grid,
+            ys_model=ys_grid_model,
+            target=target,
+        )
+
+        if state is None:
+            logger.info("[%s] %s 暂无可用健康模板，等待学习", now_str, query)
+            continue
+
+        now_sec = int(time.time())
+        last_real_ts = int(ts_grid[-1])
+        data_lag = now_sec - last_real_ts
+
+        ts_future = build_prediction_timestamps(
+            key=key,
+            last_real_ts=last_real_ts,
+            now_sec=now_sec,
+        )
+
+        if not ts_future:
+            continue
+
+        pred_values, lower_raw, upper_raw = predict_state_bundle(state, ts_future)
+
+        lower_values, upper_values = calc_final_bounds(
+            state=state,
+            pred=pred_values,
+            lower_raw=lower_raw,
+            upper_raw=upper_raw,
+            target=target,
+        )
+
+        ok = write_prediction_bundle(
+            pred_metric=pred_metric,
+            anomaly_metric=anomaly_metric,
+            labels=write_labels,
+            ts_future=ts_future,
+            pred_values=pred_values,
+            lower_values=lower_values,
+            upper_values=upper_values,
+            is_anomaly=is_anomaly,
+            outside_ratio=outside_ratio,
+            mean_abs_err=mean_abs_err,
+            mean_rel_err=mean_rel_err,
+            event_ts=last_real_ts,
+        )
+
+        if not ok:
+            logger.error("[%s] %s 写入预测数据失败", now_str, query)
+            continue
+
+        LAST_REAL_TS_WRITTEN[key] = last_real_ts
+
+        future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S")
+        future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
+        last_real_str = datetime.fromtimestamp(last_real_ts).strftime("%H:%M:%S")
+        origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S")
+
+        logger.info(
+            "[%s] %-40s → %-35s strategy=%s status=%s anomaly=%s period=%ss origin=%s last_real=%s lag=%ss 写入 %d 点，预测区间 %s ~ %s",
+            now_str,
+            query,
+            pred_metric,
+            state.strategy,
+            state.status,
+            is_anomaly,
+            state.period,
+            origin_str,
+            last_real_str,
+            data_lag,
+            len(ts_future),
+            future_start,
+            future_end,
+        )
+
+    save_state()
+
+
+def main() -> None:
+    load_state()
+
+    logger.info(
+        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s",
+        VM_URL,
+        HISTORY_MINUTES,
+        HORIZON_SECONDS,
+        WRITE_HORIZON_SECONDS,
+        POLL_INTERVAL,
+        STATE_FILE,
+        EXTRA_PREDICT_LABELS["forecast"],
+    )
+
+    while True:
+        run_once()
+        time.sleep(POLL_INTERVAL)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 3609fbae4e2fbb33700de79ee1f7730ac81e366c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Thu, 21 May 2026 21:39:55 +0800
Subject: [PATCH 27/31] fix

---
 ai/pridict_v4.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ai/pridict_v4.py b/ai/pridict_v4.py
index 774ad3a..c55f21a 100644
--- a/ai/pridict_v4.py
+++ b/ai/pridict_v4.py
@@ -120,11 +120,11 @@
         "anomaly_metric": "vibration_x_anomaly",
         "strategy": "phase_band",
         "abs_threshold": 0.12,
-        "rel_threshold": 0.35,
+        "rel_threshold": 0.40,
         "smooth_window": 5,
         "band_low_q": 10,
         "band_high_q": 90,
-        "band_pad_abs": 0.06,
+        "band_pad_abs": 0.08,
     },
     {
         "query": 'vibration_y{device_id="fanuc-cnc"}',
@@ -132,11 +132,11 @@
         "anomaly_metric": "vibration_y_anomaly",
         "strategy": "phase_band",
         "abs_threshold": 0.12,
-        "rel_threshold": 0.35,
+        "rel_threshold": 0.40,
         "smooth_window": 5,
         "band_low_q": 10,
         "band_high_q": 90,
-        "band_pad_abs": 0.06,
+        "band_pad_abs": 0.08,
     },
     {
         "query": 'vibration_z{device_id="fanuc-cnc"}',
@@ -144,11 +144,11 @@
         "anomaly_metric": "vibration_z_anomaly",
         "strategy": "phase_band",
         "abs_threshold": 0.12,
-        "rel_threshold": 0.35,
+        "rel_threshold": 0.40,
         "smooth_window": 5,
         "band_low_q": 10,
         "band_high_q": 90,
-        "band_pad_abs": 0.06,
+        "band_pad_abs": 0.08,
     },
 ]
 

From 1c4217b31d9304cddeb6be16949cc54eaa8d31ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Thu, 21 May 2026 21:48:41 +0800
Subject: [PATCH 28/31] fix

---
 ai/pridict_v4.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/ai/pridict_v4.py b/ai/pridict_v4.py
index c55f21a..8657944 100644
--- a/ai/pridict_v4.py
+++ b/ai/pridict_v4.py
@@ -119,37 +119,37 @@
         "pred_metric": "vibration_x_predicted",
         "anomaly_metric": "vibration_x_anomaly",
         "strategy": "phase_band",
-        "abs_threshold": 0.12,
-        "rel_threshold": 0.40,
+        "abs_threshold": 0.18,
+        "rel_threshold": 0.50,
         "smooth_window": 5,
-        "band_low_q": 10,
-        "band_high_q": 90,
-        "band_pad_abs": 0.08,
+        "band_low_q": 2,
+        "band_high_q": 98,
+        "band_pad_abs": 0.12,
     },
     {
         "query": 'vibration_y{device_id="fanuc-cnc"}',
         "pred_metric": "vibration_y_predicted",
         "anomaly_metric": "vibration_y_anomaly",
         "strategy": "phase_band",
-        "abs_threshold": 0.12,
-        "rel_threshold": 0.40,
+        "abs_threshold": 0.18,
+        "rel_threshold": 0.50,
         "smooth_window": 5,
-        "band_low_q": 10,
-        "band_high_q": 90,
-        "band_pad_abs": 0.08,
+        "band_low_q": 2,
+        "band_high_q": 98,
+        "band_pad_abs": 0.12,
     },
     {
         "query": 'vibration_z{device_id="fanuc-cnc"}',
         "pred_metric": "vibration_z_predicted",
         "anomaly_metric": "vibration_z_anomaly",
         "strategy": "phase_band",
-        "abs_threshold": 0.12,
-        "rel_threshold": 0.40,
+        "abs_threshold": 0.18,
+        "rel_threshold": 0.50,
         "smooth_window": 5,
-        "band_low_q": 10,
-        "band_high_q": 90,
-        "band_pad_abs": 0.08,
-    },
+        "band_low_q": 2,
+        "band_high_q": 98,
+        "band_pad_abs": 0.12,
+    }
 ]
 
 EXTRA_PREDICT_LABELS = {

From 4077e8f416d9d2f9d60df50d551fc817ddacd982 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Fri, 22 May 2026 09:40:02 +0800
Subject: [PATCH 29/31] feat(predict_v5): add predict v5

---
 ai/pridict_v5.py | 1794 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1794 insertions(+)
 create mode 100644 ai/pridict_v5.py

diff --git a/ai/pridict_v5.py b/ai/pridict_v5.py
new file mode 100644
index 0000000..6894a66
--- /dev/null
+++ b/ai/pridict_v5.py
@@ -0,0 +1,1794 @@
+# -*- coding: utf-8 -*-
+"""
+ProtoForge Predictor v12
+
+核心能力：
+1. feed_rate / spindle_speed / spindle_current 使用 phase-lock 点预测。
+2. vibration_x / vibration_y / vibration_z 使用 phase-band 预测带。
+3. vibration 类指标：
+   - predicted 使用平滑后的中位数模板，用于趋势参考。
+   - upper/lower 使用原始波动分位数模板 + padding，用于正常波动容忍带。
+   - 偶发越界不直接报警，只有持续越界 / 高比例越界 / 严重越界才报警。
+4. 预测起点锚定最后一个真实点 last_real_ts，避免时间错位。
+5. 异常期间冻结健康模板，不学习故障数据。
+6. 故障恢复后等待稳定，再恢复模板学习。
+7. 写入：
+   - xxx_predicted
+   - xxx_predicted_upper
+   - xxx_predicted_lower
+   - xxx_anomaly
+   - xxx_anomaly_outside_ratio
+   - xxx_anomaly_mean_abs_error
+   - xxx_anomaly_mean_rel_error
+   - xxx_anomaly_max_consecutive_outside
+   - xxx_anomaly_max_exceed_ratio
+"""
+
+import json
+import logging
+import math
+import os
+import re
+import time
+from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import requests
+
+
+# =============================================================================
+# 日志配置
+# =============================================================================
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# 基础配置
+# =============================================================================
+
+VM_URL = "http://localhost:8428"
+STATE_FILE = "/tmp/protoforge_predictor_state_v12.json"
+
+HISTORY_MINUTES = 30
+HORIZON_SECONDS = 120
+POLL_INTERVAL = 30
+
+WRITE_HORIZON_SECONDS = min(HORIZON_SECONDS, POLL_INTERVAL)
+
+QUERY_STEP = "1s"
+MIN_POINTS = 120
+
+MIN_PERIOD_SECONDS = 5
+MAX_PERIOD_SECONDS = 3600
+
+MIN_FULL_CYCLES_FOR_TEMPLATE = 3
+MAX_CYCLES_FOR_TEMPLATE = 8
+
+DETECT_WINDOW_SECONDS = 30
+RECOVERY_MIN_SECONDS = 60
+
+HEALTHY_EMA_ALPHA = 0.10
+RECOVERY_EMA_ALPHA = 0.25
+
+OUTSIDE_RATIO_THRESHOLD = 0.60
+MIN_CONSECUTIVE_OUTSIDE = 5
+SEVERE_EXCEED_RATIO = 1.8
+
+VALLEY_QUANTILE = 45
+
+MAX_DATA_LAG_SECONDS = 180
+
+PHASE_LOCK_MIN_WINDOW_SECONDS = 45
+PHASE_LOCK_MAX_WINDOW_SECONDS = 180
+PHASE_LOCK_PERIOD_SEARCH_RATIO = 0.12
+PHASE_LOCK_ORIGIN_SEARCH_RATIO = 0.35
+PHASE_LOCK_PERIOD_STEP = 1
+PHASE_LOCK_ORIGIN_STEP = 1
+
+
+# =============================================================================
+# 指标配置
+# =============================================================================
+
+PREDICT_TARGETS = [
+    {
+        "query": 'feed_rate{device_id="fanuc-cnc"}',
+        "pred_metric": "feed_rate_predicted",
+        "anomaly_metric": "feed_rate_anomaly",
+        "strategy": "phase_point",
+        "abs_threshold": 400.0,
+        "rel_threshold": 0.25,
+        "smooth_window": 1,
+        "outside_ratio_threshold": 0.60,
+        "min_consecutive_outside": 5,
+        "severe_exceed_ratio": 1.8,
+    },
+    {
+        "query": 'spindle_speed{device_id="fanuc-cnc"}',
+        "pred_metric": "spindle_speed_predicted",
+        "anomaly_metric": "spindle_speed_anomaly",
+        "strategy": "phase_point",
+        "abs_threshold": 500.0,
+        "rel_threshold": 0.25,
+        "smooth_window": 1,
+        "outside_ratio_threshold": 0.60,
+        "min_consecutive_outside": 5,
+        "severe_exceed_ratio": 1.8,
+    },
+    {
+        "query": 'spindle_current{device_id="fanuc-cnc"}',
+        "pred_metric": "spindle_current_predicted",
+        "anomaly_metric": "spindle_current_anomaly",
+        "strategy": "phase_point",
+        "abs_threshold": 5.0,
+        "rel_threshold": 0.25,
+        "smooth_window": 1,
+        "outside_ratio_threshold": 0.60,
+        "min_consecutive_outside": 5,
+        "severe_exceed_ratio": 1.8,
+    },
+    {
+        "query": 'vibration_x{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_x_predicted",
+        "anomaly_metric": "vibration_x_anomaly",
+        "strategy": "phase_band",
+
+        # vibration 类指标噪声、尖峰较多，不建议用很窄的阈值。
+        "abs_threshold": 0.18,
+        "rel_threshold": 0.55,
+
+        # 平滑只用于相位锁定和 predicted 中位趋势。
+        "smooth_window": 5,
+
+        # upper/lower 用原始值分位数，范围放宽，覆盖正常尖峰。
+        "band_low_q": 1,
+        "band_high_q": 99,
+        "band_pad_abs": 0.15,
+
+        # 偶发越界容忍。
+        "outside_ratio_threshold": 0.70,
+        "min_consecutive_outside": 5,
+        "severe_exceed_ratio": 2.0,
+    },
+    {
+        "query": 'vibration_y{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_y_predicted",
+        "anomaly_metric": "vibration_y_anomaly",
+        "strategy": "phase_band",
+        "abs_threshold": 0.18,
+        "rel_threshold": 0.55,
+        "smooth_window": 5,
+        "band_low_q": 1,
+        "band_high_q": 99,
+        "band_pad_abs": 0.15,
+        "outside_ratio_threshold": 0.70,
+        "min_consecutive_outside": 5,
+        "severe_exceed_ratio": 2.0,
+    },
+    {
+        "query": 'vibration_z{device_id="fanuc-cnc"}',
+        "pred_metric": "vibration_z_predicted",
+        "anomaly_metric": "vibration_z_anomaly",
+        "strategy": "phase_band",
+        "abs_threshold": 0.18,
+        "rel_threshold": 0.55,
+        "smooth_window": 5,
+        "band_low_q": 1,
+        "band_high_q": 99,
+        "band_pad_abs": 0.15,
+        "outside_ratio_threshold": 0.70,
+        "min_consecutive_outside": 5,
+        "severe_exceed_ratio": 2.0,
+    },
+]
+
+EXTRA_PREDICT_LABELS = {
+    "forecast": "phase_band_health_v12",
+    "source": "protoforge",
+}
+
+BASELINE_STATUS_HEALTHY = "healthy"
+BASELINE_STATUS_ANOMALY = "anomaly"
+BASELINE_STATUS_RECOVERING = "recovering"
+
+
+# =============================================================================
+# 状态结构
+# =============================================================================
+
+@dataclass
+class BaselineState:
+    period: int
+    phase_origin_ts: int
+    template: List[float]
+    lower_template: List[float]
+    upper_template: List[float]
+    strategy: str
+    status: str
+    clean_seconds: int
+    last_update_ts: int
+    last_seen_ts: int
+    y_min: float
+    y_max: float
+
+
+BASELINE_STATES: Dict[str, BaselineState] = {}
+LAST_REAL_TS_WRITTEN: Dict[str, int] = {}
+
+
+# =============================================================================
+# VictoriaMetrics 读取
+# =============================================================================
+
+def fetch_history(query: str, minutes: int = HISTORY_MINUTES) -> Tuple[List[float], List[float]]:
+    now = datetime.now()
+    start = now - timedelta(minutes=minutes)
+
+    try:
+        resp = requests.get(
+            f"{VM_URL}/api/v1/query_range",
+            params={
+                "query": query,
+                "start": start.timestamp(),
+                "end": now.timestamp(),
+                "step": QUERY_STEP,
+            },
+            timeout=10,
+        )
+        resp.raise_for_status()
+    except requests.RequestException as e:
+        logger.error("拉取数据失败 query=%s: %s", query, e)
+        return [], []
+
+    try:
+        result = resp.json().get("data", {}).get("result", [])
+    except Exception as e:
+        logger.error("解析 VM 返回失败 query=%s: %s", query, e)
+        return [], []
+
+    if not result:
+        return [], []
+
+    values = result[0].get("values", [])
+
+    ts = []
+    ys = []
+
+    for item in values:
+        if len(item) < 2:
+            continue
+
+        try:
+            t = float(item[0])
+            y = float(item[1])
+        except Exception:
+            continue
+
+        if not math.isfinite(t) or not math.isfinite(y):
+            continue
+
+        ts.append(t)
+        ys.append(y)
+
+    return ts, ys
+
+
+def normalize_history(ts: List[float], ys: List[float]) -> Tuple[np.ndarray, np.ndarray]:
+    if not ts or not ys or len(ts) != len(ys):
+        return np.array([]), np.array([])
+
+    data = {}
+
+    for t, y in zip(ts, ys):
+        try:
+            sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(sec) or not math.isfinite(val):
+            continue
+
+        data[sec] = val
+
+    if not data:
+        return np.array([]), np.array([])
+
+    sorted_items = sorted(data.items(), key=lambda x: x[0])
+
+    ts_clean = np.array([x[0] for x in sorted_items], dtype=float)
+    ys_clean = np.array([x[1] for x in sorted_items], dtype=float)
+
+    if len(ts_clean) < 2:
+        return ts_clean, ys_clean
+
+    start_sec = int(ts_clean[0])
+    end_sec = int(ts_clean[-1])
+
+    if end_sec <= start_sec:
+        return ts_clean, ys_clean
+
+    ts_grid = np.arange(start_sec, end_sec + 1, 1, dtype=float)
+    ys_grid = np.interp(ts_grid, ts_clean, ys_clean)
+
+    return ts_grid, ys_grid
+
+
+# =============================================================================
+# 平滑与预处理
+# =============================================================================
+
+def rolling_median(arr: np.ndarray, window: int) -> np.ndarray:
+    if window <= 1 or len(arr) < window:
+        return arr.astype(float)
+
+    if window % 2 == 0:
+        window += 1
+
+    pad = window // 2
+    padded = np.pad(arr.astype(float), (pad, pad), mode="edge")
+
+    result = []
+
+    for i in range(len(arr)):
+        result.append(float(np.median(padded[i:i + window])))
+
+    return np.array(result, dtype=float)
+
+
+def moving_average(arr: np.ndarray, window: int) -> np.ndarray:
+    if window <= 1 or len(arr) < window:
+        return arr.astype(float)
+
+    if window % 2 == 0:
+        window += 1
+
+    kernel = np.ones(window, dtype=float) / window
+    pad = window // 2
+    padded = np.pad(arr.astype(float), (pad, pad), mode="edge")
+
+    return np.convolve(padded, kernel, mode="valid")
+
+
+def preprocess_values(ys_grid: np.ndarray, target: Dict) -> np.ndarray:
+    strategy = target.get("strategy", "phase_point")
+    smooth_window = int(target.get("smooth_window", 1))
+
+    if strategy == "phase_band":
+        return rolling_median(ys_grid, smooth_window)
+
+    if smooth_window > 1:
+        return moving_average(ys_grid, smooth_window)
+
+    return ys_grid.astype(float)
+
+
+# =============================================================================
+# 周期估计
+# =============================================================================
+
+def estimate_period_by_fft(ys_arr: np.ndarray) -> float:
+    n = len(ys_arr)
+
+    if n < 8:
+        return 60.0
+
+    centered = ys_arr - np.mean(ys_arr)
+
+    if np.allclose(centered, 0):
+        return 60.0
+
+    fft_vals = np.fft.rfft(centered)
+    freqs = np.fft.rfftfreq(n, d=1.0)
+
+    if len(freqs) <= 1:
+        return 60.0
+
+    power = np.abs(fft_vals[1:])
+
+    if len(power) == 0 or np.max(power) <= 0:
+        return 60.0
+
+    dominant_idx = int(np.argmax(power)) + 1
+    dominant_freq = float(freqs[dominant_idx])
+
+    if dominant_freq <= 0:
+        return 60.0
+
+    period = 1.0 / dominant_freq
+
+    return float(np.clip(period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def refine_period_by_autocorr(ys_arr: np.ndarray, init_period: float) -> float:
+    n = len(ys_arr)
+
+    if n < 20:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    centered = ys_arr - np.mean(ys_arr)
+
+    if np.allclose(centered, 0):
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    corr = np.correlate(centered, centered, mode="full")[n - 1:]
+
+    p0 = int(round(init_period))
+    left = max(int(MIN_PERIOD_SECONDS), int(max(2, p0 * 0.7)))
+    right = min(n // 2, int(max(left + 1, p0 * 1.3)))
+
+    if right <= left:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    search = corr[left:right + 1]
+
+    if len(search) == 0:
+        return float(np.clip(init_period, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+    best_lag = left + int(np.argmax(search))
+
+    return float(np.clip(best_lag, MIN_PERIOD_SECONDS, MAX_PERIOD_SECONDS))
+
+
+def estimate_period_rough(ys_arr: np.ndarray) -> int:
+    p_fft = estimate_period_by_fft(ys_arr)
+    p_refined = refine_period_by_autocorr(ys_arr, p_fft)
+
+    period = int(round(p_refined))
+    period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period))
+
+    return int(period)
+
+
+# =============================================================================
+# 谷底检测
+# =============================================================================
+
+def find_valley_indices(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+    expected_period: int,
+) -> List[int]:
+    n = len(ys_grid)
+
+    if n < max(10, expected_period * 2):
+        return []
+
+    period = max(3, int(expected_period))
+    smooth_window = max(3, int(round(period * 0.08)))
+    smooth_window = min(smooth_window, 21)
+
+    ys_smooth = moving_average(ys_grid, smooth_window)
+    threshold = float(np.percentile(ys_smooth, VALLEY_QUANTILE))
+
+    candidates = []
+
+    for i in range(1, n - 1):
+        if (
+            ys_smooth[i] <= ys_smooth[i - 1]
+            and ys_smooth[i] < ys_smooth[i + 1]
+            and ys_smooth[i] <= threshold
+        ):
+            candidates.append(i)
+
+    if len(candidates) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        candidates = []
+
+        for i in range(1, n - 1):
+            if ys_smooth[i] <= ys_smooth[i - 1] and ys_smooth[i] < ys_smooth[i + 1]:
+                candidates.append(i)
+
+    if not candidates:
+        return []
+
+    min_distance = max(2, int(round(period * 0.55)))
+    selected = []
+
+    for idx in candidates:
+        if not selected:
+            selected.append(idx)
+            continue
+
+        if idx - selected[-1] >= min_distance:
+            selected.append(idx)
+            continue
+
+        if ys_smooth[idx] < ys_smooth[selected[-1]]:
+            selected[-1] = idx
+
+    if len(selected) < 2:
+        return selected
+
+    cleaned = [selected[0]]
+
+    for idx in selected[1:]:
+        diff = int(ts_grid[idx] - ts_grid[cleaned[-1]])
+
+        if int(period * 0.55) <= diff <= int(period * 1.60):
+            cleaned.append(idx)
+            continue
+
+        if diff < int(period * 0.55):
+            if ys_smooth[idx] < ys_smooth[cleaned[-1]]:
+                cleaned[-1] = idx
+            continue
+
+        cleaned.append(idx)
+
+    return cleaned
+
+
+def detect_period_and_valleys(
+    ts_grid: np.ndarray,
+    ys_grid: np.ndarray,
+) -> Tuple[int, List[int]]:
+    rough = estimate_period_rough(ys_grid)
+    valleys = find_valley_indices(ts_grid, ys_grid, rough)
+
+    if len(valleys) >= 3:
+        diffs = np.diff(ts_grid[valleys])
+        good = diffs[(diffs >= rough * 0.55) & (diffs <= rough * 1.60)]
+
+        if len(good) > 0:
+            period = int(round(float(np.median(good))))
+        else:
+            period = rough
+    else:
+        period = rough
+
+    period = max(int(MIN_PERIOD_SECONDS), min(int(MAX_PERIOD_SECONDS), period))
+
+    return int(period), valleys
+
+
+# =============================================================================
+# 模板构建
+# =============================================================================
+
+def build_templates_from_valleys(
+    ts_grid: np.ndarray,
+    ys_mid_grid: np.ndarray,
+    ys_band_grid: np.ndarray,
+    period: int,
+    valleys: List[int],
+    target: Dict,
+) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
+    if period <= 1 or len(valleys) < MIN_FULL_CYCLES_FOR_TEMPLATE + 1:
+        return None
+
+    strategy = target.get("strategy", "phase_point")
+    low_q = float(target.get("band_low_q", 10))
+    high_q = float(target.get("band_high_q", 90))
+
+    pairs = []
+
+    for a, b in zip(valleys[:-1], valleys[1:]):
+        cycle_len = float(ts_grid[b] - ts_grid[a])
+
+        if period * 0.55 <= cycle_len <= period * 1.60:
+            pairs.append((a, b, cycle_len))
+
+    if len(pairs) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    pairs = pairs[-MAX_CYCLES_FOR_TEMPLATE:]
+
+    phase_grid = np.arange(period, dtype=float)
+    mid_segments = []
+    band_segments = []
+    weights = []
+
+    for idx, (a, b, cycle_len) in enumerate(pairs):
+        seg_ts = ts_grid[a:b + 1]
+        seg_mid_y = ys_mid_grid[a:b + 1]
+        seg_band_y = ys_band_grid[a:b + 1]
+
+        if len(seg_mid_y) < 3 or len(seg_band_y) < 3:
+            continue
+
+        x_old = (seg_ts - seg_ts[0]) / cycle_len * period
+
+        mid_seg = np.interp(phase_grid, x_old, seg_mid_y)
+        band_seg = np.interp(phase_grid, x_old, seg_band_y)
+
+        mid_segments.append(mid_seg.astype(float))
+        band_segments.append(band_seg.astype(float))
+        weights.append(0.5 + 0.5 * ((idx + 1) / len(pairs)))
+
+    if len(mid_segments) < MIN_FULL_CYCLES_FOR_TEMPLATE:
+        return None
+
+    mid_arr = np.vstack(mid_segments)
+    band_arr = np.vstack(band_segments)
+    w_arr = np.array(weights, dtype=float)
+
+    if strategy == "phase_band":
+        mid_template = np.percentile(mid_arr, 50, axis=0)
+
+        # upper/lower 使用原始值分布，而不是平滑值分布。
+        lower_template = np.percentile(band_arr, low_q, axis=0)
+        upper_template = np.percentile(band_arr, high_q, axis=0)
+    else:
+        mid_template = np.average(mid_arr, axis=0, weights=w_arr)
+        lower_template = mid_template.copy()
+        upper_template = mid_template.copy()
+
+    return (
+        mid_template.astype(float),
+        lower_template.astype(float),
+        upper_template.astype(float),
+    )
+
+
+def build_current_baseline(
+    ts_grid: np.ndarray,
+    ys_mid_grid: np.ndarray,
+    ys_band_grid: np.ndarray,
+    target: Dict,
+    tail_seconds: Optional[int] = None,
+) -> Optional[Tuple[int, int, np.ndarray, np.ndarray, np.ndarray]]:
+    if len(ys_mid_grid) < MIN_POINTS or len(ys_band_grid) < MIN_POINTS:
+        return None
+
+    if tail_seconds is not None and tail_seconds > 0:
+        cutoff = ts_grid[-1] - int(tail_seconds)
+        mask = ts_grid >= cutoff
+        ts_use = ts_grid[mask]
+        ys_mid_use = ys_mid_grid[mask]
+        ys_band_use = ys_band_grid[mask]
+    else:
+        ts_use = ts_grid
+        ys_mid_use = ys_mid_grid
+        ys_band_use = ys_band_grid
+
+    if len(ys_mid_use) < MIN_POINTS or len(ys_band_use) < MIN_POINTS:
+        return None
+
+    period, valleys = detect_period_and_valleys(ts_use, ys_mid_use)
+
+    templates = build_templates_from_valleys(
+        ts_grid=ts_use,
+        ys_mid_grid=ys_mid_use,
+        ys_band_grid=ys_band_use,
+        period=period,
+        valleys=valleys,
+        target=target,
+    )
+
+    if templates is None or len(valleys) == 0:
+        return None
+
+    template, lower_template, upper_template = templates
+    phase_origin_ts = int(round(float(ts_use[valleys[-1]])))
+
+    return int(period), phase_origin_ts, template, lower_template, upper_template
+
+
+# =============================================================================
+# 模板预测
+# =============================================================================
+
+def circular_template_value(template: np.ndarray, phase: float) -> float:
+    period = len(template)
+
+    if period == 0:
+        return 0.0
+
+    phase = float(phase) % period
+    i0 = int(math.floor(phase)) % period
+    i1 = (i0 + 1) % period
+    frac = phase - math.floor(phase)
+
+    return float((1.0 - frac) * template[i0] + frac * template[i1])
+
+
+def resample_template(old_template: np.ndarray, new_period: int) -> np.ndarray:
+    old_period = len(old_template)
+
+    if old_period == new_period:
+        return old_template.astype(float)
+
+    if old_period <= 1 or new_period <= 1:
+        return np.full(new_period, float(np.mean(old_template)), dtype=float)
+
+    old_x = np.linspace(0.0, 1.0, old_period, endpoint=False)
+    new_x = np.linspace(0.0, 1.0, new_period, endpoint=False)
+
+    old_x_ext = np.concatenate([old_x - 1.0, old_x, old_x + 1.0])
+    old_y_ext = np.concatenate([old_template, old_template, old_template])
+
+    return np.interp(new_x, old_x_ext, old_y_ext).astype(float)
+
+
+def predict_template_values(
+    template: np.ndarray,
+    period: int,
+    phase_origin_ts: int,
+    ts_list: List[int],
+) -> np.ndarray:
+    if period <= 1:
+        return np.zeros(len(ts_list), dtype=float)
+
+    if len(template) != period:
+        template = resample_template(template, period)
+
+    values = []
+
+    for ts in ts_list:
+        phase = (int(ts) - int(phase_origin_ts)) % period
+        values.append(circular_template_value(template, phase))
+
+    return np.array(values, dtype=float)
+
+
+def predict_state_bundle(
+    state: BaselineState,
+    ts_list: List[int],
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    period = int(state.period)
+    origin = int(state.phase_origin_ts)
+
+    mid = predict_template_values(
+        template=np.array(state.template, dtype=float),
+        period=period,
+        phase_origin_ts=origin,
+        ts_list=ts_list,
+    )
+
+    lower = predict_template_values(
+        template=np.array(state.lower_template, dtype=float),
+        period=period,
+        phase_origin_ts=origin,
+        ts_list=ts_list,
+    )
+
+    upper = predict_template_values(
+        template=np.array(state.upper_template, dtype=float),
+        period=period,
+        phase_origin_ts=origin,
+        ts_list=ts_list,
+    )
+
+    return mid, lower, upper
+
+
+def normalize_origin_near(origin: int, period: int, near_ts: int) -> int:
+    if period <= 1:
+        return origin
+
+    origin = int(origin)
+    period = int(period)
+    near_ts = int(near_ts)
+
+    while origin + period <= near_ts:
+        origin += period
+
+    while origin > near_ts:
+        origin -= period
+
+    return origin
+
+
+def merge_template(
+    old_template: np.ndarray,
+    new_template: np.ndarray,
+    alpha: float,
+) -> np.ndarray:
+    alpha = float(np.clip(alpha, 0.0, 1.0))
+
+    if len(old_template) != len(new_template):
+        old_template = resample_template(old_template, len(new_template))
+
+    merged = (1.0 - alpha) * old_template + alpha * new_template
+
+    return merged.astype(float)
+
+
+# =============================================================================
+# Phase Lock
+# =============================================================================
+
+def phase_lock_recent(
+    state: BaselineState,
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+) -> Tuple[int, int, np.ndarray, float]:
+    base_period = int(state.period)
+    base_origin = int(state.phase_origin_ts)
+    base_template = np.array(state.template, dtype=float)
+
+    if base_period <= 1 or len(base_template) <= 1:
+        ts_recent = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int).tolist()
+        pred = predict_template_values(base_template, base_period, base_origin, ts_recent)
+        actual = ys_model[-len(ts_recent):].astype(float)
+        mae = float(np.mean(np.abs(actual - pred))) if len(actual) else 0.0
+        return base_period, base_origin, pred, mae
+
+    window_seconds = max(
+        PHASE_LOCK_MIN_WINDOW_SECONDS,
+        min(PHASE_LOCK_MAX_WINDOW_SECONDS, int(base_period * 2)),
+    )
+
+    cutoff = ts_grid[-1] - window_seconds
+    mask = ts_grid >= cutoff
+
+    ts_recent_arr = ts_grid[mask].astype(int)
+    actual = ys_model[mask].astype(float)
+
+    if len(ts_recent_arr) < max(10, DETECT_WINDOW_SECONDS):
+        ts_recent_arr = ts_grid[-DETECT_WINDOW_SECONDS:].astype(int)
+        actual = ys_model[-DETECT_WINDOW_SECONDS:].astype(float)
+
+    ts_recent = ts_recent_arr.tolist()
+    last_ts = int(ts_recent[-1])
+
+    p_min = max(
+        int(MIN_PERIOD_SECONDS),
+        int(round(base_period * (1.0 - PHASE_LOCK_PERIOD_SEARCH_RATIO))),
+    )
+    p_max = min(
+        int(MAX_PERIOD_SECONDS),
+        int(round(base_period * (1.0 + PHASE_LOCK_PERIOD_SEARCH_RATIO))),
+    )
+
+    best_period = base_period
+    best_origin = normalize_origin_near(base_origin, base_period, last_ts)
+    best_template = resample_template(base_template, best_period)
+
+    best_pred = predict_template_values(
+        template=best_template,
+        period=best_period,
+        phase_origin_ts=best_origin,
+        ts_list=ts_recent,
+    )
+
+    best_mae = float(np.mean(np.abs(actual - best_pred)))
+
+    for period in range(p_min, p_max + 1, PHASE_LOCK_PERIOD_STEP):
+        template = resample_template(base_template, period)
+        center_origin = normalize_origin_near(base_origin, period, last_ts)
+        origin_shift = max(2, int(round(period * PHASE_LOCK_ORIGIN_SEARCH_RATIO)))
+
+        for shift in range(-origin_shift, origin_shift + 1, PHASE_LOCK_ORIGIN_STEP):
+            origin = center_origin + shift
+
+            pred = predict_template_values(
+                template=template,
+                period=period,
+                phase_origin_ts=origin,
+                ts_list=ts_recent,
+            )
+
+            mae = float(np.mean(np.abs(actual - pred)))
+            penalty = abs(period - base_period) * 0.5
+            score = mae + penalty
+
+            best_score = best_mae + abs(best_period - base_period) * 0.5
+
+            if score < best_score:
+                best_period = period
+                best_origin = origin
+                best_pred = pred
+                best_mae = mae
+
+    best_origin = normalize_origin_near(best_origin, best_period, last_ts)
+
+    return int(best_period), int(best_origin), best_pred, float(best_mae)
+
+
+# =============================================================================
+# 异常检测
+# =============================================================================
+
+def max_consecutive_true(flags: np.ndarray) -> int:
+    max_count = 0
+    current = 0
+
+    for flag in flags:
+        if bool(flag):
+            current += 1
+            max_count = max(max_count, current)
+        else:
+            current = 0
+
+    return int(max_count)
+
+
+def calc_point_bounds(
+    pred: np.ndarray,
+    abs_threshold: float,
+    rel_threshold: float,
+) -> Tuple[np.ndarray, np.ndarray]:
+    threshold = np.maximum(abs_threshold, np.abs(pred) * rel_threshold)
+    return pred - threshold, pred + threshold
+
+
+def calc_final_bounds(
+    state: BaselineState,
+    pred: np.ndarray,
+    lower_raw: np.ndarray,
+    upper_raw: np.ndarray,
+    target: Dict,
+) -> Tuple[np.ndarray, np.ndarray]:
+    strategy = target.get("strategy", "phase_point")
+    abs_threshold = float(target.get("abs_threshold", 1.0))
+    rel_threshold = float(target.get("rel_threshold", 0.25))
+
+    if strategy == "phase_band":
+        pad_abs = float(target.get("band_pad_abs", abs_threshold))
+
+        # 对 vibration 类指标：边界更像正常波动容忍带，不是硬边界。
+        dynamic_pad = np.maximum(
+            pad_abs,
+            np.abs(pred) * rel_threshold * 0.25,
+        )
+
+        lower = lower_raw - dynamic_pad
+        upper = upper_raw + dynamic_pad
+
+        return lower, upper
+
+    return calc_point_bounds(pred, abs_threshold, rel_threshold)
+
+
+def detect_anomaly(
+    state: BaselineState,
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+    ys_actual: np.ndarray,
+    target: Dict,
+) -> Tuple[bool, float, float, float, int, int, int, float]:
+    best_period, best_origin, pred_recent, _ = phase_lock_recent(
+        state=state,
+        ts_grid=ts_grid,
+        ys_model=ys_model,
+    )
+
+    recent_len = len(pred_recent)
+
+    if recent_len <= 0:
+        return False, 0.0, 0.0, 0.0, best_period, best_origin, 0, 0.0
+
+    if target.get("strategy", "phase_point") == "phase_band":
+        actual = ys_actual[-recent_len:].astype(float)
+    else:
+        actual = ys_model[-recent_len:].astype(float)
+
+    tmp_state = BaselineState(
+        period=best_period,
+        phase_origin_ts=best_origin,
+        template=state.template,
+        lower_template=state.lower_template,
+        upper_template=state.upper_template,
+        strategy=state.strategy,
+        status=state.status,
+        clean_seconds=state.clean_seconds,
+        last_update_ts=state.last_update_ts,
+        last_seen_ts=state.last_seen_ts,
+        y_min=state.y_min,
+        y_max=state.y_max,
+    )
+
+    recent_ts = ts_grid[-recent_len:].astype(int).tolist()
+    pred, lower_raw, upper_raw = predict_state_bundle(tmp_state, recent_ts)
+
+    lower, upper = calc_final_bounds(
+        state=tmp_state,
+        pred=pred,
+        lower_raw=lower_raw,
+        upper_raw=upper_raw,
+        target=target,
+    )
+
+    above_upper = actual - upper
+    below_lower = lower - actual
+
+    exceed = np.maximum(above_upper, below_lower)
+    exceed = np.maximum(exceed, 0.0)
+
+    outside = exceed > 0
+
+    band_width = np.maximum(upper - lower, 1e-6)
+    exceed_ratio = exceed / band_width
+
+    abs_err = np.abs(actual - pred)
+
+    outside_ratio = float(np.mean(outside))
+    mean_abs_err = float(np.mean(abs_err))
+    mean_rel_err = float(np.mean(abs_err / np.maximum(np.abs(pred), 1e-6)))
+
+    max_outside_seconds = max_consecutive_true(outside)
+    max_exceed_ratio = float(np.max(exceed_ratio)) if len(exceed_ratio) > 0 else 0.0
+
+    outside_ratio_threshold = float(
+        target.get("outside_ratio_threshold", OUTSIDE_RATIO_THRESHOLD)
+    )
+    min_consecutive_outside = int(
+        target.get("min_consecutive_outside", MIN_CONSECUTIVE_OUTSIDE)
+    )
+    severe_exceed_ratio = float(
+        target.get("severe_exceed_ratio", SEVERE_EXCEED_RATIO)
+    )
+
+    # 核心优化：
+    # 1. 偶发 1~3 个点越界不报警。
+    # 2. 持续越界才报警。
+    # 3. 高比例越界才报警。
+    # 4. 严重越界才立即报警。
+    is_anomaly = (
+        outside_ratio >= outside_ratio_threshold
+        or max_outside_seconds >= min_consecutive_outside
+        or max_exceed_ratio >= severe_exceed_ratio
+    )
+
+    return (
+        is_anomaly,
+        outside_ratio,
+        mean_abs_err,
+        mean_rel_err,
+        int(best_period),
+        int(best_origin),
+        int(max_outside_seconds),
+        float(max_exceed_ratio),
+    )
+
+
+# =============================================================================
+# 状态管理
+# =============================================================================
+
+def create_initial_state(
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+    ys_actual: np.ndarray,
+    target: Dict,
+    now_sec: int,
+) -> Optional[BaselineState]:
+    baseline = build_current_baseline(
+        ts_grid=ts_grid,
+        ys_mid_grid=ys_model,
+        ys_band_grid=ys_actual,
+        target=target,
+    )
+
+    if baseline is None:
+        return None
+
+    period, phase_origin_ts, template, lower_template, upper_template = baseline
+
+    return BaselineState(
+        period=int(period),
+        phase_origin_ts=int(phase_origin_ts),
+        template=template.astype(float).tolist(),
+        lower_template=lower_template.astype(float).tolist(),
+        upper_template=upper_template.astype(float).tolist(),
+        strategy=str(target.get("strategy", "phase_point")),
+        status=BASELINE_STATUS_HEALTHY,
+        clean_seconds=int(period * MAX_CYCLES_FOR_TEMPLATE),
+        last_update_ts=now_sec,
+        last_seen_ts=now_sec,
+        y_min=float(np.min(ys_actual)),
+        y_max=float(np.max(ys_actual)),
+    )
+
+
+def apply_phase_lock_to_state(
+    state: BaselineState,
+    best_period: int,
+    best_origin: int,
+) -> None:
+    best_period = int(best_period)
+
+    if best_period <= 1:
+        return
+
+    if len(state.template) != best_period:
+        state.template = resample_template(
+            np.array(state.template, dtype=float),
+            best_period,
+        ).astype(float).tolist()
+
+    if len(state.lower_template) != best_period:
+        state.lower_template = resample_template(
+            np.array(state.lower_template, dtype=float),
+            best_period,
+        ).astype(float).tolist()
+
+    if len(state.upper_template) != best_period:
+        state.upper_template = resample_template(
+            np.array(state.upper_template, dtype=float),
+            best_period,
+        ).astype(float).tolist()
+
+    state.period = best_period
+    state.phase_origin_ts = int(best_origin)
+
+
+def maybe_update_state(
+    key: str,
+    ts_grid: np.ndarray,
+    ys_model: np.ndarray,
+    ys_actual: np.ndarray,
+    target: Dict,
+) -> Tuple[Optional[BaselineState], bool, float, float, float, int, float]:
+    now_sec = int(time.time())
+    state = BASELINE_STATES.get(key)
+
+    if state is None:
+        state = create_initial_state(
+            ts_grid=ts_grid,
+            ys_model=ys_model,
+            ys_actual=ys_actual,
+            target=target,
+            now_sec=now_sec,
+        )
+
+        if state is None:
+            return None, False, 0.0, 0.0, 0.0, 0, 0.0
+
+        BASELINE_STATES[key] = state
+
+        logger.info(
+            "初始化健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss",
+            key,
+            state.strategy,
+            state.period,
+            datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
+            state.clean_seconds,
+        )
+
+        return state, False, 0.0, 0.0, 0.0, 0, 0.0
+
+    elapsed = max(1, now_sec - int(state.last_seen_ts))
+    elapsed = min(elapsed, POLL_INTERVAL * 2)
+    state.last_seen_ts = now_sec
+
+    (
+        is_anomaly,
+        outside_ratio,
+        mean_abs_err,
+        mean_rel_err,
+        best_period,
+        best_origin,
+        max_outside_seconds,
+        max_exceed_ratio,
+    ) = detect_anomaly(
+        state=state,
+        ts_grid=ts_grid,
+        ys_model=ys_model,
+        ys_actual=ys_actual,
+        target=target,
+    )
+
+    if is_anomaly:
+        state.status = BASELINE_STATUS_ANOMALY
+        state.clean_seconds = 0
+        BASELINE_STATES[key] = state
+
+        logger.warning(
+            "检测到异常，冻结模板 key=%s outside_ratio=%.2f max_outside=%ss max_exceed_ratio=%.2f mean_abs_err=%.4f mean_rel_err=%.4f",
+            key,
+            outside_ratio,
+            max_outside_seconds,
+            max_exceed_ratio,
+            mean_abs_err,
+            mean_rel_err,
+        )
+
+        return (
+            state,
+            True,
+            outside_ratio,
+            mean_abs_err,
+            mean_rel_err,
+            max_outside_seconds,
+            max_exceed_ratio,
+        )
+
+    old_period = int(state.period)
+    old_origin = int(state.phase_origin_ts)
+
+    apply_phase_lock_to_state(state, best_period, best_origin)
+
+    if old_period != state.period or old_origin != state.phase_origin_ts:
+        logger.info(
+            "phase-lock key=%s period %s -> %s origin %s -> %s",
+            key,
+            old_period,
+            state.period,
+            datetime.fromtimestamp(old_origin).strftime("%H:%M:%S"),
+            datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
+        )
+
+    if state.status == BASELINE_STATUS_ANOMALY:
+        state.status = BASELINE_STATUS_RECOVERING
+        state.clean_seconds = elapsed
+        BASELINE_STATES[key] = state
+
+        logger.info(
+            "异常开始恢复 key=%s clean_seconds=%ss",
+            key,
+            state.clean_seconds,
+        )
+
+        return (
+            state,
+            False,
+            outside_ratio,
+            mean_abs_err,
+            mean_rel_err,
+            max_outside_seconds,
+            max_exceed_ratio,
+        )
+
+    if state.status == BASELINE_STATUS_RECOVERING:
+        state.clean_seconds += elapsed
+    else:
+        state.status = BASELINE_STATUS_HEALTHY
+        state.clean_seconds += elapsed
+
+    min_clean_for_update = max(
+        RECOVERY_MIN_SECONDS,
+        int(state.period) * MIN_FULL_CYCLES_FOR_TEMPLATE,
+    )
+
+    if state.clean_seconds < min_clean_for_update:
+        BASELINE_STATES[key] = state
+        return (
+            state,
+            False,
+            outside_ratio,
+            mean_abs_err,
+            mean_rel_err,
+            max_outside_seconds,
+            max_exceed_ratio,
+        )
+
+    tail_seconds = min(
+        int(state.clean_seconds),
+        int(state.period) * MAX_CYCLES_FOR_TEMPLATE,
+    )
+
+    baseline = build_current_baseline(
+        ts_grid=ts_grid,
+        ys_mid_grid=ys_model,
+        ys_band_grid=ys_actual,
+        target=target,
+        tail_seconds=tail_seconds,
+    )
+
+    if baseline is None:
+        BASELINE_STATES[key] = state
+        return (
+            state,
+            False,
+            outside_ratio,
+            mean_abs_err,
+            mean_rel_err,
+            max_outside_seconds,
+            max_exceed_ratio,
+        )
+
+    new_period, new_origin, new_template, new_lower_template, new_upper_template = baseline
+
+    alpha = RECOVERY_EMA_ALPHA if state.status == BASELINE_STATUS_RECOVERING else HEALTHY_EMA_ALPHA
+
+    state.template = merge_template(
+        np.array(state.template, dtype=float),
+        new_template,
+        alpha,
+    ).astype(float).tolist()
+
+    state.lower_template = merge_template(
+        np.array(state.lower_template, dtype=float),
+        new_lower_template,
+        alpha,
+    ).astype(float).tolist()
+
+    state.upper_template = merge_template(
+        np.array(state.upper_template, dtype=float),
+        new_upper_template,
+        alpha,
+    ).astype(float).tolist()
+
+    state.period = int(new_period)
+    state.phase_origin_ts = int(new_origin)
+    state.status = BASELINE_STATUS_HEALTHY
+    state.last_update_ts = now_sec
+
+    if tail_seconds > 0 and len(ys_actual) >= tail_seconds:
+        state.y_min = float(np.min(ys_actual[-tail_seconds:]))
+        state.y_max = float(np.max(ys_actual[-tail_seconds:]))
+    else:
+        state.y_min = float(np.min(ys_actual))
+        state.y_max = float(np.max(ys_actual))
+
+    BASELINE_STATES[key] = state
+
+    logger.info(
+        "更新健康模板 key=%s strategy=%s period=%ss origin=%s clean=%ss alpha=%.2f",
+        key,
+        state.strategy,
+        state.period,
+        datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S"),
+        state.clean_seconds,
+        alpha,
+    )
+
+    return (
+        state,
+        False,
+        outside_ratio,
+        mean_abs_err,
+        mean_rel_err,
+        max_outside_seconds,
+        max_exceed_ratio,
+    )
+
+
+# =============================================================================
+# Prometheus 写入
+# =============================================================================
+
+def prom_escape_label_value(value: str) -> str:
+    return (
+        str(value)
+        .replace("\\", "\\\\")
+        .replace("\n", "\\n")
+        .replace('"', '\\"')
+    )
+
+
+def labels_to_str(labels: Dict[str, str]) -> str:
+    if not labels:
+        return ""
+
+    parts = []
+
+    for k in sorted(labels.keys()):
+        parts.append(f'{k}="{prom_escape_label_value(labels[k])}"')
+
+    return "{" + ",".join(parts) + "}"
+
+
+def write_series(
+    metric_name: str,
+    labels: Dict[str, str],
+    ts_list: List[int],
+    values: List[float],
+) -> bool:
+    if not ts_list or not values or len(ts_list) != len(values):
+        return False
+
+    label_str = labels_to_str(labels)
+    lines = []
+
+    for t, y in zip(ts_list, values):
+        try:
+            ts_sec = int(round(float(t)))
+            val = float(y)
+        except Exception:
+            continue
+
+        if not math.isfinite(ts_sec) or not math.isfinite(val):
+            continue
+
+        lines.append(f"{metric_name}{label_str} {val:.6f} {ts_sec * 1000}")
+
+    if not lines:
+        return False
+
+    payload = "\n".join(lines) + "\n"
+
+    try:
+        resp = requests.post(
+            f"{VM_URL}/api/v1/import/prometheus",
+            data=payload.encode("utf-8"),
+            headers={"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
+            timeout=10,
+        )
+        resp.raise_for_status()
+        return True
+
+    except requests.RequestException as e:
+        logger.error("写入数据失败 metric=%s: %s", metric_name, e)
+        return False
+
+
+def write_prediction_bundle(
+    pred_metric: str,
+    anomaly_metric: str,
+    labels: Dict[str, str],
+    ts_future: List[int],
+    pred_values: np.ndarray,
+    lower_values: np.ndarray,
+    upper_values: np.ndarray,
+    is_anomaly: bool,
+    outside_ratio: float,
+    mean_abs_err: float,
+    mean_rel_err: float,
+    max_outside_seconds: int,
+    max_exceed_ratio: float,
+    event_ts: int,
+) -> bool:
+    ok1 = write_series(
+        metric_name=pred_metric,
+        labels=labels,
+        ts_list=ts_future,
+        values=pred_values.astype(float).tolist(),
+    )
+
+    ok2 = write_series(
+        metric_name=f"{pred_metric}_lower",
+        labels=labels,
+        ts_list=ts_future,
+        values=lower_values.astype(float).tolist(),
+    )
+
+    ok3 = write_series(
+        metric_name=f"{pred_metric}_upper",
+        labels=labels,
+        ts_list=ts_future,
+        values=upper_values.astype(float).tolist(),
+    )
+
+    anomaly_labels = dict(labels)
+    anomaly_labels["type"] = "prediction_deviation"
+
+    ok4 = write_series(
+        metric_name=anomaly_metric,
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[1.0 if is_anomaly else 0.0],
+    )
+
+    ok5 = write_series(
+        metric_name=f"{anomaly_metric}_outside_ratio",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[outside_ratio],
+    )
+
+    ok6 = write_series(
+        metric_name=f"{anomaly_metric}_mean_abs_error",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[mean_abs_err],
+    )
+
+    ok7 = write_series(
+        metric_name=f"{anomaly_metric}_mean_rel_error",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[mean_rel_err],
+    )
+
+    ok8 = write_series(
+        metric_name=f"{anomaly_metric}_max_consecutive_outside",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[float(max_outside_seconds)],
+    )
+
+    ok9 = write_series(
+        metric_name=f"{anomaly_metric}_max_exceed_ratio",
+        labels=anomaly_labels,
+        ts_list=[event_ts],
+        values=[float(max_exceed_ratio)],
+    )
+
+    return ok1 and ok2 and ok3 and ok4 and ok5 and ok6 and ok7 and ok8 and ok9
+
+
+# =============================================================================
+# 标签解析
+# =============================================================================
+
+_LABEL_PATTERN = re.compile(
+    r'\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"((?:\\.|[^"])*)"\s*'
+)
+
+
+def parse_labels_from_query(query: str) -> Dict[str, str]:
+    labels = {}
+
+    if "{" not in query or "}" not in query:
+        return labels
+
+    try:
+        label_part = query[query.index("{") + 1:query.rindex("}")]
+    except Exception:
+        return labels
+
+    for match in _LABEL_PATTERN.finditer(label_part):
+        key = match.group(1)
+        value = match.group(2)
+
+        value = (
+            value
+            .replace('\\"', '"')
+            .replace("\\n", "\n")
+            .replace("\\\\", "\\")
+        )
+
+        labels[key] = value
+
+    return labels
+
+
+def merge_labels(*dicts: Dict[str, str]) -> Dict[str, str]:
+    result = {}
+
+    for d in dicts:
+        if d:
+            result.update(d)
+
+    return result
+
+
+def series_key(metric_name: str, labels: Dict[str, str]) -> str:
+    return metric_name + labels_to_str(labels)
+
+
+# =============================================================================
+# 状态持久化
+# =============================================================================
+
+def load_state() -> None:
+    global BASELINE_STATES
+
+    if not os.path.exists(STATE_FILE):
+        return
+
+    try:
+        with open(STATE_FILE, "r", encoding="utf-8") as f:
+            raw = json.load(f)
+
+        states = {}
+
+        for key, value in raw.get("baseline_states", {}).items():
+            required_fields = {
+                "period",
+                "phase_origin_ts",
+                "template",
+                "lower_template",
+                "upper_template",
+                "strategy",
+                "status",
+                "clean_seconds",
+                "last_update_ts",
+                "last_seen_ts",
+                "y_min",
+                "y_max",
+            }
+
+            if not required_fields.issubset(set(value.keys())):
+                continue
+
+            states[key] = BaselineState(**value)
+
+        BASELINE_STATES = states
+
+        logger.info(
+            "已加载预测状态文件 %s，状态数量=%d",
+            STATE_FILE,
+            len(BASELINE_STATES),
+        )
+
+    except Exception as e:
+        logger.warning("加载预测状态文件失败，将重新学习: %s", e)
+
+
+def save_state() -> None:
+    try:
+        raw = {
+            "baseline_states": {
+                key: asdict(value)
+                for key, value in BASELINE_STATES.items()
+            }
+        }
+
+        tmp_file = STATE_FILE + ".tmp"
+
+        with open(tmp_file, "w", encoding="utf-8") as f:
+            json.dump(raw, f, ensure_ascii=False, indent=2)
+
+        os.replace(tmp_file, STATE_FILE)
+
+    except Exception as e:
+        logger.warning("保存预测状态文件失败: %s", e)
+
+
+# =============================================================================
+# 时间轴
+# =============================================================================
+
+def build_prediction_timestamps(
+    key: str,
+    last_real_ts: int,
+    now_sec: int,
+) -> Optional[List[int]]:
+    data_lag = now_sec - last_real_ts
+
+    if data_lag > MAX_DATA_LAG_SECONDS:
+        logger.warning(
+            "真实数据延迟过大，跳过预测 key=%s data_lag=%ss max=%ss",
+            key,
+            data_lag,
+            MAX_DATA_LAG_SECONDS,
+        )
+        return None
+
+    last_written_real_ts = LAST_REAL_TS_WRITTEN.get(key)
+
+    if last_written_real_ts is not None and last_real_ts <= int(last_written_real_ts):
+        logger.info(
+            "真实数据时间戳未推进，跳过重复写入 key=%s last_real_ts=%s last_written_real_ts=%s",
+            key,
+            last_real_ts,
+            last_written_real_ts,
+        )
+        return None
+
+    base_ts = last_real_ts
+
+    return [
+        base_ts + i + 1
+        for i in range(WRITE_HORIZON_SECONDS)
+    ]
+
+
+# =============================================================================
+# 主流程
+# =============================================================================
+
+def run_once() -> None:
+    now_str = datetime.now().strftime("%H:%M:%S")
+
+    for target in PREDICT_TARGETS:
+        query = target["query"]
+        pred_metric = target["pred_metric"]
+        anomaly_metric = target["anomaly_metric"]
+
+        ts, ys = fetch_history(query)
+
+        if len(ys) < MIN_POINTS:
+            logger.info("[%s] %s 数据不足（%d 点），跳过", now_str, query, len(ys))
+            continue
+
+        ts_grid, ys_grid_raw = normalize_history(ts, ys)
+
+        if len(ys_grid_raw) < MIN_POINTS:
+            logger.info("[%s] %s 清洗后数据不足（%d 点），跳过", now_str, query, len(ys_grid_raw))
+            continue
+
+        ys_grid_model = preprocess_values(ys_grid_raw, target)
+
+        base_labels = parse_labels_from_query(query)
+        write_labels = merge_labels(base_labels, EXTRA_PREDICT_LABELS)
+
+        key = series_key(pred_metric, write_labels)
+
+        (
+            state,
+            is_anomaly,
+            outside_ratio,
+            mean_abs_err,
+            mean_rel_err,
+            max_outside_seconds,
+            max_exceed_ratio,
+        ) = maybe_update_state(
+            key=key,
+            ts_grid=ts_grid,
+            ys_model=ys_grid_model,
+            ys_actual=ys_grid_raw,
+            target=target,
+        )
+
+        if state is None:
+            logger.info("[%s] %s 暂无可用健康模板，等待学习", now_str, query)
+            continue
+
+        now_sec = int(time.time())
+        last_real_ts = int(ts_grid[-1])
+        data_lag = now_sec - last_real_ts
+
+        ts_future = build_prediction_timestamps(
+            key=key,
+            last_real_ts=last_real_ts,
+            now_sec=now_sec,
+        )
+
+        if not ts_future:
+            continue
+
+        pred_values, lower_raw, upper_raw = predict_state_bundle(state, ts_future)
+
+        lower_values, upper_values = calc_final_bounds(
+            state=state,
+            pred=pred_values,
+            lower_raw=lower_raw,
+            upper_raw=upper_raw,
+            target=target,
+        )
+
+        ok = write_prediction_bundle(
+            pred_metric=pred_metric,
+            anomaly_metric=anomaly_metric,
+            labels=write_labels,
+            ts_future=ts_future,
+            pred_values=pred_values,
+            lower_values=lower_values,
+            upper_values=upper_values,
+            is_anomaly=is_anomaly,
+            outside_ratio=outside_ratio,
+            mean_abs_err=mean_abs_err,
+            mean_rel_err=mean_rel_err,
+            max_outside_seconds=max_outside_seconds,
+            max_exceed_ratio=max_exceed_ratio,
+            event_ts=last_real_ts,
+        )
+
+        if not ok:
+            logger.error("[%s] %s 写入预测数据失败", now_str, query)
+            continue
+
+        LAST_REAL_TS_WRITTEN[key] = last_real_ts
+
+        future_start = datetime.fromtimestamp(ts_future[0]).strftime("%H:%M:%S")
+        future_end = datetime.fromtimestamp(ts_future[-1]).strftime("%H:%M:%S")
+        last_real_str = datetime.fromtimestamp(last_real_ts).strftime("%H:%M:%S")
+        origin_str = datetime.fromtimestamp(state.phase_origin_ts).strftime("%H:%M:%S")
+
+        logger.info(
+            "[%s] %-40s → %-35s strategy=%s status=%s anomaly=%s outside=%.2f max_outside=%ss max_exceed=%.2f period=%ss origin=%s last_real=%s lag=%ss 写入 %d 点，预测区间 %s ~ %s",
+            now_str,
+            query,
+            pred_metric,
+            state.strategy,
+            state.status,
+            is_anomaly,
+            outside_ratio,
+            max_outside_seconds,
+            max_exceed_ratio,
+            state.period,
+            origin_str,
+            last_real_str,
+            data_lag,
+            len(ts_future),
+            future_start,
+            future_end,
+        )
+
+    save_state()
+
+
+def main() -> None:
+    load_state()
+
+    logger.info(
+        "预测服务启动 VM=%s 历史窗口=%dmin 理论预测窗口=%ds 实际写入窗口=%ds 轮询间隔=%ds state=%s forecast=%s",
+        VM_URL,
+        HISTORY_MINUTES,
+        HORIZON_SECONDS,
+        WRITE_HORIZON_SECONDS,
+        POLL_INTERVAL,
+        STATE_FILE,
+        EXTRA_PREDICT_LABELS["forecast"],
+    )
+
+    while True:
+        run_once()
+        time.sleep(POLL_INTERVAL)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From f9b6506452a75ffd1f1f8beee1ebd1008fdc724d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Mon, 25 May 2026 13:46:29 +0800
Subject: [PATCH 30/31] feat(protoforge): fault update

---
 protoforge/core/demo.py    |   2 +-
 protoforge/core/fault.py   | 187 +++++++++++++++++++++++++++++++++++++
 protoforge/models/fault.py |   2 +
 web/src/views/Devices.vue  | 117 ++++++++++++++++++++---
 4 files changed, 296 insertions(+), 12 deletions(-)

diff --git a/protoforge/core/demo.py b/protoforge/core/demo.py
index ff0b333..b7ccae7 100644
--- a/protoforge/core/demo.py
+++ b/protoforge/core/demo.py
@@ -108,7 +108,7 @@ async def seed_demo_data(engine: Any, template_manager: Any) -> None:
             "points": [
                 {"name": "weight", "address": "net_weight", "data_type": "float32", "generator_type": "random", "min_value": 0.5, "max_value": 50.0},
                 {"name": "tare", "address": "tare_weight", "data_type": "float32", "generator_type": "fixed", "fixed_value": 2.5},
-                {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": true},
+                {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": True},
             ],
         },
     ]
diff --git a/protoforge/core/fault.py b/protoforge/core/fault.py
index e72842d..5beba87 100644
--- a/protoforge/core/fault.py
+++ b/protoforge/core/fault.py
@@ -41,6 +41,7 @@
         name="刀具磨损",
         description="刀具切削刃磨损，切削阻力增大，主轴电流升高，振动增大，进给速率下降",
         category="mechanical",
+        scenario_type="trend_drift",
         default_duration=300.0,
         tags=["刀具", "磨损", "渐进"],
         point_faults=[
@@ -67,6 +68,7 @@
         name="刀具崩刃",
         description="刀具突发性崩刃，振动剧烈突增，主轴电流峰值，进给停止",
         category="mechanical",
+        scenario_type="sudden_spike",
         default_duration=15.0,
         tags=["刀具", "崩刃", "突发"],
         point_faults=[
@@ -93,6 +95,7 @@
         name="主轴过热",
         description="主轴长时间高负荷运转或冷却不足，电流持续偏高，转速因热保护下降",
         category="thermal",
+        scenario_type="trend_drift",
         default_duration=240.0,
         tags=["主轴", "过热", "渐进"],
         point_faults=[
@@ -117,6 +120,7 @@
         name="主轴轴承故障",
         description="主轴轴承磨损或润滑不足，振动幅度持续升高，伴随电流轻微上升",
         category="mechanical",
+        scenario_type="trend_drift",
         default_duration=360.0,
         tags=["主轴", "轴承", "渐进"],
         point_faults=[
@@ -141,6 +145,7 @@
         name="进给堵转",
         description="进给轴卡死，进给速率降为零，主轴电流急剧升高",
         category="process",
+        scenario_type="sudden_spike",
         default_duration=20.0,
         tags=["进给", "堵转", "突发"],
         point_faults=[
@@ -163,6 +168,7 @@
         name="振动异常",
         description="工件装夹松动或切削共振，三轴振动突然大幅增加",
         category="mechanical",
+        scenario_type="sudden_spike",
         default_duration=60.0,
         tags=["振动", "装夹", "突发"],
         point_faults=[
@@ -185,6 +191,7 @@
         name="切削液不足",
         description="切削液供给不足，冷却润滑失效，热量积累导致振动和电流缓慢升高",
         category="process",
+        scenario_type="trend_drift",
         default_duration=480.0,
         tags=["切削液", "冷却", "渐进"],
         point_faults=[
@@ -211,6 +218,7 @@
         name="电源波动",
         description="供电电压不稳定，主轴转速和进给速率出现随机波动",
         category="electrical",
+        scenario_type="high_noise",
         default_duration=90.0,
         tags=["电源", "波动", "突发"],
         point_faults=[
@@ -222,6 +230,176 @@
                              multiplier=1.0, noise_scale=150.0),
         ],
     ),
+
+    # ==================================================================
+    # 以下为新增故障类型
+    # ==================================================================
+
+    # ------------------------------------------------------------------
+    # 传感器强干扰 — 高噪声波动型
+    # 场景：电磁干扰、接地不良、信号线屏蔽失效等导致传感器读数剧烈抖动
+    # 特征：均值基本不变，但噪声幅度突然增大数倍，信号看起来"毛刺"严重
+    # 区别于真实故障：设备本身没有坏，只是采集信号质量变差
+    # 模式：瞬间注入，持续期间每次采样都叠加大幅随机噪声
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="sensor_noise",
+        name="传感器强干扰",
+        description=(
+            "【高噪声波动型】电磁干扰或接地不良导致传感器信号质量恶化。"
+            "均值基本不变，但每次采样叠加大幅随机噪声，曲线呈现密集毛刺。"
+            "典型场景：变频器附近的传感器、信号线屏蔽层破损、接地回路故障。"
+        ),
+        category="electrical",
+        scenario_type="high_noise",
+        default_duration=120.0,
+        tags=["传感器", "干扰", "噪声", "高噪声波动型"],
+        point_faults=[
+            PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=8.0),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=2.5),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=2.5),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=3.0),
+            PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=80.0),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 换工件/换程序段 — 工况切换型（高速加工 → 低速精加工）
+    # 场景：CNC 机床切换加工程序，从粗加工切换到精加工
+    # 特征：转速降低、进给降低、电流降低，所有指标跳到新的正常范围并稳定
+    # 关键：这不是故障！数据本身没有坏，只是工况变了，正常范围完全不同
+    # 模式：STEP 阶跃，立即跳到新基线并在整个 duration 内保持
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="mode_switch_fine_machining",
+        name="切换精加工工况",
+        description=(
+            "【工况切换型】从粗加工切换到精加工程序段。"
+            "主轴转速升高、进给速率降低、切削电流降低，各指标立即跳到新的正常范围并保持稳定。"
+            "数据本身没有异常，但与粗加工基线相比会触发阈值告警。"
+            "典型场景：换刀后进入精加工、加工不同特征面、程序跳段。"
+        ),
+        category="process",
+        scenario_type="mode_switch",
+        default_duration=300.0,
+        tags=["工况切换", "精加工", "程序段", "工况切换型"],
+        point_faults=[
+            PointFaultConfig(point="spindle_speed", mode=FaultMode.STEP,
+                             multiplier=1.4, noise_scale=30.0),
+            PointFaultConfig(point="feed_rate", mode=FaultMode.STEP,
+                             multiplier=0.3, noise_scale=10.0),
+            PointFaultConfig(point="spindle_current", mode=FaultMode.STEP,
+                             multiplier=0.55, noise_scale=0.5),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.STEP,
+                             multiplier=0.6, noise_scale=0.1),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.STEP,
+                             multiplier=0.6, noise_scale=0.1),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.STEP,
+                             multiplier=0.6, noise_scale=0.1),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 进入空载工况 — 工况切换型（加工中 → 空载运行）
+    # 场景：加工完成、等待上料、程序暂停，主轴空转
+    # 特征：进给降为 0，电流大幅下降到空载水平，转速维持，振动降低
+    # 模式：STEP 阶跃，立即切换到空载基线
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="mode_switch_idle",
+        name="切换空载工况",
+        description=(
+            "【工况切换型】机床进入空载运行状态（加工完成等待上料、程序暂停）。"
+            "进给速率降为零，主轴电流降至空载水平（约为加工时的 20-30%），"
+            "主轴转速维持，振动明显降低。"
+            "典型场景：换料等待、程序暂停、加工间隙、换刀等待。"
+        ),
+        category="process",
+        scenario_type="mode_switch",
+        default_duration=180.0,
+        tags=["工况切换", "空载", "等待", "工况切换型"],
+        point_faults=[
+            PointFaultConfig(point="feed_rate", mode=FaultMode.STEP,
+                             target_value=0.0, noise_scale=2.0),
+            PointFaultConfig(point="spindle_current", mode=FaultMode.STEP,
+                             multiplier=0.22, noise_scale=0.3),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.STEP,
+                             multiplier=0.25, noise_scale=0.05),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.STEP,
+                             multiplier=0.25, noise_scale=0.05),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.STEP,
+                             multiplier=0.25, noise_scale=0.05),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 突发电流尖峰 — 突发脉冲型
+    # 场景：切削过程中遇到硬质夹杂物、刀具切入角突变、工件材质不均
+    # 特征：主轴电流瞬间冲高（持续 2-5 秒），然后恢复正常，其他指标基本不变
+    # 区别于刀具崩刃：电流尖峰后能自动恢复，不会导致停机
+    # 模式：瞬间注入，持续时间极短
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="current_spike",
+        name="突发电流尖峰",
+        description=(
+            "【突发脉冲型】切削过程中遇到硬质夹杂物或材质不均，主轴电流瞬间冲高后自动恢复。"
+            "电流短暂升至正常值的 3-4 倍，持续仅数秒，振动轻微抖动，进给基本不受影响。"
+            "典型场景：铸件内部硬质点、焊缝区域、材料硬度不均匀。"
+            "与刀具崩刃的区别：能自动恢复，不触发停机报警。"
+        ),
+        category="mechanical",
+        scenario_type="sudden_spike",
+        default_duration=5.0,
+        tags=["电流", "尖峰", "脉冲", "突发脉冲型"],
+        point_faults=[
+            PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT,
+                             multiplier=3.5, noise_scale=1.5),
+            PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT,
+                             multiplier=2.0, noise_scale=0.5),
+            PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT,
+                             multiplier=2.0, noise_scale=0.5),
+            PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT,
+                             multiplier=2.5, noise_scale=0.8),
+        ],
+    ),
+
+    # ------------------------------------------------------------------
+    # 主轴负载异常 — 关系约束型
+    # 场景：刀具钝化但未完全磨损、切削参数不匹配、工件材料变硬
+    # 特征：主轴转速正常、进给速率正常，但主轴电流异常升高
+    # 关键：单看任何一个指标都"正常"，只有多指标关系才能发现异常
+    # 模式：渐进式，电流缓慢爬升，转速和进给保持不变
+    # ------------------------------------------------------------------
+    FaultTypeDefinition(
+        id="spindle_load_anomaly",
+        name="主轴负载异常",
+        description=(
+            "【关系约束型】主轴转速正常、进给速率正常，但主轴电流异常升高。"
+            "单看任何一个指标都在正常范围内，只有分析多指标关系才能发现异常。"
+            "物理含义：切削阻力增大（刀具钝化初期、材料变硬），"
+            "系统尚未触发保护降速，但电流已超出正常切削功率范围。"
+            "典型场景：刀具轻度钝化、切削液浓度不足、工件材料批次差异。"
+        ),
+        category="mechanical",
+        scenario_type="relation_constraint",
+        default_duration=240.0,
+        tags=["主轴", "负载", "关系约束", "关系约束型"],
+        point_faults=[
+            PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL,
+                             multiplier=2.8, noise_scale=1.0),
+            # 转速和进给保持不变（multiplier=1.0），只叠加极小噪声维持真实感
+            PointFaultConfig(point="spindle_speed", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=15.0),
+            PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT,
+                             multiplier=1.0, noise_scale=5.0),
+        ],
+    ),
 ]
 
 # 按 id 索引
@@ -374,6 +552,15 @@ def _compute_value(
                 target = baseline * (1.0 + (pf.multiplier - 1.0) * intensity)
             else:
                 target = baseline
+        elif pf.mode == FaultMode.STEP:
+            # 阶跃模式：立即跳到新基线并在整个 duration 内保持（工况切换专用）
+            # 与 INSTANT 的区别：STEP 的 multiplier 表示新工况的正常倍数，不受 intensity 缩放
+            if pf.target_value is not None:
+                target = pf.target_value
+            elif pf.multiplier is not None:
+                target = baseline * pf.multiplier
+            else:
+                target = baseline
         else:
             # 渐进模式：随 progress 线性劣化
             if pf.target_value is not None:
diff --git a/protoforge/models/fault.py b/protoforge/models/fault.py
index cc038e0..025da96 100644
--- a/protoforge/models/fault.py
+++ b/protoforge/models/fault.py
@@ -8,6 +8,7 @@ class FaultMode(str, Enum):
     """故障注入模式"""
     INSTANT = "instant"       # 瞬间跳变到异常值，持续 duration 后恢复
     GRADUAL = "gradual"       # 渐进式劣化，随时间线性恶化，到 duration 时达到峰值后恢复
+    STEP = "step"             # 阶跃切换到新工况基线，整个 duration 内保持新基线（工况切换专用）
 
 
 class FaultStatus(str, Enum):
@@ -36,6 +37,7 @@ class FaultTypeDefinition(BaseModel):
     name: str
     description: str
     category: str                          # 故障分类：mechanical / electrical / thermal / process
+    scenario_type: str = "trend_drift"     # 异常场景类型：trend_drift / sudden_spike / high_noise / mode_switch / relation_constraint
     default_duration: float = 120.0        # 默认持续时间（秒）
     point_faults: list[PointFaultConfig] = Field(default_factory=list)
     tags: list[str] = Field(default_factory=list)
diff --git a/web/src/views/Devices.vue b/web/src/views/Devices.vue
index 65e0535..36141a4 100644
--- a/web/src/views/Devices.vue
+++ b/web/src/views/Devices.vue
@@ -92,24 +92,55 @@
       </n-modal>
 
       <!-- 故障注入 Modal -->
-      <n-modal v-model:show="showFaultModal" preset="card" title="故障注入" style="width:480px">
+      <n-modal v-model:show="showFaultModal" preset="card" title="故障注入" style="width:560px">
         <n-space vertical size="medium">
-          <n-text depth="3" style="font-size:13px">设备：{{ faultTargetDevice?.name }}</n-text>
+          <n-text depth="3" style="font-size:13px">目标设备：<n-text strong>{{ faultTargetDevice?.name }}</n-text></n-text>
+
           <n-form-item label="故障类型" label-placement="left" label-width="80">
             <n-select
               v-model:value="faultTypeId"
-              :options="faultTypeOptions"
+              :options="faultTypeGroupedOptions"
               placeholder="选择故障类型"
               @update:value="onFaultTypeChange"
             />
           </n-form-item>
-          <n-alert v-if="selectedFaultType" type="warning" :bordered="false" style="font-size:12px">
-            <div style="font-weight:500;margin-bottom:4px">{{ selectedFaultType.name }} · {{ faultCategoryLabel(selectedFaultType.category) }}</div>
-            <div style="color:#94a3b8">{{ selectedFaultType.description }}</div>
-            <div style="margin-top:6px;color:#94a3b8">
-              影响测点：{{ selectedFaultType.point_faults.map(p => p.point).join('、') }}
+
+          <!-- 场景说明卡片 -->
+          <div v-if="selectedFaultType" style="background:#1a1a2e;border:1px solid #2d2d4e;border-radius:8px;padding:14px 16px;">
+            <!-- 标题行：故障名 + 场景类型标签 + 分类标签 -->
+            <n-space align="center" style="margin-bottom:10px;flex-wrap:wrap;gap:6px">
+              <n-text strong style="font-size:14px">{{ selectedFaultType.name }}</n-text>
+              <n-tag :type="scenarioTagType(selectedFaultType.scenario_type)" size="small" round>
+                {{ scenarioTypeLabel(selectedFaultType.scenario_type) }}
+              </n-tag>
+              <n-tag size="small" :bordered="false" style="background:#2d2d4e;color:#94a3b8">
+                {{ faultCategoryLabel(selectedFaultType.category) }}
+              </n-tag>
+            </n-space>
+
+            <!-- 描述文本 -->
+            <n-text depth="3" style="font-size:12px;line-height:1.7;display:block;white-space:pre-wrap">{{ selectedFaultType.description }}</n-text>
+
+            <!-- 影响测点 -->
+            <div style="margin-top:10px;padding-top:10px;border-top:1px solid #2d2d4e">
+              <n-text depth="3" style="font-size:11px">影响测点：</n-text>
+              <n-space size="small" style="margin-top:4px;flex-wrap:wrap">
+                <n-tag
+                  v-for="pf in selectedFaultType.point_faults"
+                  :key="pf.point"
+                  size="tiny"
+                  :bordered="false"
+                  style="background:#2d2d4e;color:#e2e8f0;font-family:monospace"
+                >
+                  {{ pf.point }}
+                  <span style="color:#94a3b8;margin-left:4px">
+                    {{ pointFaultModeLabel(pf) }}
+                  </span>
+                </n-tag>
+              </n-space>
             </div>
-          </n-alert>
+          </div>
+
           <n-form-item label="持续时间" label-placement="left" label-width="80">
             <n-input-number
               v-model:value="faultDuration"
@@ -120,11 +151,15 @@
               <template #suffix>秒</template>
             </n-input-number>
           </n-form-item>
+
           <n-form-item label="故障强度" label-placement="left" label-width="80">
             <n-space vertical style="width:100%">
               <n-slider v-model:value="faultIntensity" :min="0.1" :max="1.0" :step="0.1" />
               <n-text depth="3" style="font-size:12px">
                 {{ faultIntensityLabel }}（{{ faultIntensity }}）
+                <span v-if="selectedFaultType?.scenario_type === 'mode_switch'" style="color:#f59e0b">
+                  · 工况切换型强度不影响切换幅度
+                </span>
               </n-text>
             </n-space>
           </n-form-item>
@@ -243,12 +278,17 @@ const columns = [
   },
   { title: '测点', key: 'points', width: 70, render: (row) => (row.points || []).length },
   {
-    title: '故障', key: 'fault', width: 90,
+    title: '故障', key: 'fault', width: 130,
     render: (row) => {
       const fault = activeFaults.value[row.id]
       if (!fault || fault.status === 'none') return h(NTag, { size: 'tiny', bordered: false }, () => '正常')
       const pct = Math.round((fault.progress || 0) * 100)
-      return h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`)
+      const ft = faultTypes.value.find(t => t.id === fault.fault_type_id)
+      const scenarioLabel = ft ? scenarioTypeLabel(ft.scenario_type) : ''
+      return h(NSpace, { size: 2, vertical: false, align: 'center' }, () => [
+        h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`),
+        scenarioLabel ? h(NTag, { size: 'tiny', bordered: false, style: 'font-size:10px;background:#2d1b1b;color:#f87171' }, () => scenarioLabel) : null,
+      ])
     }
   },
   {
@@ -351,6 +391,25 @@ const faultTypeOptions = computed(() =>
   faultTypes.value.map(t => ({ label: `${t.name}（${faultCategoryLabel(t.category)}）`, value: t.id }))
 )
 
+// 按场景类型分组的故障选项
+const SCENARIO_ORDER = ['trend_drift', 'sudden_spike', 'high_noise', 'mode_switch', 'relation_constraint']
+const faultTypeGroupedOptions = computed(() => {
+  const groups = {}
+  for (const t of faultTypes.value) {
+    const st = t.scenario_type || 'trend_drift'
+    if (!groups[st]) groups[st] = []
+    groups[st].push({ label: t.name, value: t.id })
+  }
+  return SCENARIO_ORDER
+    .filter(st => groups[st])
+    .map(st => ({
+      type: 'group',
+      label: scenarioTypeLabel(st),
+      key: st,
+      children: groups[st],
+    }))
+})
+
 const selectedFaultType = computed(() =>
   faultTypes.value.find(t => t.id === faultTypeId.value) || null
 )
@@ -368,6 +427,42 @@ function faultCategoryLabel(category) {
   return map[category] || category
 }
 
+function scenarioTypeLabel(scenarioType) {
+  const map = {
+    trend_drift: '趋势漂移型',
+    sudden_spike: '突发脉冲型',
+    high_noise: '高噪声波动型',
+    mode_switch: '工况切换型',
+    relation_constraint: '关系约束型',
+  }
+  return map[scenarioType] || scenarioType
+}
+
+function scenarioTagType(scenarioType) {
+  const map = {
+    trend_drift: 'warning',
+    sudden_spike: 'error',
+    high_noise: 'info',
+    mode_switch: 'success',
+    relation_constraint: 'default',
+  }
+  return map[scenarioType] || 'default'
+}
+
+function pointFaultModeLabel(pf) {
+  if (pf.mode === 'step') return '→ 阶跃'
+  if (pf.mode === 'gradual') {
+    if (pf.multiplier != null) return `→ ×${pf.multiplier}`
+    if (pf.target_value != null) return `→ ${pf.target_value}`
+  }
+  if (pf.mode === 'instant') {
+    if (pf.target_value != null) return `→ ${pf.target_value}`
+    if (pf.multiplier != null && pf.multiplier !== 1.0) return `→ ×${pf.multiplier}`
+    return '± 噪声'
+  }
+  return ''
+}
+
 function onFaultTypeChange(val) {
   const t = faultTypes.value.find(f => f.id === val)
   if (t && t.default_duration) faultDuration.value = t.default_duration

From 685ae6b79c28cf8094e20fc6f9f7c00a32d0fe4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=B0=91=E6=99=BA?=
 <zhangshaozhi@bailongma-inc.com>
Date: Mon, 25 May 2026 14:00:07 +0800
Subject: [PATCH 31/31] Revert "feat(protoforge): fault update"

This reverts commit f9b6506452a75ffd1f1f8beee1ebd1008fdc724d.
---
 protoforge/core/demo.py    |   2 +-
 protoforge/core/fault.py   | 187 -------------------------------------
 protoforge/models/fault.py |   2 -
 web/src/views/Devices.vue  | 117 +++--------------------
 4 files changed, 12 insertions(+), 296 deletions(-)

diff --git a/protoforge/core/demo.py b/protoforge/core/demo.py
index b7ccae7..ff0b333 100644
--- a/protoforge/core/demo.py
+++ b/protoforge/core/demo.py
@@ -108,7 +108,7 @@ async def seed_demo_data(engine: Any, template_manager: Any) -> None:
             "points": [
                 {"name": "weight", "address": "net_weight", "data_type": "float32", "generator_type": "random", "min_value": 0.5, "max_value": 50.0},
                 {"name": "tare", "address": "tare_weight", "data_type": "float32", "generator_type": "fixed", "fixed_value": 2.5},
-                {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": True},
+                {"name": "stable", "address": "stable_flag", "data_type": "bool", "generator_type": "fixed", "fixed_value": true},
             ],
         },
     ]
diff --git a/protoforge/core/fault.py b/protoforge/core/fault.py
index 5beba87..e72842d 100644
--- a/protoforge/core/fault.py
+++ b/protoforge/core/fault.py
@@ -41,7 +41,6 @@
         name="刀具磨损",
         description="刀具切削刃磨损，切削阻力增大，主轴电流升高，振动增大，进给速率下降",
         category="mechanical",
-        scenario_type="trend_drift",
         default_duration=300.0,
         tags=["刀具", "磨损", "渐进"],
         point_faults=[
@@ -68,7 +67,6 @@
         name="刀具崩刃",
         description="刀具突发性崩刃，振动剧烈突增，主轴电流峰值，进给停止",
         category="mechanical",
-        scenario_type="sudden_spike",
         default_duration=15.0,
         tags=["刀具", "崩刃", "突发"],
         point_faults=[
@@ -95,7 +93,6 @@
         name="主轴过热",
         description="主轴长时间高负荷运转或冷却不足，电流持续偏高，转速因热保护下降",
         category="thermal",
-        scenario_type="trend_drift",
         default_duration=240.0,
         tags=["主轴", "过热", "渐进"],
         point_faults=[
@@ -120,7 +117,6 @@
         name="主轴轴承故障",
         description="主轴轴承磨损或润滑不足，振动幅度持续升高，伴随电流轻微上升",
         category="mechanical",
-        scenario_type="trend_drift",
         default_duration=360.0,
         tags=["主轴", "轴承", "渐进"],
         point_faults=[
@@ -145,7 +141,6 @@
         name="进给堵转",
         description="进给轴卡死，进给速率降为零，主轴电流急剧升高",
         category="process",
-        scenario_type="sudden_spike",
         default_duration=20.0,
         tags=["进给", "堵转", "突发"],
         point_faults=[
@@ -168,7 +163,6 @@
         name="振动异常",
         description="工件装夹松动或切削共振，三轴振动突然大幅增加",
         category="mechanical",
-        scenario_type="sudden_spike",
         default_duration=60.0,
         tags=["振动", "装夹", "突发"],
         point_faults=[
@@ -191,7 +185,6 @@
         name="切削液不足",
         description="切削液供给不足，冷却润滑失效，热量积累导致振动和电流缓慢升高",
         category="process",
-        scenario_type="trend_drift",
         default_duration=480.0,
         tags=["切削液", "冷却", "渐进"],
         point_faults=[
@@ -218,7 +211,6 @@
         name="电源波动",
         description="供电电压不稳定，主轴转速和进给速率出现随机波动",
         category="electrical",
-        scenario_type="high_noise",
         default_duration=90.0,
         tags=["电源", "波动", "突发"],
         point_faults=[
@@ -230,176 +222,6 @@
                              multiplier=1.0, noise_scale=150.0),
         ],
     ),
-
-    # ==================================================================
-    # 以下为新增故障类型
-    # ==================================================================
-
-    # ------------------------------------------------------------------
-    # 传感器强干扰 — 高噪声波动型
-    # 场景：电磁干扰、接地不良、信号线屏蔽失效等导致传感器读数剧烈抖动
-    # 特征：均值基本不变，但噪声幅度突然增大数倍，信号看起来"毛刺"严重
-    # 区别于真实故障：设备本身没有坏，只是采集信号质量变差
-    # 模式：瞬间注入，持续期间每次采样都叠加大幅随机噪声
-    # ------------------------------------------------------------------
-    FaultTypeDefinition(
-        id="sensor_noise",
-        name="传感器强干扰",
-        description=(
-            "【高噪声波动型】电磁干扰或接地不良导致传感器信号质量恶化。"
-            "均值基本不变，但每次采样叠加大幅随机噪声，曲线呈现密集毛刺。"
-            "典型场景：变频器附近的传感器、信号线屏蔽层破损、接地回路故障。"
-        ),
-        category="electrical",
-        scenario_type="high_noise",
-        default_duration=120.0,
-        tags=["传感器", "干扰", "噪声", "高噪声波动型"],
-        point_faults=[
-            PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT,
-                             multiplier=1.0, noise_scale=8.0),
-            PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT,
-                             multiplier=1.0, noise_scale=2.5),
-            PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT,
-                             multiplier=1.0, noise_scale=2.5),
-            PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT,
-                             multiplier=1.0, noise_scale=3.0),
-            PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT,
-                             multiplier=1.0, noise_scale=80.0),
-        ],
-    ),
-
-    # ------------------------------------------------------------------
-    # 换工件/换程序段 — 工况切换型（高速加工 → 低速精加工）
-    # 场景：CNC 机床切换加工程序，从粗加工切换到精加工
-    # 特征：转速降低、进给降低、电流降低，所有指标跳到新的正常范围并稳定
-    # 关键：这不是故障！数据本身没有坏，只是工况变了，正常范围完全不同
-    # 模式：STEP 阶跃，立即跳到新基线并在整个 duration 内保持
-    # ------------------------------------------------------------------
-    FaultTypeDefinition(
-        id="mode_switch_fine_machining",
-        name="切换精加工工况",
-        description=(
-            "【工况切换型】从粗加工切换到精加工程序段。"
-            "主轴转速升高、进给速率降低、切削电流降低，各指标立即跳到新的正常范围并保持稳定。"
-            "数据本身没有异常，但与粗加工基线相比会触发阈值告警。"
-            "典型场景：换刀后进入精加工、加工不同特征面、程序跳段。"
-        ),
-        category="process",
-        scenario_type="mode_switch",
-        default_duration=300.0,
-        tags=["工况切换", "精加工", "程序段", "工况切换型"],
-        point_faults=[
-            PointFaultConfig(point="spindle_speed", mode=FaultMode.STEP,
-                             multiplier=1.4, noise_scale=30.0),
-            PointFaultConfig(point="feed_rate", mode=FaultMode.STEP,
-                             multiplier=0.3, noise_scale=10.0),
-            PointFaultConfig(point="spindle_current", mode=FaultMode.STEP,
-                             multiplier=0.55, noise_scale=0.5),
-            PointFaultConfig(point="vibration_x", mode=FaultMode.STEP,
-                             multiplier=0.6, noise_scale=0.1),
-            PointFaultConfig(point="vibration_y", mode=FaultMode.STEP,
-                             multiplier=0.6, noise_scale=0.1),
-            PointFaultConfig(point="vibration_z", mode=FaultMode.STEP,
-                             multiplier=0.6, noise_scale=0.1),
-        ],
-    ),
-
-    # ------------------------------------------------------------------
-    # 进入空载工况 — 工况切换型（加工中 → 空载运行）
-    # 场景：加工完成、等待上料、程序暂停，主轴空转
-    # 特征：进给降为 0，电流大幅下降到空载水平，转速维持，振动降低
-    # 模式：STEP 阶跃，立即切换到空载基线
-    # ------------------------------------------------------------------
-    FaultTypeDefinition(
-        id="mode_switch_idle",
-        name="切换空载工况",
-        description=(
-            "【工况切换型】机床进入空载运行状态（加工完成等待上料、程序暂停）。"
-            "进给速率降为零，主轴电流降至空载水平（约为加工时的 20-30%），"
-            "主轴转速维持，振动明显降低。"
-            "典型场景：换料等待、程序暂停、加工间隙、换刀等待。"
-        ),
-        category="process",
-        scenario_type="mode_switch",
-        default_duration=180.0,
-        tags=["工况切换", "空载", "等待", "工况切换型"],
-        point_faults=[
-            PointFaultConfig(point="feed_rate", mode=FaultMode.STEP,
-                             target_value=0.0, noise_scale=2.0),
-            PointFaultConfig(point="spindle_current", mode=FaultMode.STEP,
-                             multiplier=0.22, noise_scale=0.3),
-            PointFaultConfig(point="vibration_x", mode=FaultMode.STEP,
-                             multiplier=0.25, noise_scale=0.05),
-            PointFaultConfig(point="vibration_y", mode=FaultMode.STEP,
-                             multiplier=0.25, noise_scale=0.05),
-            PointFaultConfig(point="vibration_z", mode=FaultMode.STEP,
-                             multiplier=0.25, noise_scale=0.05),
-        ],
-    ),
-
-    # ------------------------------------------------------------------
-    # 突发电流尖峰 — 突发脉冲型
-    # 场景：切削过程中遇到硬质夹杂物、刀具切入角突变、工件材质不均
-    # 特征：主轴电流瞬间冲高（持续 2-5 秒），然后恢复正常，其他指标基本不变
-    # 区别于刀具崩刃：电流尖峰后能自动恢复，不会导致停机
-    # 模式：瞬间注入，持续时间极短
-    # ------------------------------------------------------------------
-    FaultTypeDefinition(
-        id="current_spike",
-        name="突发电流尖峰",
-        description=(
-            "【突发脉冲型】切削过程中遇到硬质夹杂物或材质不均，主轴电流瞬间冲高后自动恢复。"
-            "电流短暂升至正常值的 3-4 倍，持续仅数秒，振动轻微抖动，进给基本不受影响。"
-            "典型场景：铸件内部硬质点、焊缝区域、材料硬度不均匀。"
-            "与刀具崩刃的区别：能自动恢复，不触发停机报警。"
-        ),
-        category="mechanical",
-        scenario_type="sudden_spike",
-        default_duration=5.0,
-        tags=["电流", "尖峰", "脉冲", "突发脉冲型"],
-        point_faults=[
-            PointFaultConfig(point="spindle_current", mode=FaultMode.INSTANT,
-                             multiplier=3.5, noise_scale=1.5),
-            PointFaultConfig(point="vibration_x", mode=FaultMode.INSTANT,
-                             multiplier=2.0, noise_scale=0.5),
-            PointFaultConfig(point="vibration_y", mode=FaultMode.INSTANT,
-                             multiplier=2.0, noise_scale=0.5),
-            PointFaultConfig(point="vibration_z", mode=FaultMode.INSTANT,
-                             multiplier=2.5, noise_scale=0.8),
-        ],
-    ),
-
-    # ------------------------------------------------------------------
-    # 主轴负载异常 — 关系约束型
-    # 场景：刀具钝化但未完全磨损、切削参数不匹配、工件材料变硬
-    # 特征：主轴转速正常、进给速率正常，但主轴电流异常升高
-    # 关键：单看任何一个指标都"正常"，只有多指标关系才能发现异常
-    # 模式：渐进式，电流缓慢爬升，转速和进给保持不变
-    # ------------------------------------------------------------------
-    FaultTypeDefinition(
-        id="spindle_load_anomaly",
-        name="主轴负载异常",
-        description=(
-            "【关系约束型】主轴转速正常、进给速率正常，但主轴电流异常升高。"
-            "单看任何一个指标都在正常范围内，只有分析多指标关系才能发现异常。"
-            "物理含义：切削阻力增大（刀具钝化初期、材料变硬），"
-            "系统尚未触发保护降速，但电流已超出正常切削功率范围。"
-            "典型场景：刀具轻度钝化、切削液浓度不足、工件材料批次差异。"
-        ),
-        category="mechanical",
-        scenario_type="relation_constraint",
-        default_duration=240.0,
-        tags=["主轴", "负载", "关系约束", "关系约束型"],
-        point_faults=[
-            PointFaultConfig(point="spindle_current", mode=FaultMode.GRADUAL,
-                             multiplier=2.8, noise_scale=1.0),
-            # 转速和进给保持不变（multiplier=1.0），只叠加极小噪声维持真实感
-            PointFaultConfig(point="spindle_speed", mode=FaultMode.INSTANT,
-                             multiplier=1.0, noise_scale=15.0),
-            PointFaultConfig(point="feed_rate", mode=FaultMode.INSTANT,
-                             multiplier=1.0, noise_scale=5.0),
-        ],
-    ),
 ]
 
 # 按 id 索引
@@ -552,15 +374,6 @@ def _compute_value(
                 target = baseline * (1.0 + (pf.multiplier - 1.0) * intensity)
             else:
                 target = baseline
-        elif pf.mode == FaultMode.STEP:
-            # 阶跃模式：立即跳到新基线并在整个 duration 内保持（工况切换专用）
-            # 与 INSTANT 的区别：STEP 的 multiplier 表示新工况的正常倍数，不受 intensity 缩放
-            if pf.target_value is not None:
-                target = pf.target_value
-            elif pf.multiplier is not None:
-                target = baseline * pf.multiplier
-            else:
-                target = baseline
         else:
             # 渐进模式：随 progress 线性劣化
             if pf.target_value is not None:
diff --git a/protoforge/models/fault.py b/protoforge/models/fault.py
index 025da96..cc038e0 100644
--- a/protoforge/models/fault.py
+++ b/protoforge/models/fault.py
@@ -8,7 +8,6 @@ class FaultMode(str, Enum):
     """故障注入模式"""
     INSTANT = "instant"       # 瞬间跳变到异常值，持续 duration 后恢复
     GRADUAL = "gradual"       # 渐进式劣化，随时间线性恶化，到 duration 时达到峰值后恢复
-    STEP = "step"             # 阶跃切换到新工况基线，整个 duration 内保持新基线（工况切换专用）
 
 
 class FaultStatus(str, Enum):
@@ -37,7 +36,6 @@ class FaultTypeDefinition(BaseModel):
     name: str
     description: str
     category: str                          # 故障分类：mechanical / electrical / thermal / process
-    scenario_type: str = "trend_drift"     # 异常场景类型：trend_drift / sudden_spike / high_noise / mode_switch / relation_constraint
     default_duration: float = 120.0        # 默认持续时间（秒）
     point_faults: list[PointFaultConfig] = Field(default_factory=list)
     tags: list[str] = Field(default_factory=list)
diff --git a/web/src/views/Devices.vue b/web/src/views/Devices.vue
index 36141a4..65e0535 100644
--- a/web/src/views/Devices.vue
+++ b/web/src/views/Devices.vue
@@ -92,55 +92,24 @@
       </n-modal>
 
       <!-- 故障注入 Modal -->
-      <n-modal v-model:show="showFaultModal" preset="card" title="故障注入" style="width:560px">
+      <n-modal v-model:show="showFaultModal" preset="card" title="故障注入" style="width:480px">
         <n-space vertical size="medium">
-          <n-text depth="3" style="font-size:13px">目标设备：<n-text strong>{{ faultTargetDevice?.name }}</n-text></n-text>
-
+          <n-text depth="3" style="font-size:13px">设备：{{ faultTargetDevice?.name }}</n-text>
           <n-form-item label="故障类型" label-placement="left" label-width="80">
             <n-select
               v-model:value="faultTypeId"
-              :options="faultTypeGroupedOptions"
+              :options="faultTypeOptions"
               placeholder="选择故障类型"
               @update:value="onFaultTypeChange"
             />
           </n-form-item>
-
-          <!-- 场景说明卡片 -->
-          <div v-if="selectedFaultType" style="background:#1a1a2e;border:1px solid #2d2d4e;border-radius:8px;padding:14px 16px;">
-            <!-- 标题行：故障名 + 场景类型标签 + 分类标签 -->
-            <n-space align="center" style="margin-bottom:10px;flex-wrap:wrap;gap:6px">
-              <n-text strong style="font-size:14px">{{ selectedFaultType.name }}</n-text>
-              <n-tag :type="scenarioTagType(selectedFaultType.scenario_type)" size="small" round>
-                {{ scenarioTypeLabel(selectedFaultType.scenario_type) }}
-              </n-tag>
-              <n-tag size="small" :bordered="false" style="background:#2d2d4e;color:#94a3b8">
-                {{ faultCategoryLabel(selectedFaultType.category) }}
-              </n-tag>
-            </n-space>
-
-            <!-- 描述文本 -->
-            <n-text depth="3" style="font-size:12px;line-height:1.7;display:block;white-space:pre-wrap">{{ selectedFaultType.description }}</n-text>
-
-            <!-- 影响测点 -->
-            <div style="margin-top:10px;padding-top:10px;border-top:1px solid #2d2d4e">
-              <n-text depth="3" style="font-size:11px">影响测点：</n-text>
-              <n-space size="small" style="margin-top:4px;flex-wrap:wrap">
-                <n-tag
-                  v-for="pf in selectedFaultType.point_faults"
-                  :key="pf.point"
-                  size="tiny"
-                  :bordered="false"
-                  style="background:#2d2d4e;color:#e2e8f0;font-family:monospace"
-                >
-                  {{ pf.point }}
-                  <span style="color:#94a3b8;margin-left:4px">
-                    {{ pointFaultModeLabel(pf) }}
-                  </span>
-                </n-tag>
-              </n-space>
+          <n-alert v-if="selectedFaultType" type="warning" :bordered="false" style="font-size:12px">
+            <div style="font-weight:500;margin-bottom:4px">{{ selectedFaultType.name }} · {{ faultCategoryLabel(selectedFaultType.category) }}</div>
+            <div style="color:#94a3b8">{{ selectedFaultType.description }}</div>
+            <div style="margin-top:6px;color:#94a3b8">
+              影响测点：{{ selectedFaultType.point_faults.map(p => p.point).join('、') }}
             </div>
-          </div>
-
+          </n-alert>
           <n-form-item label="持续时间" label-placement="left" label-width="80">
             <n-input-number
               v-model:value="faultDuration"
@@ -151,15 +120,11 @@
               <template #suffix>秒</template>
             </n-input-number>
           </n-form-item>
-
           <n-form-item label="故障强度" label-placement="left" label-width="80">
             <n-space vertical style="width:100%">
               <n-slider v-model:value="faultIntensity" :min="0.1" :max="1.0" :step="0.1" />
               <n-text depth="3" style="font-size:12px">
                 {{ faultIntensityLabel }}（{{ faultIntensity }}）
-                <span v-if="selectedFaultType?.scenario_type === 'mode_switch'" style="color:#f59e0b">
-                  · 工况切换型强度不影响切换幅度
-                </span>
               </n-text>
             </n-space>
           </n-form-item>
@@ -278,17 +243,12 @@ const columns = [
   },
   { title: '测点', key: 'points', width: 70, render: (row) => (row.points || []).length },
   {
-    title: '故障', key: 'fault', width: 130,
+    title: '故障', key: 'fault', width: 90,
     render: (row) => {
       const fault = activeFaults.value[row.id]
       if (!fault || fault.status === 'none') return h(NTag, { size: 'tiny', bordered: false }, () => '正常')
       const pct = Math.round((fault.progress || 0) * 100)
-      const ft = faultTypes.value.find(t => t.id === fault.fault_type_id)
-      const scenarioLabel = ft ? scenarioTypeLabel(ft.scenario_type) : ''
-      return h(NSpace, { size: 2, vertical: false, align: 'center' }, () => [
-        h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`),
-        scenarioLabel ? h(NTag, { size: 'tiny', bordered: false, style: 'font-size:10px;background:#2d1b1b;color:#f87171' }, () => scenarioLabel) : null,
-      ])
+      return h(NTag, { size: 'tiny', type: 'error', bordered: false }, () => `${fault.fault_type_name} ${pct}%`)
     }
   },
   {
@@ -391,25 +351,6 @@ const faultTypeOptions = computed(() =>
   faultTypes.value.map(t => ({ label: `${t.name}（${faultCategoryLabel(t.category)}）`, value: t.id }))
 )
 
-// 按场景类型分组的故障选项
-const SCENARIO_ORDER = ['trend_drift', 'sudden_spike', 'high_noise', 'mode_switch', 'relation_constraint']
-const faultTypeGroupedOptions = computed(() => {
-  const groups = {}
-  for (const t of faultTypes.value) {
-    const st = t.scenario_type || 'trend_drift'
-    if (!groups[st]) groups[st] = []
-    groups[st].push({ label: t.name, value: t.id })
-  }
-  return SCENARIO_ORDER
-    .filter(st => groups[st])
-    .map(st => ({
-      type: 'group',
-      label: scenarioTypeLabel(st),
-      key: st,
-      children: groups[st],
-    }))
-})
-
 const selectedFaultType = computed(() =>
   faultTypes.value.find(t => t.id === faultTypeId.value) || null
 )
@@ -427,42 +368,6 @@ function faultCategoryLabel(category) {
   return map[category] || category
 }
 
-function scenarioTypeLabel(scenarioType) {
-  const map = {
-    trend_drift: '趋势漂移型',
-    sudden_spike: '突发脉冲型',
-    high_noise: '高噪声波动型',
-    mode_switch: '工况切换型',
-    relation_constraint: '关系约束型',
-  }
-  return map[scenarioType] || scenarioType
-}
-
-function scenarioTagType(scenarioType) {
-  const map = {
-    trend_drift: 'warning',
-    sudden_spike: 'error',
-    high_noise: 'info',
-    mode_switch: 'success',
-    relation_constraint: 'default',
-  }
-  return map[scenarioType] || 'default'
-}
-
-function pointFaultModeLabel(pf) {
-  if (pf.mode === 'step') return '→ 阶跃'
-  if (pf.mode === 'gradual') {
-    if (pf.multiplier != null) return `→ ×${pf.multiplier}`
-    if (pf.target_value != null) return `→ ${pf.target_value}`
-  }
-  if (pf.mode === 'instant') {
-    if (pf.target_value != null) return `→ ${pf.target_value}`
-    if (pf.multiplier != null && pf.multiplier !== 1.0) return `→ ×${pf.multiplier}`
-    return '± 噪声'
-  }
-  return ''
-}
-
 function onFaultTypeChange(val) {
   const t = faultTypes.value.find(f => f.id === val)
   if (t && t.default_duration) faultDuration.value = t.default_duration