From a1f77800d3e81ddd8a23de5ac744cf1276c188c3 Mon Sep 17 00:00:00 2001 From: Reiase Date: Sun, 18 Jan 2026 21:42:37 +0800 Subject: [PATCH 1/4] Update README and documentation to reflect changes in actor command structure and remove deprecated features - Updated README files to clarify the usage of the `pulsing actor` command, now requiring full class paths for actor types. - Enhanced documentation for starting actors, including examples for the new Router and worker classes. - Removed references to the deprecated `pulsing actor list` command, replacing it with `pulsing inspect` for actor system observation. - Updated various examples and guides to align with the new command structure and improve user understanding. --- README.md | 8 +- README.zh.md | 8 +- docs/actor-list-guide.md | 128 ------------ docs/actor-list-implementation.md | 254 ------------------------ docs/mkdocs.yml | 2 + docs/overrides/home.html | 6 +- docs/src/api_reference.md | 32 ++- docs/src/api_reference.zh.md | 32 ++- docs/src/examples/llm_inference.md | 4 +- docs/src/examples/llm_inference.zh.md | 4 +- docs/src/guide/operations.md | 20 +- docs/src/guide/operations.zh.md | 20 +- docs/src/guide/style.md | 69 +++++++ docs/src/guide/style.zh.md | 69 +++++++ docs/src/quickstart/llm_inference.md | 18 +- docs/src/quickstart/llm_inference.zh.md | 18 +- examples/bash/README.md | 87 ++------ examples/bash/demo_actor_list.sh | 155 --------------- examples/bash/demo_actor_list_remote.sh | 145 -------------- examples/inspect/demo_service.py | 22 +- python/pulsing/actor/helpers.py | 28 +-- python/pulsing/actors/__init__.py | 3 +- python/pulsing/actors/router.py | 119 ++++++++++- python/pulsing/cli/__main__.py | 16 +- test_actor_list_integration.py | 73 ------- test_actor_list_same_process.py | 61 ------ test_actor_system.py | 45 ----- 27 files changed, 424 insertions(+), 1022 deletions(-) delete mode 100644 docs/actor-list-guide.md delete mode 100644 docs/actor-list-implementation.md create mode 100644 
docs/src/guide/style.md create mode 100644 docs/src/guide/style.zh.md delete mode 100755 examples/bash/demo_actor_list.sh delete mode 100644 examples/bash/demo_actor_list_remote.sh delete mode 100644 test_actor_list_integration.py delete mode 100644 test_actor_list_same_process.py delete mode 100644 test_actor_system.py diff --git a/README.md b/README.md index 27444117f..12367a7cb 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,13 @@ **Lightweight distributed framework designed for high-performance AI applications.** 🚀 **Zero Dependencies** — Pure Rust + Tokio, no NATS/etcd/Redis + 🌐 **Auto Discovery** — Built-in Gossip protocol for cluster management + 🔀 **Location Transparent** — Same API for local and remote Actors + ⚡ **Streaming Ready** — Native support for LLM streaming responses + 🀖 **Agent Friendly** — Integrates with AutoGen, LangGraph out of the box ## 🚀 Get Started in 5 Minutes @@ -129,10 +133,10 @@ Out-of-the-box GPU cluster inference: ```bash # Start Router (OpenAI-compatible API) -pulsing actor router --addr 0.0.0.0:8000 --http_port 8080 --model_name my-llm +pulsing actor pulsing.actors.Router --addr 0.0.0.0:8000 --http_port 8080 --model_name my-llm # Start vLLM Worker (can have multiple) -pulsing actor vllm --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 +pulsing actor pulsing.actors.VllmWorker --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 # Test curl http://localhost:8080/v1/chat/completions \ diff --git a/README.zh.md b/README.zh.md index daab762e0..634427498 100644 --- a/README.zh.md +++ b/README.zh.md @@ -10,9 +10,13 @@ **蜻量级分垃匏框架䞓䞺高性胜 AI 应甚讟计。** 🚀 **零倖郚䟝赖** — 纯 Rust + Tokio无需 NATS/etcd/Redis + 🌐 **自劚发现** — 内眮 Gossip 协议管理集矀 + 🔀 **䜍眮透明** — 本地和远皋 Actor 䜿甚盞同 API + ⚡ **流匏支持** — 原生支持 LLM 流匏响应 + 🀖 **Agent 友奜** — 匀箱即甚集成 AutoGen、LangGraph ## 🚀 5分钟快速䜓验 @@ -129,10 +133,10 @@ async with runtime(addr="0.0.0.0:8002", seeds=["node1:8001"]): ```bash # 启劚 RouterOpenAI 兌容 API -pulsing actor router --addr 
0.0.0.0:8000 --http_port 8080 --model_name my-llm +pulsing actor pulsing.actors.Router --addr 0.0.0.0:8000 --http_port 8080 --model_name my-llm # 启劚 vLLM Worker可倚䞪 -pulsing actor vllm --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 +pulsing actor pulsing.actors.VllmWorker --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 # 测试 curl http://localhost:8080/v1/chat/completions \ diff --git a/docs/actor-list-guide.md b/docs/actor-list-guide.md deleted file mode 100644 index 9817b4a4d..000000000 --- a/docs/actor-list-guide.md +++ /dev/null @@ -1,128 +0,0 @@ -# Actor List 呜什䜿甚指南 - -!!! note "文档迁移" - 本页已迁移到文档站点的 **Guide** 䞭并䌚以站点版本䞺准 - - `docs/src/guide/actor_list.zh.md` - - `docs/src/guide/actor_list.md` - -`pulsing actor list` 呜什甚于列出圓前 Actor 系统䞭的 actors。 - -## 基本甚法 - -### 列出甚户 actors默讀 - -```bash -pulsing actor list -``` - -默讀情况䞋只星瀺甚户创建的呜名 actors䞍包括以 `_` 匀倎的系统内郚 actors。 - -蟓出瀺䟋 - -``` -Name Type Uptime Code Path ---------------------------------------------------------------------------------------------------- -counter-1 user 5m 23s - -counter-2 user 5m 23s - -calculator user 5m 23s - - -Total: 3 actor(s) -``` - -### 列出所有 actors包括系统 actors - -```bash -pulsing actor list --all_actors True -``` - -包括系统内郚的 actors - -``` -Name Type Uptime Code Path ---------------------------------------------------------------------------------------------------- -counter-1 user 5m 23s - -_system_internal system 5m 30s - -_python_actor_service system 5m 30s - - -Total: 5 actor(s) -``` - -### JSON 蟓出栌匏 - -```bash -pulsing actor list --json True -``` - -以 JSON 栌匏蟓出方䟿脚本倄理 - -```json -[ - { - "name": "counter-1", - "type": "user", - "code_path": null, - "uptime": "5m 23s" - }, - { - "name": "counter-2", - "type": "user", - "code_path": null, - "uptime": "5m 23s" - } -] -``` - -## 圚 Python 代码䞭䜿甚 - -`pulsing actor list` CLI 呜什需芁圚运行 actor system 的进皋内调甚。曎垞见的甚法是盎接圚 Python 代码䞭䜿甚 - -```python -import asyncio -from pulsing.actor import init, remote, 
get_system -from pulsing.cli.actor_list import list_actors_impl - - -@remote -class Counter: - def __init__(self): - self.count = 0 - - -async def main(): - # 初始化系统 - await init() - system = get_system() - - # 创建䞀些 actors - await Counter.remote(system, name="counter-1") - await Counter.remote(system, name="counter-2") - - # 列出 actors - await list_actors_impl(all_actors=False, output_format="table") - - # 或者盎接䜿甚底层 API - actor_names = system.local_actor_names() - user_actors = [n for n in actor_names if not n.startswith("_")] - print(f"User actors: {user_actors}") - - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## 字段诎明 - -- **Name**: Actor 的名称 -- **Type**: Actor 类型 - - `user`: 甚户创建的 actors - - `system`: 系统内郚 actors -- **Uptime**: Actor 运行时闎圓前䞺系统启劚时闎的近䌌倌 -- **Code Path**: Python 类的代码路埄圓前版本暂未实现星瀺䞺 `-` - -## 未来改进 - -- [ ] 星瀺每䞪 actor 的粟确创建时闎/运行时闎 -- [ ] 星瀺 Python actor 的类型类名和代码路埄 -- [ ] 星瀺 actor 的消息倄理统计倄理数量、错误数等 -- [ ] 支持通过 `--seeds` 参数查询远皋集矀的 actors -- [ ] 支持过滀和搜玢按名称、类型等 diff --git a/docs/actor-list-implementation.md b/docs/actor-list-implementation.md deleted file mode 100644 index b25f8dd5e..000000000 --- a/docs/actor-list-implementation.md +++ /dev/null @@ -1,254 +0,0 @@ -# Pulsing Actor List 完敎实现总结 - -!!! note "文档迁移" - 本页偏实现细节面向甚户的最新版已迁移到文档站点的 **Guide** - - `docs/src/guide/actor_list.zh.md` - - `docs/src/guide/actor_list.md` - - `docs/src/guide/operations.zh.md` - - `docs/src/guide/operations.md` - -## ✅ 已完成功胜 - -### 1. 本地查询暡匏 -圚运行 actor system 的进皋内查询 actors - -```bash -# 圚应甚代码䞭 -from pulsing.cli.actor_list import list_actors_impl -await list_actors_impl() - -# 或圚同䞀进皋䞭䜜䞺 Python API -from pulsing.actor import get_system -names = get_system().local_actor_names() -``` - -**功胜** -- ✅ 列出甚户 actors默讀 -- ✅ 列出所有 actors`--all_actors True` -- ✅ 星瀺 Python 类名劂 `__main__.Counter` -- ✅ 星瀺代码路埄劂 `/path/to/file.py` -- ✅ 衚栌和 JSON 蟓出栌匏 - -### 2. 
远皋查询暡匏 -从倖郚连接到远皋集矀并查询 actors - -```bash -# 查询敎䞪集矀 -pulsing actor list --list_seeds "127.0.0.1:8000" - -# 查询特定节点 -pulsing actor list --list_seeds "127.0.0.1:8000" --node_id 12345 - -# JSON 蟓出 -pulsing actor list --list_seeds "127.0.0.1:8000" --json True -``` - -**功胜** -- ✅ 通过 seeds 连接远皋集矀 -- ✅ 自劚发现集矀䞭的所有节点 -- ✅ 查询每䞪节点的 actors -- ✅ 星瀺节点状态和响应性 -- ✅ 支持查询特定节点`--node_id` - -## 实现架构 - -### 组件层次 - -``` -┌─────────────────────────────────────────┐ -│ CLI: pulsing actor list │ -│ (python/pulsing/cli/__main__.py) │ -└────────────┬────────────────────────────┘ - │ - â–Œ -┌─────────────────────────────────────────┐ -│ list_actors_command() │ -│ (python/pulsing/cli/actor_list.py) │ -│ - 解析参数 │ -│ - 选择本地/远皋暡匏 │ -└────────────┬────────────────────────────┘ - │ - ┌──────┮──────┐ - â–Œ â–Œ -┌───────────┐ ┌──────────────────┐ -│ 本地暡匏 │ │ 远皋暡匏 │ -│ │ │ │ -│ get_ │ │ create_actor_ │ -│ system() │ │ system(seeds) │ -│ │ │ │ -│ local_ │ │ all_named_ │ -│ actor_ │ │ actors() │ -│ names() │ │ │ -└───────────┘ └──────────────────┘ - │ │ - â–Œ â–Œ -┌─────────────────────────────────────────┐ -│ Metadata Registry │ -│ (_actor_metadata_registry) │ -│ - Python class name │ -│ - Source file path │ -│ - Module name │ -└─────────────────────────────────────────┘ -``` - -### 关键代码䜍眮 - -1. **Rust 䟧元信息提取** (`crates/pulsing-py/src/actor.rs`) - ```rust - impl Actor for PythonActorWrapper { - fn metadata(&self) -> HashMap { - // 自劚提取 __class__, __module__, __file__ - } - } - ``` - -2. **Python 䟧元信息泚册** (`python/pulsing/actor/remote.py`) - ```python - def _register_actor_metadata(name: str, cls: type): - """圚创建 actor 时泚册类型信息""" - - def get_actor_metadata(name: str) -> dict[str, str] | None: - """查询 actor 的元信息""" - ``` - -3. 
**CLI 实现** (`python/pulsing/cli/actor_list.py`) - - `list_actors_impl()`: 栞心查询逻蟑 - - `_list_remote_node_actors()`: 远皋节点查询 - - `_print_actors_output()`: 栌匏化蟓出 - -## 蟓出瀺䟋 - -### 本地查询衚栌栌匏 -``` -Name Type Class Code Path ----------------------------------------------------------------------------------------------------------------------------------- -counter-1 user __main__.Counter /tmp/demo.py -counter-2 user __main__.Counter /tmp/demo.py -calculator user __main__.Calculator /tmp/demo.py - -Total: 3 actor(s) -``` - -### 远皋查询倚节点 -``` -Connecting to cluster via seeds: ['127.0.0.1:9001']... -Found 2 nodes in cluster - -================================================================================ -Node 12345 (127.0.0.1:9001) - Status: Alive -================================================================================ - Node is responsive (ping: 1234567890) - Name Type Class Code Path - ---------------------------------------------------------------------------------------------------------------------------------- - service-a-1 user - - - service-a-2 user - - - - Total: 2 actor(s) - -================================================================================ -Node 67890 (127.0.0.1:9002) - Status: Alive -================================================================================ - Node is responsive (ping: 1234567891) - Name Type Class Code Path - ---------------------------------------------------------------------------------------------------------------------------------- - service-b-1 user - - - service-b-2 user - - - service-b-3 user - - - - Total: 3 actor(s) -``` - -## 䜿甚场景 - -### 场景 1: 匀发调试 -圚应甚内郚快速查看创建了哪些 actors - -```python -from pulsing.actor import init, remote, get_system -from pulsing.cli.actor_list import list_actors_impl - -await init() -# ... 创建 actors ... 
- -# 查看圓前 actors -await list_actors_impl() -``` - -### 场景 2: 运绎监控 -从倖郚查看生产集矀的 actors 分垃 - -```bash -# 查看敎䞪集矀 -pulsing actor list --list_seeds "prod-node-1:8000" - -# 查看特定节点 -pulsing actor list --list_seeds "prod-node-1:8000" --node_id 12345 - -# 富出䞺 JSON 䟛监控系统䜿甚 -pulsing actor list --list_seeds "prod-node-1:8000" --json True > actors.json -``` - -### 场景 3: 集矀诊断 -结合 `pulsing inspect` 䜿甚党面了解集矀状态 - -```bash -# 先查看集矀拓扑 -pulsing inspect --seeds "127.0.0.1:8000" - -# 再查看诊细的 actor 列衚 -pulsing actor list --list_seeds "127.0.0.1:8000" --all_actors True -``` - -## 局限性和未来改进 - -### 圓前局限 - -1. **远皋元信息猺倱**查询远皋节点时无法获取 Python 类名和代码路埄 - - 原因metadata 存傚圚本地进皋内存䞭 - - 圱响远皋查询只胜看到 actor 名字 - -2. **Uptime 粟床**圓前星瀺的是系统 uptime䞍是单䞪 actor 的创建时闎 - - 原因ActorRegistry 存傚创建时闎䜆 local_actor_names() 䞍返回 - -3. **性胜**查询倧集矀时需芁逐䞪 ping 节点 - - 可胜的䌘化并发查询、猓存结果 - -### 建议改进䌘先级从高到䜎 - -- [ ] **P1**: 圚 Rust 的 ActorRegistry 䞭存傚并返回 metadata - - 让远皋查询也胜看到类型信息 - -- [ ] **P2**: 添加每䞪 actor 的粟确 uptime - - 修改 `local_actor_names()` 返回曎诊细信息 - -- [ ] **P2**: 添加消息统计倄理量、错误率等 - - 从 metrics 系统获取 - -- [ ] **P3**: 支持过滀和搜玢 - - 按名称、类型、节点等过滀 - -- [ ] **P3**: 亀互匏暡匏实时刷新 - - 类䌌 `top` 呜什的䜓验 - -## 测试 - -```bash -# 运行测试 -cd /Users/reiase/workspace/Pulsing -PYTHONPATH=python pyenv exec python -m pytest tests/python/test_actor_list.py -v - -# 运行挔瀺 -bash examples/bash/demo_actor_list.sh -bash examples/bash/demo_actor_list_remote.sh -``` - -## 盞关文件 - -- `python/pulsing/cli/actor_list.py` - 栞心实现 -- `python/pulsing/cli/__main__.py` - CLI 集成 -- `python/pulsing/actor/remote.py` - 元信息泚册 -- `crates/pulsing-py/src/actor.rs` - Rust 元信息提取 -- `tests/python/test_actor_list.py` - 测试甚䟋 -- `examples/bash/demo_actor_list.sh` - 本地挔瀺 -- `examples/bash/demo_actor_list_remote.sh` - 远皋挔瀺 -- `docs/actor-list-guide.md` - 甚户文档 diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index a71091ab8..c701fe72a 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -94,6 +94,7 @@ plugins: Operations: CLI 运绎 Distributed Queue: 分垃匏内存队列 Semantics: 语义䞎保证 + Style 
Guide: 术语䞎风栌 Agent: Agent 框架 Overview: 抂述 Pulsing Native: Pulsing 原生 @@ -138,6 +139,7 @@ nav: - Security: guide/security.md - Distributed Queue: guide/queue.md - Semantics: guide/semantics.md + - Style Guide: guide/style.md - Agent: - agent/index.md - Pulsing Native: agent/native.md diff --git a/docs/overrides/home.html b/docs/overrides/home.html index 601290c3c..bf5d6d3fe 100644 --- a/docs/overrides/home.html +++ b/docs/overrides/home.html @@ -874,11 +874,13 @@

LLM Inference Ready

# Start OpenAI-compatible Router
-pulsing actor router --addr 0.0.0.0:8000 \
+pulsing actor pulsing.actors.Router \
+    --addr 0.0.0.0:8000 \
     --http_port 8080 --model_name my-llm
 
 # Start vLLM Worker
-pulsing actor vllm --model Qwen/Qwen2.5-0.5B \
+pulsing actor pulsing.actors.VllmWorker \
+    --model Qwen/Qwen2.5-0.5B \
     --addr 0.0.0.0:8001 --seeds 127.0.0.1:8000
 
 # Test with curl
diff --git a/docs/src/api_reference.md b/docs/src/api_reference.md
index 68c189427..c85794749 100644
--- a/docs/src/api_reference.md
+++ b/docs/src/api_reference.md
@@ -176,7 +176,7 @@ class ActorSystem:
 
 ### ActorRef
 
-Reference to an actor (local or remote).
+Low-level reference to an actor (local or remote). Usually not used directly; prefer `ActorProxy`.
 
 ```python
 class ActorRef:
@@ -193,6 +193,30 @@ class ActorRef:
         pass
 ```
 
+### ActorProxy
+
+High-level proxy wrapper for actors, returned by the `spawn()`, `local()`, and `resolve()` methods of `@remote`-decorated classes.
+**Recommended: use `ActorProxy` to call methods directly**; there is no need to construct a `Message` manually.
+
+```python
+class ActorProxy:
+    @property
+    def ref(self) -> ActorRef:
+        """Get underlying ActorRef (for low-level ask/tell)"""
+        pass
+
+    # Call actor methods directly, e.g.:
+    # result = await proxy.my_method(arg1, arg2)
+```
+
+**ActorProxy vs ActorRef comparison**:
+
+| Scenario | Recommendation |
+|----------|----------------|
+| Call `@remote` class methods | `ActorProxy`: `await proxy.method()` |
+| Need low-level ask/tell | `ActorRef`: `await proxy.ref.ask(msg)` |
+| Need actor_id | `ActorRef`: `proxy.ref.actor_id` |
+
 ## Decorators
 
 ### @remote
@@ -259,7 +283,11 @@ result = await ask_with_timeout(ref, {"op": "compute"}, timeout=10.0)
 
 After decoration, the class provides:
 
-- `spawn(**kwargs) -> ActorRef`: Create actor (uses global system from `init()`)
+- `spawn(**kwargs) -> ActorProxy`: Create actor and return proxy (uses global system from `init()`)
+- `local(system, **kwargs) -> ActorProxy`: Create actor on specified system
+- `resolve(name) -> ActorProxy`: Resolve an existing actor by name
+
+**Recommended**: Use the returned `ActorProxy` to call methods directly; use `proxy.ref` for low-level `ask/tell`.
 
 ## Functions
 
diff --git a/docs/src/api_reference.zh.md b/docs/src/api_reference.zh.md
index 188b7c07f..90910cf8b 100644
--- a/docs/src/api_reference.zh.md
+++ b/docs/src/api_reference.zh.md
@@ -176,7 +176,7 @@ class ActorSystem:
 
 ### ActorRef
 
-Actor 的匕甚本地或远皋。
+Actor 的底层匕甚本地或远皋。通垞䞍需芁盎接䜿甚掚荐䜿甚 `ActorProxy`。
 
 ```python
 class ActorRef:
@@ -193,6 +193,30 @@ class ActorRef:
         pass
 ```
 
+### ActorProxy
+
+对 Actor 的高级代理封装由 `@remote` 装饰噚的 `spawn()` 和 `resolve()` 返回。
+**掚荐盎接䜿甚 ActorProxy 调甚方法**无需手劚构造 `Message`。
+
+```python
+class ActorProxy:
+    @property
+    def ref(self) -> ActorRef:
+        """获取底层 ActorRef需芁䜎级 ask/tell 时䜿甚"""
+        pass
+
+    # 可盎接调甚 actor 䞊的方法䟋劂
+    # result = await proxy.my_method(arg1, arg2)
+```
+
+**ActorProxy vs ActorRef 对比**
+
+| 场景 | 掚荐 |
+|------|------|
+| 调甚 `@remote` 类的方法 | `ActorProxy``await proxy.method()` |
+| 需芁底层 ask/tell | `ActorRef``await proxy.ref.ask(msg)` |
+| 需芁 actor_id | `ActorRef``proxy.ref.actor_id` |
+
 ## 装饰噚
 
 ### @remote
@@ -259,7 +283,11 @@ result = await ask_with_timeout(ref, {"op": "compute"}, timeout=10.0)
 
 装饰后类提䟛
 
-- `spawn(**kwargs) -> ActorRef`: 创建 actor䜿甚 `init()` 初始化的党局系统
+- `spawn(**kwargs) -> ActorProxy`: 创建 actor 并返回代理䜿甚 `init()` 初始化的党局系统
+- `local(system, **kwargs) -> ActorProxy`: 圚指定 system 䞊创建 actor
+- `resolve(name) -> ActorProxy`: 按名称解析已存圚的 actor
+
+**掚荐**盎接䜿甚返回的 `ActorProxy` 调甚方法劂需底层 `ask/tell`䜿甚 `proxy.ref`
 
 ## 凜数
 
diff --git a/docs/src/examples/llm_inference.md b/docs/src/examples/llm_inference.md
index f09a41e95..7481f98d7 100644
--- a/docs/src/examples/llm_inference.md
+++ b/docs/src/examples/llm_inference.md
@@ -19,7 +19,7 @@ This guide shows how to run a **router + worker** LLM service with Pulsing, and
 The router needs an **actor system address** so workers can join the same cluster:
 
 ```bash
-pulsing actor pulsing.actors.router.RouterActor \
+pulsing actor pulsing.actors.Router \
   --addr 0.0.0.0:8000 \
   --http_host 0.0.0.0 \
   --http_port 8080 \
@@ -63,7 +63,7 @@ pulsing inspect actors --endpoint 127.0.0.1:8000
 ### Inspect cluster
 
 ```bash
-pulsing inspect --seeds 127.0.0.1:8000
+pulsing inspect cluster --seeds 127.0.0.1:8000
 ```
 
 ## 4) Call the OpenAI-compatible API
diff --git a/docs/src/examples/llm_inference.zh.md b/docs/src/examples/llm_inference.zh.md
index a5ef977aa..da2f4724a 100644
--- a/docs/src/examples/llm_inference.zh.md
+++ b/docs/src/examples/llm_inference.zh.md
@@ -19,7 +19,7 @@
 Router 需芁指定 **actor system 地址**以䟿其它进皋启劚的 workers 加入同䞀集矀
 
 ```bash
-pulsing actor pulsing.actors.router.RouterActor \
+pulsing actor pulsing.actors.Router \
   --addr 0.0.0.0:8000 \
   --http_host 0.0.0.0 \
   --http_port 8080 \
@@ -63,7 +63,7 @@ pulsing inspect actors --endpoint 127.0.0.1:8000
 ### 巡检集矀
 
 ```bash
-pulsing inspect --seeds 127.0.0.1:8000
+pulsing inspect cluster --seeds 127.0.0.1:8000
 ```
 
 ## 4调甚 OpenAI 兌容 API
diff --git a/docs/src/guide/operations.md b/docs/src/guide/operations.md
index 628d69c7a..7fc18e19f 100644
--- a/docs/src/guide/operations.md
+++ b/docs/src/guide/operations.md
@@ -12,9 +12,9 @@ The `pulsing actor` command starts actors by providing their full class path. Th
 
 Actor type must be a full class path:
 - Format: `module.path.ClassName`
-- Example: `pulsing.actors.router.RouterActor`
-- Example: `pulsing.actors.worker.TransformersWorker`
-- Example: `pulsing.actors.vllm.VllmWorker`
+- Example: `pulsing.actors.Router`
+- Example: `pulsing.actors.TransformersWorker`
+- Example: `pulsing.actors.VllmWorker`
 - Example: `my_module.my_actor.MyCustomActor`
 
 ### Examples
@@ -22,13 +22,13 @@ Actor type must be a full class path:
 #### Router (OpenAI-compatible HTTP API)
 
 ```bash
-pulsing actor pulsing.actors.router.RouterActor \
+pulsing actor pulsing.actors.Router \
   --addr 0.0.0.0:8000 \
   --http_host 0.0.0.0 \
   --http_port 8080 \
   --model_name my-llm \
   --worker_name worker \
-  --scheduler stream_load
+  --scheduler_type stream_load
 ```
 
 #### Transformers Worker
@@ -69,7 +69,7 @@ pulsing actor pulsing.actors.worker.TransformersWorker \
   --seeds 127.0.0.1:8000
 
 # Router targeting specific worker name
-pulsing actor pulsing.actors.router.RouterActor \
+pulsing actor pulsing.actors.Router \
   --worker_name worker-1 \
   --seeds 127.0.0.1:8000
 ```
@@ -227,10 +227,10 @@ pulsing bench gpt2 --url http://localhost:8080
 
 | Task | Command |
 |------|---------|
-| Start router | `pulsing actor pulsing.actors.router.RouterActor --addr 0.0.0.0:8000 --http_port 8080` |
-| Start worker | `pulsing actor pulsing.actors.worker.TransformersWorker --model_name gpt2 --seeds ...` |
-| Start multiple workers | `pulsing actor pulsing.actors.worker.TransformersWorker --model_name gpt2 --name worker-1 --seeds ...` |
-| Router with custom worker | `pulsing actor pulsing.actors.router.RouterActor --worker_name worker-1 --seeds ...` |
+| Start router | `pulsing actor pulsing.actors.Router --addr 0.0.0.0:8000 --http_port 8080` |
+| Start worker | `pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --seeds ...` |
+| Start multiple workers | `pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --name worker-1 --seeds ...` |
+| Router with custom worker | `pulsing actor pulsing.actors.Router --worker_name worker-1 --seeds ...` |
 | List actors | `pulsing inspect actors --endpoint 127.0.0.1:8000` |
 | Inspect cluster | `pulsing inspect cluster --seeds 127.0.0.1:8000` |
 | Inspect actors | `pulsing inspect actors --seeds 127.0.0.1:8000 --top 10` |
diff --git a/docs/src/guide/operations.zh.md b/docs/src/guide/operations.zh.md
index 759f88257..0e19dc52d 100644
--- a/docs/src/guide/operations.zh.md
+++ b/docs/src/guide/operations.zh.md
@@ -12,9 +12,9 @@ Pulsing 内眮 CLI 工具甚于启劚 actors、检查系统和基准测试分
 
 Actor 类型必须是完敎的类路埄
 - 栌匏: `module.path.ClassName`
-- 瀺䟋: `pulsing.actors.router.RouterActor`
-- 瀺䟋: `pulsing.actors.worker.TransformersWorker`
-- 瀺䟋: `pulsing.actors.vllm.VllmWorker`
+- 瀺䟋: `pulsing.actors.Router`
+- 瀺䟋: `pulsing.actors.TransformersWorker`
+- 瀺䟋: `pulsing.actors.VllmWorker`
 - 瀺䟋: `my_module.my_actor.MyCustomActor`
 
 ### 瀺䟋
@@ -22,13 +22,13 @@ Actor 类型必须是完敎的类路埄
 #### RouterOpenAI 兌容 HTTP API
 
 ```bash
-pulsing actor pulsing.actors.router.RouterActor \
+pulsing actor pulsing.actors.Router \
   --addr 0.0.0.0:8000 \
   --http_host 0.0.0.0 \
   --http_port 8080 \
   --model_name my-llm \
   --worker_name worker \
-  --scheduler stream_load
+  --scheduler_type stream_load
 ```
 
 #### Transformers Worker
@@ -69,7 +69,7 @@ pulsing actor pulsing.actors.worker.TransformersWorker \
   --seeds 127.0.0.1:8000
 
 # Router 路由到特定 worker 名称
-pulsing actor pulsing.actors.router.RouterActor \
+pulsing actor pulsing.actors.Router \
   --worker_name worker-1 \
   --seeds 127.0.0.1:8000
 ```
@@ -209,10 +209,10 @@ pulsing bench gpt2 --url http://localhost:8080
 
 | 任务 | 呜什 |
 |------|------|
-| 启劚 router | `pulsing actor pulsing.actors.router.RouterActor --addr 0.0.0.0:8000 --http_port 8080` |
-| 启劚 worker | `pulsing actor pulsing.actors.worker.TransformersWorker --model_name gpt2 --seeds ...` |
-| 启劚倚䞪 worker | `pulsing actor pulsing.actors.worker.TransformersWorker --model_name gpt2 --name worker-1 --seeds ...` |
-| Router 指定 worker | `pulsing actor pulsing.actors.router.RouterActor --worker_name worker-1 --seeds ...` |
+| 启劚 router | `pulsing actor pulsing.actors.Router --addr 0.0.0.0:8000 --http_port 8080` |
+| 启劚 worker | `pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --seeds ...` |
+| 启劚倚䞪 worker | `pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --name worker-1 --seeds ...` |
+| Router 指定 worker | `pulsing actor pulsing.actors.Router --worker_name worker-1 --seeds ...` |
 | 列出 actors | `pulsing inspect actors --endpoint 127.0.0.1:8000` |
 | 检查集矀 | `pulsing inspect cluster --seeds 127.0.0.1:8000` |
 | 检查 actors | `pulsing inspect actors --seeds 127.0.0.1:8000 --top 10` |
diff --git a/docs/src/guide/style.md b/docs/src/guide/style.md
new file mode 100644
index 000000000..3095c937f
--- /dev/null
+++ b/docs/src/guide/style.md
@@ -0,0 +1,69 @@
+# Terminology & Style Guide
+
+This page defines terminology and style conventions for Pulsing documentation and code to ensure consistency.
+
+## Core Terminology
+
+| Term | Usage | Description |
+|------|-------|-------------|
+| `ActorSystem` | Code symbol | Rust/Python class name, use in code references |
+| actor system | Conceptual | General description, lowercase |
+| `Actor` | Code symbol | Base class name |
+| actor | Conceptual | General description |
+| `ActorRef` | Code symbol | Low-level actor reference |
+| `ActorProxy` | Code symbol | High-level proxy returned by `@remote` decorator |
+
+## Component Naming
+
+| Component | CLI Actor Class Path | Description |
+|-----------|---------------------|-------------|
+| Router | `pulsing.actors.Router` | OpenAI-compatible HTTP router |
+| TransformersWorker | `pulsing.actors.TransformersWorker` | Transformers inference worker |
+| VllmWorker | `pulsing.actors.VllmWorker` | vLLM inference worker |
+
+**Note**: When documentation mentions "Router", it typically refers to the HTTP routing component for LLM inference services. Example code requiring task dispatch logic should use names like `Dispatcher` to avoid confusion.
+
+## CLI Command Format
+
+### Starting Actors
+
+```bash
+pulsing actor <full.class.path> [options]
+
+# Examples
+pulsing actor pulsing.actors.Router --http_port 8080 --model_name my-llm
+pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --device cpu
+```
+
+### Inspect Commands (Observer Mode)
+
+`pulsing inspect` uses a subcommand structure:
+
+```bash
+# Cluster status
+pulsing inspect cluster --seeds <address>
+
+# Actor distribution
+pulsing inspect actors --seeds <address> [--top N] [--filter ...]
+pulsing inspect actors --endpoint <address> [--detailed]
+
+# Metrics
+pulsing inspect metrics --seeds <address> [--raw]
+
+# Live watch
+pulsing inspect watch --seeds <address>
[--interval ...] [--kind ...] +``` + +**Removed**: `pulsing actor list` → Use `pulsing inspect actors` instead + +## Code Style + +- **Python**: Follow PEP 8, prefer type annotations +- **Rust**: Use `cargo fmt` and `cargo clippy` +- **Documentation**: Bilingual (`.zh.md` suffix for Chinese) + +## Documentation References + +- Use backticks for code symbols: `` `ActorSystem` `` +- Use code blocks for commands +- Use backticks for file paths: `` `python/pulsing/actor/` `` diff --git a/docs/src/guide/style.zh.md b/docs/src/guide/style.zh.md new file mode 100644 index 000000000..b8ec752cf --- /dev/null +++ b/docs/src/guide/style.zh.md @@ -0,0 +1,69 @@ +# 术语䞎风栌纊定 + +本页定义 Pulsing 文档和代码䞭的术语䞎风栌规范确保䞀臎性。 + +## 栞心术语 + +| 术语 | 甚法 | 诎明 | +|------|------|------| +| `ActorSystem` | 代码笊号 | Rust/Python 类名甚于代码匕甚 | +| actor system | 抂念描述 | 䞀般性描述时䜿甚小写 | +| `Actor` | 代码笊号 | 基类名 | +| actor | 抂念描述 | 䞀般性描述 | +| `ActorRef` | 代码笊号 | 底层 actor 匕甚 | +| `ActorProxy` | 代码笊号 | `@remote` 装饰噚返回的高级代理 | + +## 组件呜名 + +| 组件 | CLI actor 类路埄 | 诎明 | +|------|------------------|------| +| Router | `pulsing.actors.Router` | OpenAI 兌容 HTTP 路由 | +| TransformersWorker | `pulsing.actors.TransformersWorker` | Transformers 掚理 Worker | +| VllmWorker | `pulsing.actors.VllmWorker` | vLLM 掚理 Worker | + +**泚意**文档䞭提到"Router"时通垞指 LLM 掚理服务的 HTTP 路由组件。瀺䟋代码䞭若需芁任务分发逻蟑应䜿甚 `Dispatcher` 等名称以避免混淆。 + +## CLI 呜什栌匏 + +### 启劚 Actor + +```bash +pulsing actor <完敎类路埄> [选项] + +# 瀺䟋 +pulsing actor pulsing.actors.Router --http_port 8080 --model_name my-llm +pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --device cpu +``` + +### 检查呜什观察者暡匏 + +`pulsing inspect` 䜿甚子呜什结构 + +```bash +# 集矀状态 +pulsing inspect cluster --seeds <地址> + +# Actor 分垃 +pulsing inspect actors --seeds <地址> [--top N] [--filter ...] +pulsing inspect actors --endpoint <地址> [--detailed] + +# 指标 +pulsing inspect metrics --seeds <地址> [--raw] + +# 实时监视 +pulsing inspect watch --seeds <地址> [--interval ...] [--kind ...] 
+``` + +**已移陀**`pulsing actor list` → 请䜿甚 `pulsing inspect actors` + +## 代码风栌 + +- **Python**遵埪 PEP 8䌘先䜿甚类型泚解 +- **Rust**䜿甚 `cargo fmt` 和 `cargo clippy` +- **文档**䞭英双语`.zh.md` 䞺䞭文版 + +## 文档匕甚 + +- 代码笊号䜿甚反匕号`` `ActorSystem` `` +- 呜什䜿甚代码块 +- 文件路埄䜿甚反匕号`` `python/pulsing/actor/` `` diff --git a/docs/src/quickstart/llm_inference.md b/docs/src/quickstart/llm_inference.md index 61137d7a0..99a49f6d3 100644 --- a/docs/src/quickstart/llm_inference.md +++ b/docs/src/quickstart/llm_inference.md @@ -43,7 +43,7 @@ Choose a backend: Open **Terminal A**: ```bash -pulsing actor router \ +pulsing actor pulsing.actors.Router \ --addr 0.0.0.0:8000 \ --http_port 8080 \ --model_name my-llm @@ -64,8 +64,8 @@ Open **Terminal B**: === "Transformers (CPU)" ```bash - pulsing actor transformers \ - --model gpt2 \ + pulsing actor pulsing.actors.TransformersWorker \ + --model_name gpt2 \ --device cpu \ --addr 0.0.0.0:8001 \ --seeds 127.0.0.1:8000 @@ -74,7 +74,7 @@ Open **Terminal B**: === "vLLM (GPU)" ```bash - pulsing actor vllm \ + pulsing actor pulsing.actors.VllmWorker \ --model Qwen/Qwen2.5-0.5B \ --addr 0.0.0.0:8002 \ --seeds 127.0.0.1:8000 @@ -82,7 +82,7 @@ Open **Terminal B**: | Flag | Description | |------|-------------| -| `--model` | Model name/path | +| `--model` / `--model_name` | Model name/path (TransformersWorker uses `--model_name`, VllmWorker uses `--model`) | | `--seeds` | Router address to join cluster | --- @@ -91,10 +91,10 @@ Open **Terminal B**: ```bash # List actors -pulsing actor list --endpoint 127.0.0.1:8000 +pulsing inspect actors --endpoint 127.0.0.1:8000 # Inspect cluster state -pulsing inspect --seeds 127.0.0.1:8000 +pulsing inspect cluster --seeds 127.0.0.1:8000 ``` You should see the `router` and `worker` actors. 
@@ -135,10 +135,10 @@ Add more workers to handle more load: ```bash # Terminal C -pulsing actor transformers --model gpt2 --addr 0.0.0.0:8003 --seeds 127.0.0.1:8000 +pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --addr 0.0.0.0:8003 --seeds 127.0.0.1:8000 # Terminal D -pulsing actor transformers --model gpt2 --addr 0.0.0.0:8004 --seeds 127.0.0.1:8000 +pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --addr 0.0.0.0:8004 --seeds 127.0.0.1:8000 ``` The Router automatically load-balances across all workers. diff --git a/docs/src/quickstart/llm_inference.zh.md b/docs/src/quickstart/llm_inference.zh.md index 3626c93ab..d42dd2982 100644 --- a/docs/src/quickstart/llm_inference.zh.md +++ b/docs/src/quickstart/llm_inference.zh.md @@ -43,7 +43,7 @@ pip install pulsing 打匀**终端 A** ```bash -pulsing actor router \ +pulsing actor pulsing.actors.Router \ --addr 0.0.0.0:8000 \ --http_port 8080 \ --model_name my-llm @@ -64,8 +64,8 @@ pulsing actor router \ === "Transformers (CPU)" ```bash - pulsing actor transformers \ - --model gpt2 \ + pulsing actor pulsing.actors.TransformersWorker \ + --model_name gpt2 \ --device cpu \ --addr 0.0.0.0:8001 \ --seeds 127.0.0.1:8000 @@ -74,7 +74,7 @@ pulsing actor router \ === "vLLM (GPU)" ```bash - pulsing actor vllm \ + pulsing actor pulsing.actors.VllmWorker \ --model Qwen/Qwen2.5-0.5B \ --addr 0.0.0.0:8002 \ --seeds 127.0.0.1:8000 @@ -82,7 +82,7 @@ pulsing actor router \ | 参数 | 诎明 | |------|------| -| `--model` | 暡型名称/路埄 | +| `--model` / `--model_name` | 暡型名称/路埄TransformersWorker 甹 `--model_name`VllmWorker 甹 `--model` | | `--seeds` | 加入集矀的 Router 地址 | --- @@ -91,10 +91,10 @@ pulsing actor router \ ```bash # 列出 actor -pulsing actor list --endpoint 127.0.0.1:8000 +pulsing inspect actors --endpoint 127.0.0.1:8000 # 检查集矀状态 -pulsing inspect --seeds 127.0.0.1:8000 +pulsing inspect cluster --seeds 127.0.0.1:8000 ``` 䜠应该胜看到 `router` 和 `worker` actor。 @@ -135,10 +135,10 @@ curl -N 
http://localhost:8080/v1/chat/completions \ ```bash # 终端 C -pulsing actor transformers --model gpt2 --addr 0.0.0.0:8003 --seeds 127.0.0.1:8000 +pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --addr 0.0.0.0:8003 --seeds 127.0.0.1:8000 # 终端 D -pulsing actor transformers --model gpt2 --addr 0.0.0.0:8004 --seeds 127.0.0.1:8000 +pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --addr 0.0.0.0:8004 --seeds 127.0.0.1:8000 ``` Router 䌚自劚圚所有 Worker 闎莟蜜均衡。 diff --git a/examples/bash/README.md b/examples/bash/README.md index 3bf59e9bf..a0083196a 100644 --- a/examples/bash/README.md +++ b/examples/bash/README.md @@ -2,90 +2,27 @@ 这䞪目圕包含甚于测试和挔瀺 Pulsing 功胜的 Bash 脚本。 -## 脚本列衚 +## 圓前状态 -### `demo_actor_list.sh` +歀目圕暂无掻跃的挔瀺脚本。 -挔瀺劂䜕圚应甚䞭䜿甚 actor list 功胜查看圓前运行的 actors。 - -**功胜挔瀺** -- 圚应甚启劚后查看 actor 列衚 -- 默讀暡匏只星瀺甚户创建的 actors -- `--all_actors` 暡匏星瀺所有 actors包括系统内郚 actors -- JSON 蟓出栌匏 -- 底层 API 䜿甚`system.local_actor_names()` - -**䜿甚方法** +劂需查看/管理 actors请䜿甚 `pulsing inspect` 呜什观察者暡匏䞍加入 gossip 集矀 ```bash -cd examples/bash -./demo_actor_list.sh -``` +# 查询单䞪节点的 actors +pulsing inspect actors --endpoint 127.0.0.1:8000 -或从项目根目圕 - -```bash -bash examples/bash/demo_actor_list.sh -``` +# 查询敎䞪集矀的 actors +pulsing inspect actors --seeds 127.0.0.1:8000 -**蟓出瀺䟋** +# 查看集矀状态 +pulsing inspect cluster --seeds 127.0.0.1:8000 +# 实时监视 +pulsing inspect watch --seeds 127.0.0.1:8000 ``` -====================================================================== - Pulsing Actor List 挔瀺 -====================================================================== - -Python: Python 3.12.11 - -运行挔瀺... - -================================================================================ -挔瀺圚应甚䞭䜿甚 pulsing actor list -================================================================================ - -1. 初始化 actor system... - ✓ 系统启劚: 0.0.0.0:49724 - -2. 创建䞚务 actors... - ✓ 创建了 3 䞪 actors - -3. 
䜿甚 Python API 查看 actors: - ---------------------------------------------------------------------------- - 本地 actors: calculator, counter-1, counter-2 -4. 䜿甚 CLI 栌匏化蟓出只星瀺甚户 actors: - ---------------------------------------------------------------------------- -Name Type Uptime Code Path ------------------------------------------------------------------------------------------------------------ -counter-1 user 0s - -counter-2 user 0s - -calculator user 0s - - -Total: 3 actor(s) - -... -``` - -**重芁诎明** - -`pulsing actor list` 是讟计甚于圚**运行䞭的应甚进皋内**调甚的管理功胜而䞍是独立的呜什行工具。这是因䞺 - -1. Actor system 是进皋本地的需芁圚同䞀进皋䞭才胜访问 -2. 这种讟计曎适合集成到应甚的管理接口䞭 -3. 对于倖郚查看远皋集矀应䜿甚 `pulsing inspect --seeds
` - -**圚应甚䞭集成** - -```python -from pulsing.actor import init, get_system -from pulsing.cli.actor_list import list_actors_impl - -await init() -# ... 创建 actors ... - -# 圚管理端点或 REPL 䞭调甚 -await list_actors_impl(all_actors=False, output_format='table') -``` +曎倚 CLI 甚法参见 [CLI 呜什文档](../../docs/src/guide/operations.zh.md)。 ## 环境芁求 diff --git a/examples/bash/demo_actor_list.sh b/examples/bash/demo_actor_list.sh deleted file mode 100755 index 0850dd6a8..000000000 --- a/examples/bash/demo_actor_list.sh +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env bash -# 挔瀺 pulsing actor list 呜什 -# 䜿甚简单的 HTTP API 查询䞍加入 gossip 集矀 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -export PYTHONPATH="$PROJECT_ROOT/python:$PYTHONPATH" - -# 完党犁甚 Rust 日志 -export RUST_LOG=off - -# 颜色 -GREEN='\033[0;32m' -BLUE='\033[0;34m' -YELLOW='\033[1;33m' -NC='\033[0m' - -echo "======================================================================" -echo " Pulsing Actor List - 挔瀺" -echo "======================================================================" -echo "" - -# 检查 pyenv -if ! 
command -v pyenv &> /dev/null; then - echo "错误: 需芁 pyenv" - exit 1 -fi - -PYTHON="pyenv exec python" - -# 枅理可胜残留的进皋 -echo -e "${YELLOW}枅理残留进皋...${NC}" -pkill -f "pulsing_server" 2>/dev/null || true -sleep 1 - -# 䜿甚随机端口避免冲突 -PORT=$((19000 + RANDOM % 1000)) - -# 创建䞀䞪䞎时服务端脚本 -SERVER_SCRIPT=$(mktemp /tmp/pulsing_server_XXXXXX.py) - -cat > "$SERVER_SCRIPT" << EOF -import asyncio -import os -import sys - -# 犁甚所有日志 -os.environ["RUST_LOG"] = "off" - -from pulsing.actor import init, remote, get_system - - -@remote -class Counter: - def __init__(self): - self.count = 0 - - def increment(self): - self.count += 1 - return self.count - - -@remote -class Calculator: - def add(self, a, b): - return a + b - - -async def main(): - await init(addr="127.0.0.1:${PORT}") - system = get_system() - - await Counter.remote(system, name="counter-1") - await Counter.remote(system, name="counter-2") - await Calculator.remote(system, name="calculator") - - # Signal ready - print("READY", flush=True) - - await asyncio.Event().wait() - - -if __name__ == "__main__": - asyncio.run(main()) -EOF - -echo -e "${GREEN}1. 启劚 Actor System (127.0.0.1:${PORT})${NC}" - -# 启劚服务端后台运行完党静默 -$PYTHON "$SERVER_SCRIPT" > /dev/null 2>&1 & -SERVER_PID=$! - -# 等埅服务就绪 -echo " 等埅服务启劚..." -sleep 3 - -echo "" -echo -e "${GREEN}2. 测试连接单䞪 endpoint (HTTP API)${NC}" -echo " 呜什: pulsing actor list --endpoint 127.0.0.1:${PORT}" -echo "" - -$PYTHON -m pulsing.cli actor list --endpoint 127.0.0.1:${PORT} - -echo "" -echo -e "${GREEN}3. 星瀺所有 actors (包括内郚)${NC}" -echo " 呜什: pulsing actor list --endpoint 127.0.0.1:${PORT} --all_actors True" -echo "" - -$PYTHON -m pulsing.cli actor list --endpoint 127.0.0.1:${PORT} --all_actors True - -echo "" -echo -e "${GREEN}4. JSON 栌匏蟓出${NC}" -echo " 呜什: pulsing actor list --endpoint 127.0.0.1:${PORT} --json True" -echo "" - -$PYTHON -m pulsing.cli actor list --endpoint 127.0.0.1:${PORT} --json True - -echo "" -echo -e "${GREEN}5. 
䜿甚 --seeds 查询集矀${NC}" -echo " 呜什: pulsing actor list --seeds 127.0.0.1:${PORT}" -echo "" - -$PYTHON -m pulsing.cli actor list --seeds 127.0.0.1:${PORT} - -# 枅理 -echo "" -echo -e "${GREEN}枅理...${NC}" -kill $SERVER_PID 2>/dev/null || true -wait $SERVER_PID 2>/dev/null || true -rm -f "$SERVER_SCRIPT" - -echo "" -echo "======================================================================" -echo " 挔瀺完成" -echo "======================================================================" -echo "" -echo "甚法总结:" -echo "" -echo -e " ${BLUE}# 查询单䞪 actor system${NC}" -echo " pulsing actor list --endpoint 127.0.0.1:8000" -echo "" -echo -e " ${BLUE}# 查询敎䞪集矀${NC}" -echo " pulsing actor list --seeds 127.0.0.1:8000,127.0.0.1:8001" -echo "" -echo -e " ${BLUE}# 星瀺所有 actors (包括内郚)${NC}" -echo " pulsing actor list --endpoint 127.0.0.1:8000 --all_actors True" -echo "" -echo -e " ${BLUE}# JSON 栌匏蟓出${NC}" -echo " pulsing actor list --endpoint 127.0.0.1:8000 --json True" -echo "" -echo -e "${GREEN}✓ 䜿甚简单 HTTP API䞍加入 gossip 集矀${NC}" -echo -e "${GREEN}✓ 支持星瀺完敎 Python 元信息: 类名、暡块、代码路埄、Actor ID${NC}" -echo "" diff --git a/examples/bash/demo_actor_list_remote.sh b/examples/bash/demo_actor_list_remote.sh deleted file mode 100644 index 5b9bb0deb..000000000 --- a/examples/bash/demo_actor_list_remote.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env bash -# 挔瀺 pulsing actor list 的远皋查询功胜 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -export PYTHONPATH="$PROJECT_ROOT/python:$PYTHONPATH" - -# 颜色 -GREEN='\033[0;32m' -BLUE='\033[0;34m' -YELLOW='\033[1;33m' -NC='\033[0m' - -echo "======================================================================" -echo " Pulsing Actor List - 远皋查询挔瀺" -echo "======================================================================" -echo "" - -# 检查 pyenv -if ! 
command -v pyenv &> /dev/null; then - echo "错误: 需芁 pyenv" - exit 1 -fi - -PYTHON="pyenv exec python" - -# 创建䞀䞪节点的应甚 -NODE1_SCRIPT=$(mktemp /tmp/pulsing_node1.XXXXXX.py) -NODE2_SCRIPT=$(mktemp /tmp/pulsing_node2.XXXXXX.py) - -cat > "$NODE1_SCRIPT" << 'EOF' -import asyncio -from pulsing.actor import init, remote, get_system - - -@remote -class ServiceA: - def ping(self): - return "pong from A" - - -async def main(): - await init(addr="127.0.0.1:9001") - system = get_system() - print(f"Node 1 started: {system.addr}") - - # Create actors - await ServiceA.remote(system, name="service-a-1") - await ServiceA.remote(system, name="service-a-2") - print("Created 2 actors on Node 1") - - # Keep running - await asyncio.Event().wait() - - -if __name__ == "__main__": - asyncio.run(main()) -EOF - -cat > "$NODE2_SCRIPT" << 'EOF' -import asyncio -from pulsing.actor import SystemConfig, create_actor_system, remote - - -@remote -class ServiceB: - def process(self, data): - return f"processed: {data}" - - -async def main(): - config = SystemConfig.with_addr("127.0.0.1:9002").with_seeds(["127.0.0.1:9001"]) - system = await create_actor_system(config) - print(f"Node 2 started: {system.addr}, joining cluster...") - - await asyncio.sleep(1) - - # Create actors - await ServiceB.remote(system, name="service-b-1") - await ServiceB.remote(system, name="service-b-2") - await ServiceB.remote(system, name="service-b-3") - print("Created 3 actors on Node 2") - - # Keep running - await asyncio.Event().wait() - - -if __name__ == "__main__": - asyncio.run(main()) -EOF - -echo -e "${GREEN}场景: 查询远皋倚节点集矀${NC}" -echo "================================================================" -echo "" - -# 启劚节点1 -echo "1. 启劚 Node 1 (127.0.0.1:9001)..." -$PYTHON "$NODE1_SCRIPT" 2>&1 | grep -v "INFO" & -NODE1_PID=$! -sleep 2 - -# 启劚节点2 -echo "2. 启劚 Node 2 (127.0.0.1:9002), 加入集矀..." -$PYTHON "$NODE2_SCRIPT" 2>&1 | grep -v "INFO" & -NODE2_PID=$! -sleep 3 - -echo "" -echo -e "${BLUE}3. 
䜿甚 pulsing actor list 查询远皋集矀${NC}" -echo " 呜什: pulsing actor list --seeds '127.0.0.1:9001'" -echo "" - -# 䜿甚我们实现的功胜查询集矀 -$PYTHON -c " -from pulsing.cli.actor_list import list_actors_command -list_actors_command( - all_actors=False, - json_output=False, - seeds='127.0.0.1:9001' -) -" 2>&1 | grep -v "INFO" - -echo "" -echo -e "${BLUE}4. 查询特定节点 (劂果实现了 node_id 参数)${NC}" -echo " 这需芁先知道 node_id可以从䞊面的蟓出获取" -echo "" - -# 枅理 -echo -e "${GREEN}枅理...${NC}" -kill $NODE1_PID $NODE2_PID 2>/dev/null || true -wait $NODE1_PID $NODE2_PID 2>/dev/null || true -rm -f "$NODE1_SCRIPT" "$NODE2_SCRIPT" - -echo "" -echo "======================================================================" -echo " 挔瀺完成" -echo "======================================================================" -echo "" -echo "总结" -echo " ✓ 可以通过 --seeds 参数连接远皋集矀" -echo " ✓ 自劚查询集矀䞭所有节点的 actors" -echo " ✓ 星瀺每䞪节点的 actor 列衚" -echo "" diff --git a/examples/inspect/demo_service.py b/examples/inspect/demo_service.py index 348645b1a..57c011d72 100644 --- a/examples/inspect/demo_service.py +++ b/examples/inspect/demo_service.py @@ -50,29 +50,29 @@ def receive(self, msg: Message) -> Message: return Message.empty() -class RouterActor(Actor): - """A router actor that distributes tasks to workers""" +class DispatcherActor(Actor): + """A dispatcher actor that distributes tasks to workers (for demo purposes)""" def __init__(self): self.workers = [] - self.tasks_routed = 0 + self.tasks_dispatched = 0 def on_start(self, actor_id: ActorId): - print("[Router] Started") + print("[Dispatcher] Started") def receive(self, msg: Message) -> Message: if msg.msg_type == "RouteTask": - self.tasks_routed += 1 + self.tasks_dispatched += 1 task = msg.to_json().get("task", "") # Simulate routing logic worker_id = f"worker-{random.randint(1, 3)}" return Message.from_json( - "Routed", - {"task": task, "worker": worker_id, "routed": self.tasks_routed}, + "Dispatched", + {"task": task, "worker": worker_id, "dispatched": self.tasks_dispatched}, ) elif 
msg.msg_type == "GetStats": return Message.from_json( - "Stats", {"router": True, "tasks_routed": self.tasks_routed} + "Stats", {"dispatcher": True, "tasks_dispatched": self.tasks_dispatched} ) return Message.empty() @@ -120,10 +120,10 @@ async def run_node(port: int, seed: str | None): # Create different actors based on node role if seed is None: - # Node 1: Create router and some workers + # Node 1: Create dispatcher and some workers print("Creating actors on node 1...") - await system.spawn("router", RouterActor(), public=True) - print(" ✓ actors/router") + await system.spawn("dispatcher", DispatcherActor(), public=True) + print(" ✓ actors/dispatcher") for i in range(1, 3): worker_name = f"worker-{i}" diff --git a/python/pulsing/actor/helpers.py b/python/pulsing/actor/helpers.py index 573486c1a..ee33bab92 100644 --- a/python/pulsing/actor/helpers.py +++ b/python/pulsing/actor/helpers.py @@ -9,18 +9,18 @@ from . import Actor, ActorSystem -async def run_until_signal( - system: "ActorSystem", actor_name: str | None = None -) -> None: +async def run_until_signal(actor_name: str | None = None) -> None: """ Run until shutdown signal (SIGTERM or SIGINT) Handles graceful shutdown on first signal, force quits on second signal. + Uses the global system via shutdown() to ensure proper cleanup. Args: - system: ActorSystem instance actor_name: Optional actor name for logging """ + from . 
import get_system, shutdown + shutdown_event = asyncio.Event() shutting_down = False @@ -43,13 +43,15 @@ def signal_handler(): # Perform graceful shutdown try: + system = get_system() if actor_name: await system.stop(actor_name) except Exception as e: print(f"[{actor_name or 'Actor'}] Stop error: {e}") + # Use module-level shutdown() to properly clear global state try: - await system.shutdown() + await shutdown() except Exception as e: print(f"[{actor_name or 'Actor'}] Shutdown error: {e}") @@ -64,7 +66,10 @@ async def spawn_and_run( public: bool = True, ) -> None: """ - Create ActorSystem, spawn actor, and run until signal + Create ActorSystem via init(), spawn actor, and run until signal + + This function uses init() to ensure the global system is set, + making get_system() available inside actor on_start()/receive(). Args: actor: Actor instance @@ -73,14 +78,11 @@ async def spawn_and_run( seeds: List of seed node addresses for cluster discovery public: Whether to register as public named actor """ - from . import SystemConfig, create_actor_system - - config = SystemConfig.with_addr(addr) if addr else SystemConfig.standalone() - if seeds: - config = config.with_seeds(seeds) + from . 
import get_system, init - system = await create_actor_system(config) + # Use init() to set global system (makes get_system() work inside actors) + system = await init(addr=addr, seeds=seeds) await system.spawn(name, actor, public=public) print(f"[{name}] Started at {system.addr}") - await run_until_signal(system, name) + await run_until_signal(name) diff --git a/python/pulsing/actors/__init__.py b/python/pulsing/actors/__init__.py index 925c7bb72..d7b40bf56 100644 --- a/python/pulsing/actors/__init__.py +++ b/python/pulsing/actors/__init__.py @@ -4,7 +4,7 @@ # Router # Stream load subscription from .load_stream import LoadSnapshot, LoadStreamConsumer, StreamLoadScheduler -from .router import start_router, stop_router +from .router import Router, start_router, stop_router # Scheduler from .scheduler import ( # Base class; Python schedulers; Rust high-performance schedulers; Factory function @@ -29,6 +29,7 @@ __all__ = [ # Core API + "Router", "TransformersWorker", "VllmWorker", "GenerationConfig", diff --git a/python/pulsing/actors/router.py b/python/pulsing/actors/router.py index 0045f9428..952e84f6a 100644 --- a/python/pulsing/actors/router.py +++ b/python/pulsing/actors/router.py @@ -1,5 +1,6 @@ """Router - OpenAI-compatible HTTP API router""" +import asyncio import json import time import uuid @@ -7,7 +8,7 @@ from aiohttp import web -from pulsing.actor import ActorSystem, Message +from pulsing.actor import Actor, ActorId, ActorSystem, Message, get_system @dataclass @@ -427,3 +428,119 @@ async def stop_router(runner: web.AppRunner): await runner.cleanup() print("[Router] HTTP server stopped") + + +class Router(Actor): + """Router Actor - OpenAI-compatible HTTP API router as an Actor + + This actor wraps the start_router/stop_router functions to provide + a CLI-compatible entry point via `pulsing actor pulsing.actors.Router`. 
+ + Args: + http_host: HTTP listen address (default: "0.0.0.0") + http_port: HTTP listen port (default: 8080) + model_name: Model name for API responses (default: "pulsing-model") + worker_name: Worker actor name to route requests to (default: "worker") + scheduler_type: Scheduler type, supports: + - "stream_load": Stream load-aware (default, recommended) + - "random": Random + - "round_robin": Round robin + - "power_of_two": Power-of-Two Choices + - "cache_aware": Cache-aware + + Example: + # Start via CLI + pulsing actor pulsing.actors.Router \\ + --http_host 0.0.0.0 \\ + --http_port 8080 \\ + --model_name my-llm \\ + --worker_name worker + + # Or programmatically + router = Router(http_port=8080, model_name="my-llm") + await system.spawn("router", router, public=True) + """ + + def __init__( + self, + http_host: str = "0.0.0.0", + http_port: int = 8080, + model_name: str = "pulsing-model", + worker_name: str = "worker", + scheduler_type: str = "stream_load", + ): + self.http_host = http_host + self.http_port = http_port + self.model_name = model_name + self.worker_name = worker_name + self.scheduler_type = scheduler_type + + self._runner: web.AppRunner | None = None + self._actor_id: ActorId | None = None + + async def on_start(self, actor_id: ActorId) -> None: + """Start the HTTP server when actor starts""" + self._actor_id = actor_id + + # Get global system (set by CLI via init()) + system = get_system() + + # Start HTTP server + self._runner = await start_router( + system=system, + http_host=self.http_host, + http_port=self.http_port, + model_name=self.model_name, + worker_name=self.worker_name, + scheduler_type=self.scheduler_type, + ) + + print(f"[Router] Actor started: {actor_id}") + + def on_stop(self) -> None: + """Stop the HTTP server when actor stops""" + if self._runner: + # Schedule cleanup in background (on_stop is sync) + asyncio.create_task(self._cleanup()) + + async def _cleanup(self): + """Async cleanup helper""" + if self._runner: + await 
stop_router(self._runner) + self._runner = None + + def metadata(self) -> dict[str, str]: + """Return router metadata for diagnostics""" + return { + "type": "router", + "http_host": self.http_host, + "http_port": str(self.http_port), + "model_name": self.model_name, + "worker_name": self.worker_name, + "scheduler_type": self.scheduler_type, + } + + async def receive(self, msg: Message) -> Message | None: + """Handle diagnostic messages""" + if msg.msg_type == "HealthCheck": + return Message.from_json( + "Ok", + { + "status": "healthy", + "http_port": self.http_port, + "model_name": self.model_name, + }, + ) + elif msg.msg_type == "GetConfig": + return Message.from_json( + "Config", + { + "http_host": self.http_host, + "http_port": self.http_port, + "model_name": self.model_name, + "worker_name": self.worker_name, + "scheduler_type": self.scheduler_type, + }, + ) + else: + return Message.from_json("Error", {"error": f"Unknown: {msg.msg_type}"}) diff --git a/python/pulsing/cli/__main__.py b/python/pulsing/cli/__main__.py index ace91c6f6..1b32f4514 100644 --- a/python/pulsing/cli/__main__.py +++ b/python/pulsing/cli/__main__.py @@ -18,9 +18,9 @@ def actor( Actor type must be a full class path: - Format: 'module.path.ClassName' - - Example: 'pulsing.actors.router.RouterActor' - - Example: 'pulsing.actors.worker.TransformersWorker' - - Example: 'pulsing.actors.vllm.VllmWorker' + - Example: 'pulsing.actors.Router' + - Example: 'pulsing.actors.TransformersWorker' + - Example: 'pulsing.actors.VllmWorker' - Example: 'my_module.my_actor.MyCustomActor' Pass constructor parameters directly as command-line arguments. 
@@ -38,17 +38,17 @@ def actor( Examples: # Start a Transformers worker - pulsing actor pulsing.actors.worker.TransformersWorker --model_name gpt2 --device cpu --name my-worker + pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --device cpu --name my-worker # Start a vLLM worker - pulsing actor pulsing.actors.vllm.VllmWorker --model Qwen/Qwen2 --role aggregated --max_new_tokens 512 --name vllm-worker + pulsing actor pulsing.actors.VllmWorker --model Qwen/Qwen2 --role aggregated --max_new_tokens 512 --name vllm-worker # Start a Router with OpenAI-compatible API - pulsing actor pulsing.actors.router.RouterActor --http_host 0.0.0.0 --http_port 8080 --model_name my-llm --worker_name worker + pulsing actor pulsing.actors.Router --http_host 0.0.0.0 --http_port 8080 --model_name my-llm --worker_name worker # Start multiple workers with different names - pulsing actor pulsing.actors.worker.TransformersWorker --model_name gpt2 --name worker-1 --seeds 127.0.0.1:8000 - pulsing actor pulsing.actors.worker.TransformersWorker --model_name gpt2 --name worker-2 --seeds 127.0.0.1:8000 + pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --name worker-1 --seeds 127.0.0.1:8000 + pulsing actor pulsing.actors.TransformersWorker --model_name gpt2 --name worker-2 --seeds 127.0.0.1:8000 """ from .actors import start_generic_actor diff --git a/test_actor_list_integration.py b/test_actor_list_integration.py deleted file mode 100644 index 83341e463..000000000 --- a/test_actor_list_integration.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -"""Integration test for pulsing actor list command""" - -import asyncio -import subprocess -import time -from pulsing.actor import init, remote - - -@remote -class Counter: - def __init__(self): - self.count = 0 - - -@remote -class Calculator: - def add(self, a, b): - return a + b - - -async def main(): - # Start system - await init(addr="0.0.0.0:8888") - from pulsing.actor import get_system - - system = 
get_system() - print("✓ Actor system started on 0.0.0.0:8888") - - # Create actors - await Counter.remote(system, name="counter-1") - await Counter.remote(system, name="counter-2") - await Calculator.remote(system, name="calculator") - print("✓ Created 3 actors") - - # Wait a bit for actors to fully initialize - await asyncio.sleep(0.5) - - # Run list command (subprocess in same PYTHONPATH) - result = subprocess.run( - ["python", "-m", "pulsing.cli", "actor", "list"], capture_output=True, text=True - ) - - print("\n--- Output from 'pulsing actor list' ---") - print(result.stdout) - - if result.returncode != 0: - print("STDERR:") - print(result.stderr) - - # Test --all flag - result_all = subprocess.run( - ["python", "-m", "pulsing.cli", "actor", "list", "--all_actors", "True"], - capture_output=True, - text=True, - ) - - print("\n--- Output from 'pulsing actor list --all_actors True' ---") - print(result_all.stdout) - - # Test JSON output - result_json = subprocess.run( - ["python", "-m", "pulsing.cli", "actor", "list", "--json", "True"], - capture_output=True, - text=True, - ) - - print("\n--- Output from 'pulsing actor list --json True' ---") - print(result_json.stdout) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_actor_list_same_process.py b/test_actor_list_same_process.py deleted file mode 100644 index 0a2b8258e..000000000 --- a/test_actor_list_same_process.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -"""Test pulsing actor list in same process""" - -import asyncio -import sys -import os - -# Ensure we're using the local pulsing -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "python")) - -from pulsing.actor import init, remote, get_system -from pulsing.admin import list_actors - - -@remote -class Counter: - def __init__(self): - self.count = 0 - - -@remote -class Calculator: - def add(self, a, b): - return a + b - - -async def main(): - # Start system - await init(addr="0.0.0.0:8888") - system = get_system() 
- print("✓ Actor system started on 0.0.0.0:8888\n") - - # Create actors - await Counter.remote(system, name="counter-1") - await Counter.remote(system, name="counter-2") - await Calculator.remote(system, name="calculator") - print("✓ Created 3 actors\n") - - # List actors directly (Python API) - print("=== Using Python API (pulsing.admin.list_actors) ===\n") - - # Get all local actor names - names = system.local_actor_names() - print(f"All local actor names: {names}\n") - - # Filter to user actors - user_actors = [n for n in names if not n.startswith("_")] - print(f"User actors: {user_actors}\n") - - print("=== Formatted list ===\n") - print(f"{'Name':<30} {'Type':<20}") - print("-" * 50) - for name in user_actors: - actor_type = "user" - print(f"{name:<30} {actor_type:<20}") - - print(f"\nTotal: {len(user_actors)} actor(s)") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_actor_system.py b/test_actor_system.py deleted file mode 100644 index d207f976e..000000000 --- a/test_actor_system.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -"""Test script to run a system with actors for testing 'pulsing actor list'""" - -import asyncio -from pulsing.actor import init, remote - - -@remote -class Counter: - def __init__(self): - self.count = 0 - - def increment(self): - self.count += 1 - return self.count - - -@remote -class Calculator: - def add(self, a, b): - return a + b - - -async def main(): - await init(addr="0.0.0.0:8888") - print("Actor system started on 0.0.0.0:8888") - - # Create some named actors - _counter1 = await Counter.remote(name="counter-1") - _counter2 = await Counter.remote(name="counter-2") - _calc = await Calculator.remote(name="calculator") - - print("Created actors: counter-1, counter-2, calculator") - print("Actor system is running. 
Press Ctrl+C to stop.") - - # Keep running - try: - while True: - await asyncio.sleep(1) - except KeyboardInterrupt: - print("\nShutting down...") - - -if __name__ == "__main__": - asyncio.run(main()) From ee78a65941a0bdb263a4f15c556a1afbef18dc53 Mon Sep 17 00:00:00 2001 From: Reiase Date: Fri, 23 Jan 2026 23:40:33 +0800 Subject: [PATCH 2/4] Add pulsing examples for multi-agent workflows and chaos-proof functionality - Introduced `pulsing_pingpong.py` demonstrating a simple ping-pong interaction using the @remote decorator. - Added `pulsing_research.py` showcasing a multi-agent research workflow with Researcher, Analyst, and Reporter agents. - Created `chaos_proof.py` to illustrate actor resilience with automatic restarts on failure, simulating task completion despite crashes. - Implemented `function_to_fleet.py` to demonstrate scaling functions into a fleet of workers for improved throughput. - Updated README with new examples and usage instructions for enhanced clarity and user guidance. 
--- comparison_examples/pulsing_pingpong.py | 22 +++++++ comparison_examples/pulsing_research.py | 52 ++++++++++++++++ crates/pulsing-py/src/actor.rs | 5 +- examples/quickstart/README.md | 79 ++++++++++++++++++++++++ examples/quickstart/chaos_proof.py | 54 ++++++++++++++++ examples/quickstart/function_to_fleet.py | 35 +++++++++++ 6 files changed, 244 insertions(+), 3 deletions(-) create mode 100644 comparison_examples/pulsing_pingpong.py create mode 100644 comparison_examples/pulsing_research.py create mode 100644 examples/quickstart/chaos_proof.py create mode 100644 examples/quickstart/function_to_fleet.py diff --git a/comparison_examples/pulsing_pingpong.py b/comparison_examples/pulsing_pingpong.py new file mode 100644 index 000000000..31146ed44 --- /dev/null +++ b/comparison_examples/pulsing_pingpong.py @@ -0,0 +1,22 @@ +""" +Pulsing Ping-Pong Example using @remote decorator +Same functionality as AutoGen version +""" +from pulsing.actor import remote, runtime + +# Define Agent +@remote +class PingPongAgent: + async def ping(self, message: str) -> str: + return f"pong: {message}" + +# Run +async def main(): + async with runtime(): + agent = await PingPongAgent.spawn(name="pingpong") + response = await agent.ping("hello") + print(f"Received: {response}") + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) diff --git a/comparison_examples/pulsing_research.py b/comparison_examples/pulsing_research.py new file mode 100644 index 000000000..8665d590f --- /dev/null +++ b/comparison_examples/pulsing_research.py @@ -0,0 +1,52 @@ +""" +Pulsing Multi-Agent Research Workflow using @agent decorator +Same functionality as AutoGen version +""" +from pulsing.actor import resolve +from pulsing.agent import agent, runtime + +# Researcher Agent +@agent(role="Researcher", goal="Research topics") +class ResearcherAgent: + async def research(self, topic: str) -> list[str]: + return [ + f"Research point 1 about {topic}", + f"Research point 2 about {topic}", + ] + +# 
Analyst Agent +@agent(role="Analyst", goal="Analyze research results") +class AnalystAgent: + async def analyze(self, points: list[str]) -> str: + combined = " ".join(points) + return f"Analysis: {combined[:50]}..." + +# Reporter Agent +@agent(role="Reporter", goal="Write final report") +class ReporterAgent: + async def write(self, summary: str) -> str: + return f"Final Report:\n{summary}" + +# Workflow +async def run_workflow(): + async with runtime(): + # Spawn agents with names + researcher = await ResearcherAgent.spawn(name="researcher") + analyst = await AnalystAgent.spawn(name="analyst") + reporter = await ReporterAgent.spawn(name="reporter") + + # Resolve by name (automatic load balancing) + researcher = await resolve("researcher") + analyst = await resolve("analyst") + reporter = await resolve("reporter") + + # Execute workflow + research = await researcher.research("Quantum Computing") + analysis = await analyst.analyze(research) + report = await reporter.write(analysis) + + print(report) + +if __name__ == "__main__": + import asyncio + asyncio.run(run_workflow()) diff --git a/crates/pulsing-py/src/actor.rs b/crates/pulsing-py/src/actor.rs index a412ed264..af4a7f5b9 100644 --- a/crates/pulsing-py/src/actor.rs +++ b/crates/pulsing-py/src/actor.rs @@ -1110,10 +1110,9 @@ impl PyActorSystem { } else { // handler is a factory let factory = move || { - let handler = handler.clone(); - let event_loop = event_loop.clone(); - Python::with_gil(|py| -> anyhow::Result { + // Clone PyObjects inside GIL + let event_loop = event_loop.clone_ref(py); // Call factory to get instance let instance = handler .call0(py) diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md index ba4efc188..9d0d4f2e4 100644 --- a/examples/quickstart/README.md +++ b/examples/quickstart/README.md @@ -55,6 +55,85 @@ python examples/quickstart/ai_chat_room.py python examples/quickstart/ai_chat_room.py --topic "远皋办公是吊䌚成䞺䞻流" --rounds 5 ``` +--- + +## ⚡ 进阶瀺䟋 + +### 瀺䟋 3: Function 
→ Fleet暪向扩展 + +**䞀行代码把凜数变成可暪向扩展的服务**——同䞀仜代码workers 从 1→N吞吐线性提升 + +```bash +# 8 䞪 worker 倄理 200 䞪任务 +python examples/quickstart/function_to_fleet.py + +# 调敎 worker 数量 +WORKERS=16 ITEMS=500 python examples/quickstart/function_to_fleet.py +``` + +蟓出 +``` +================================================== +⚡ Function → Fleet Result +================================================== + Workers: 8 + Tasks: 200 + Duration: 0.52s + Throughput: 384.6 qps +================================================== +✅ Same code, more workers = higher throughput +================================================== +``` + +**栞心代码**仅 27 行 +```python +@remote +class Worker: + async def run(self, x: int) -> int: + await asyncio.sleep(0.02) # simulate I/O + return x * x + +async with runtime(): + ws = [await Worker.spawn(name=f"w{i}") for i in range(n)] + res = await asyncio.gather(*(ws[i % n].run(i) for i in range(m))) +``` + +### 瀺䟋 4: Chaos-proof故障自愈 + +**Actor 厩溃后自劚重启**——30% 厩溃率䞋任务仍然党郚完成 + +```bash +python examples/quickstart/chaos_proof.py +``` + +蟓出 +``` +================================================== +🛡 Chaos-proof Result +================================================== + Total tasks: 50 + Succeeded: 50 + Retries: 23 + Crash rate: 30% +================================================== +✅ All succeeded! Actor auto-restarted on crash. 
+================================================== +``` + +**栞心代码** +```python +@remote(restart_policy="on-failure", max_restarts=50) +class FlakyWorker: + def work(self, x: int) -> int: + if random.random() < 0.3: # 30% 抂率厩溃 + raise RuntimeError("boom") + return x + 1 +``` + +**亮点**30% 厩溃率框架自劚重启 Actor调甚方简单重试最终 100% 完成。 + +--- + ## 栞心抂念10秒理解 ```python diff --git a/examples/quickstart/chaos_proof.py b/examples/quickstart/chaos_proof.py new file mode 100644 index 000000000..f919aa137 --- /dev/null +++ b/examples/quickstart/chaos_proof.py @@ -0,0 +1,54 @@ +""" +🛡 Chaos-proof - Actor 厩溃自劚重启任务䞍䞢倱 +""" +import asyncio, random +from pulsing.actor import remote +from pulsing.agent import runtime + + +@remote(restart_policy="on-failure", max_restarts=50) +class FlakyWorker: + def __init__(self): + self.call_count = 0 + + def work(self, x: int) -> int: + self.call_count += 1 + if random.random() < 0.3: # 30% 抂率厩溃 + raise RuntimeError(f"boom at call {self.call_count}") + return x + 1 + + +async def main(): + async with runtime(): + w = await FlakyWorker.spawn(name="flaky") + + results, retries = [], 0 + for i in range(50): + for attempt in range(10): # 最倚重试 10 次 + try: + results.append(await w.work(i)) + break + except Exception: + retries += 1 + await asyncio.sleep(0.01) + else: + results.append(None) # 真的倱莥了 + + ok = sum(1 for r in results if r is not None) + print("\n" + "=" * 50) + print("🛡 Chaos-proof Result") + print("=" * 50) + print(f" Total tasks: 50") + print(f" Succeeded: {ok}") + print(f" Retries: {retries}") + print(f" Crash rate: 30%") + print("=" * 50) + if ok == 50: + print("✅ All succeeded! 
Actor auto-restarted on crash.") + else: + print(f"⚠ {50 - ok} tasks failed") + print("=" * 50 + "\n") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/quickstart/function_to_fleet.py b/examples/quickstart/function_to_fleet.py new file mode 100644 index 000000000..16d94a211 --- /dev/null +++ b/examples/quickstart/function_to_fleet.py @@ -0,0 +1,35 @@ +import asyncio, os, time +from pulsing.actor import remote +from pulsing.agent import runtime + + +@remote +class Worker: + async def run(self, x: int) -> int: + await asyncio.sleep(0.02) # simulate I/O + return x * x + + +async def main(): + n = int(os.getenv("WORKERS", "8")) + m = int(os.getenv("ITEMS", "200")) + async with runtime(): + ws = [await Worker.spawn(name=f"w{i}") for i in range(n)] + t0 = time.perf_counter() + res = await asyncio.gather(*(ws[i % n].run(i) for i in range(m))) + dt = time.perf_counter() - t0 + print("\n" + "=" * 50) + print("⚡ Function → Fleet Result") + print("=" * 50) + print(f" Workers: {n}") + print(f" Tasks: {m}") + print(f" Duration: {dt:.2f}s") + print(f" Throughput: {m/dt:.1f} qps") + print("=" * 50) + print("✅ Same code, more workers = higher throughput") + print("=" * 50 + "\n") + + +if __name__ == "__main__": + asyncio.run(main()) + From b2b284474c9be55adfd30db27fd5f5e07879e95e Mon Sep 17 00:00:00 2001 From: Reiase Date: Sat, 24 Jan 2026 22:39:30 +0800 Subject: [PATCH 3/4] Enhance documentation and examples for actor system and API - Added new sections to the API documentation, including an overview of core concepts and detailed usage patterns for the Pulsing framework. - Updated existing examples to reflect recent changes in actor spawning and resolution methods, ensuring clarity and usability. - Improved documentation for the `@pul.remote` decorator and actor lifecycle management, providing clearer guidance for users. 
- Introduced new API reference files for both Python and Rust, enhancing accessibility and understanding of the framework's capabilities. - Refactored various documentation files to improve consistency and readability across the codebase. --- comparison_examples/pulsing_pingpong.py | 1 + comparison_examples/pulsing_research.py | 3 + crates/pulsing-actor/Cargo.toml | 2 + crates/pulsing-actor/src/actor/address.rs | 128 +- crates/pulsing-actor/src/actor/context.rs | 56 +- crates/pulsing-actor/src/actor/mailbox.rs | 30 +- crates/pulsing-actor/src/actor/mod.rs | 2 +- crates/pulsing-actor/src/actor/reference.rs | 61 +- crates/pulsing-actor/src/actor/traits.rs | 58 +- crates/pulsing-actor/src/behavior/context.rs | 39 +- crates/pulsing-actor/src/behavior/core.rs | 53 +- crates/pulsing-actor/src/behavior/mod.rs | 48 +- .../pulsing-actor/src/behavior/reference.rs | 62 +- .../src/cluster/backends/gossip.rs | 34 +- .../src/cluster/backends/head.rs | 40 +- .../pulsing-actor/src/cluster/backends/mod.rs | 2 +- crates/pulsing-actor/src/cluster/gossip.rs | 60 +- crates/pulsing-actor/src/cluster/member.rs | 62 +- crates/pulsing-actor/src/cluster/mod.rs | 9 +- crates/pulsing-actor/src/cluster/naming.rs | 50 +- crates/pulsing-actor/src/cluster/swim.rs | 47 +- crates/pulsing-actor/src/error.rs | 388 ++++++ crates/pulsing-actor/src/lib.rs | 97 +- crates/pulsing-actor/src/system/config.rs | 328 ++++- crates/pulsing-actor/src/system/lifecycle.rs | 222 ++++ .../pulsing-actor/src/system/load_balancer.rs | 239 ++++ crates/pulsing-actor/src/system/mod.rs | 1074 +---------------- crates/pulsing-actor/src/system/resolve.rs | 338 ++++++ crates/pulsing-actor/src/system/spawn.rs | 247 ++++ crates/pulsing-actor/src/system/traits.rs | 171 +-- crates/pulsing-actor/src/test_helper.rs | 395 ++++++ .../src/transport/http2/client.rs | 34 +- .../src/transport/http2/config.rs | 90 +- .../pulsing-actor/src/transport/http2/mod.rs | 84 +- .../pulsing-actor/src/transport/http2/pool.rs | 53 +- 
.../src/transport/http2/retry.rs | 40 +- .../src/transport/http2/server.rs | 50 +- .../src/transport/http2/stream.rs | 56 +- .../pulsing-actor/src/transport/http2/tls.rs | 64 +- docs/Makefile | 4 + docs/mkdocs.yml | 16 +- docs/src/api/overview.md | 302 +++++ docs/src/api/overview.zh.md | 299 +++++ docs/src/api/python.md | 39 + docs/src/api/python.zh.md | 39 + docs/src/api/rust.md | 394 ++++++ docs/src/api/rust.zh.md | 394 ++++++ docs/src/api_reference.md | 111 +- docs/src/api_reference.zh.md | 140 ++- docs/src/design/as-actor-decorator.md | 4 +- docs/src/design/as-actor-decorator.zh.md | 4 +- docs/src/examples/index.md | 8 +- docs/src/examples/index.zh.md | 6 +- docs/src/guide/actors.md | 18 +- docs/src/guide/actors.zh.md | 14 +- docs/src/index.md | 6 +- docs/src/index.zh.md | 6 +- python/pulsing/actor/remote.py | 82 +- python/pulsing/actors/router.py | 7 +- python/pulsing/agent/runtime.py | 17 +- python/pulsing/topic/broker.py | 67 +- 61 files changed, 4077 insertions(+), 2717 deletions(-) create mode 100644 crates/pulsing-actor/src/error.rs create mode 100644 crates/pulsing-actor/src/system/lifecycle.rs create mode 100644 crates/pulsing-actor/src/system/load_balancer.rs create mode 100644 crates/pulsing-actor/src/system/resolve.rs create mode 100644 crates/pulsing-actor/src/system/spawn.rs create mode 100644 crates/pulsing-actor/src/test_helper.rs create mode 100644 docs/src/api/overview.md create mode 100644 docs/src/api/overview.zh.md create mode 100644 docs/src/api/python.md create mode 100644 docs/src/api/python.zh.md create mode 100644 docs/src/api/rust.md create mode 100644 docs/src/api/rust.zh.md diff --git a/comparison_examples/pulsing_pingpong.py b/comparison_examples/pulsing_pingpong.py index eb478976c..2733486d1 100644 --- a/comparison_examples/pulsing_pingpong.py +++ b/comparison_examples/pulsing_pingpong.py @@ -12,6 +12,7 @@ class PingPongAgent: async def ping(self, message: str) -> str: return f"pong: {message}" + # Run async def main(): async with 
runtime(): diff --git a/comparison_examples/pulsing_research.py b/comparison_examples/pulsing_research.py index 57294cfca..4e1fbea6c 100644 --- a/comparison_examples/pulsing_research.py +++ b/comparison_examples/pulsing_research.py @@ -16,6 +16,7 @@ async def research(self, topic: str) -> list[str]: f"Research point 2 about {topic}", ] + # Analyst Agent @agent(role="Analyst", goal="Analyze research results") class AnalystAgent: @@ -23,12 +24,14 @@ async def analyze(self, points: list[str]) -> str: combined = " ".join(points) return f"Analysis: {combined[:50]}..." + # Reporter Agent @agent(role="Reporter", goal="Write final report") class ReporterAgent: async def write(self, summary: str) -> str: return f"Final Report:\n{summary}" + # Workflow async def run_workflow(): async with runtime(): diff --git a/crates/pulsing-actor/Cargo.toml b/crates/pulsing-actor/Cargo.toml index 4761a34a5..c2791a88f 100644 --- a/crates/pulsing-actor/Cargo.toml +++ b/crates/pulsing-actor/Cargo.toml @@ -16,6 +16,8 @@ integration = [] otlp = ["opentelemetry-otlp"] # Enable TLS support with passphrase-derived certificates tls = ["dep:rustls", "dep:tokio-rustls", "dep:rcgen", "dep:ring", "dep:rustls-pemfile", "dep:time"] +# Enable test helpers module for testing in downstream crates +test-helper = [] [dependencies] # Async runtime diff --git a/crates/pulsing-actor/src/actor/address.rs b/crates/pulsing-actor/src/actor/address.rs index 1bd61ba0a..75f249c84 100644 --- a/crates/pulsing-actor/src/actor/address.rs +++ b/crates/pulsing-actor/src/actor/address.rs @@ -1,13 +1,4 @@ -//! Actor addressing - URI-based actor addressing scheme -//! -//! This module implements the actor addressing scheme as defined in the design document. -//! -//! ## Address Types -//! -//! 1. Named Actor Service Address: `actor:///namespace/path/name` -//! 2. Named Actor Instance Address: `actor:///namespace/path/name@node_id` -//! 3. Global Actor Address: `actor://node_id/actor_id` -//! 4. 
Local Reference: `actor://0/actor_id` (node_id=0 means local) +//! Actor addressing (URI-based). use super::traits::NodeId; use serde::{Deserialize, Serialize}; @@ -65,14 +56,7 @@ impl fmt::Display for AddressParseError { impl std::error::Error for AddressParseError {} -/// Actor path for named actors (namespace + hierarchical path + name) -/// -/// A path must have at least two segments: namespace and name. -/// Additional segments can be used for logical grouping. -/// -/// Examples: -/// - `services/llm/router` (namespace: services, name: router) -/// - `workers/inference/pool` (namespace: workers, name: pool) +/// Actor path for named actors (namespace + path + name). #[derive(Clone, Debug, Hash, Eq, PartialEq, Serialize, Deserialize)] pub struct ActorPath { /// Path segments, e.g., ["services", "llm", "router"] @@ -89,34 +73,10 @@ impl ActorPath { /// Maximum single segment length pub const MAX_SEGMENT_LENGTH: usize = 64; - /// Create a new actor path from a string - /// - /// The path must have at least two segments (namespace/name). 
- /// - /// # Validation Rules - /// - Path cannot be empty - /// - Path cannot exceed 256 characters - /// - Each segment cannot exceed 64 characters - /// - Each segment can only contain alphanumeric characters, `_`, and `-` - /// - Path must have at least two segments (namespace/name) - /// - User code cannot use reserved system namespaces (use `new_system` for internal use) - /// - /// # Examples - /// ``` - /// use pulsing_actor::actor::ActorPath; - /// - /// let path = ActorPath::new("services/llm/router").unwrap(); - /// assert_eq!(path.namespace(), "services"); - /// assert_eq!(path.name(), "router"); - /// - /// // These will fail: - /// // ActorPath::new("system/internal").unwrap(); // Reserved namespace - /// // ActorPath::new("a".repeat(300)).unwrap(); // Too long - /// ``` - pub fn new(path: impl AsRef) -> Result { - let path = path.as_ref().trim_matches('/'); + /// Validate and parse path components (shared by `new()` and `new_system()`). + fn validate_path_components(path: &str) -> Result, AddressParseError> { + let path = path.trim_matches('/'); - // Check total path length if path.len() > Self::MAX_PATH_LENGTH { return Err(AddressParseError::PathTooLong); } @@ -127,7 +87,6 @@ impl ActorPath { let segments: Vec = path.split('/').map(|s| s.trim().to_string()).collect(); - // Validate segments for segment in &segments { if segment.is_empty() { return Err(AddressParseError::EmptySegment); @@ -140,11 +99,47 @@ impl ActorPath { } } - // Must have at least namespace/name if segments.len() < 2 { return Err(AddressParseError::MissingNamespace); } + Ok(segments) + } + + /// Check if a segment contains only valid characters + fn is_valid_segment(s: &str) -> bool { + !s.is_empty() + && s.chars() + .all(|c| c.is_alphanumeric() || c == '_' || c == '-') + } + + /// Create a new actor path from a string + /// + /// The path must have at least two segments (namespace/name). 
+ /// + /// # Validation Rules + /// - Path cannot be empty + /// - Path cannot exceed 256 characters + /// - Each segment cannot exceed 64 characters + /// - Each segment can only contain alphanumeric characters, `_`, and `-` + /// - Path must have at least two segments (namespace/name) + /// - User code cannot use reserved system namespaces (use `new_system` for internal use) + /// + /// # Examples + /// ``` + /// use pulsing_actor::actor::ActorPath; + /// + /// let path = ActorPath::new("services/llm/router").unwrap(); + /// assert_eq!(path.namespace(), "services"); + /// assert_eq!(path.name(), "router"); + /// + /// // These will fail: + /// // ActorPath::new("system/internal").unwrap(); // Reserved namespace + /// // ActorPath::new("a".repeat(300)).unwrap(); // Too long + /// ``` + pub fn new(path: impl AsRef) -> Result { + let segments = Self::validate_path_components(path.as_ref())?; + // Check for reserved system namespaces (user code cannot use these) if Self::SYSTEM_NAMESPACES.contains(&segments[0].as_str()) { return Err(AddressParseError::ReservedNamespace); @@ -164,47 +159,10 @@ impl ActorPath { /// - Python bindings for `PythonActorService` at `system/python_actor_service` #[doc(hidden)] pub fn new_system(path: impl AsRef) -> Result { - let path = path.as_ref().trim_matches('/'); - - // Check total path length - if path.len() > Self::MAX_PATH_LENGTH { - return Err(AddressParseError::PathTooLong); - } - - if path.is_empty() { - return Err(AddressParseError::MissingNamespace); - } - - let segments: Vec = path.split('/').map(|s| s.trim().to_string()).collect(); - - // Validate segments - for segment in &segments { - if segment.is_empty() { - return Err(AddressParseError::EmptySegment); - } - if segment.len() > Self::MAX_SEGMENT_LENGTH { - return Err(AddressParseError::SegmentTooLong); - } - if !Self::is_valid_segment(segment) { - return Err(AddressParseError::InvalidCharacter); - } - } - - // Must have at least namespace/name - if segments.len() < 2 { - 
return Err(AddressParseError::MissingNamespace); - } - + let segments = Self::validate_path_components(path.as_ref())?; Ok(Self { segments }) } - /// Check if a segment contains only valid characters - fn is_valid_segment(s: &str) -> bool { - !s.is_empty() - && s.chars() - .all(|c| c.is_alphanumeric() || c == '_' || c == '-') - } - /// Get the namespace (first segment) pub fn namespace(&self) -> &str { &self.segments[0] diff --git a/crates/pulsing-actor/src/actor/context.rs b/crates/pulsing-actor/src/actor/context.rs index 3f0da1994..c3453f10a 100644 --- a/crates/pulsing-actor/src/actor/context.rs +++ b/crates/pulsing-actor/src/actor/context.rs @@ -1,4 +1,4 @@ -//! Actor execution context +//! Actor execution context. use super::mailbox::Envelope; use super::reference::ActorRef; @@ -10,58 +10,38 @@ use std::time::Duration; use tokio::sync::mpsc; use tokio_util::sync::CancellationToken; -/// Context provided to actors during message handling -/// -/// Provides access to: -/// - `id()` - The actor's assigned ID -/// - `actor_ref()` - Get references to other actors -/// - `watch()`/`unwatch()` - Monitor other actors -/// - `schedule_self()` - Schedule a delayed message to self -/// - `is_cancelled()` - Check if shutdown was requested +/// Context provided to actors during message handling. pub struct ActorContext { - /// The actor's own ID actor_id: ActorId, - /// Local node ID node_id: Option, - /// Cancellation token for graceful shutdown cancel_token: CancellationToken, - /// Cached actor references actor_refs: HashMap, - /// System reference for spawning new actors system: Option>, - /// Self mailbox sender for schedule_self self_sender: Option>, - /// Named path (if this is a named actor) named_path: Option, } -/// Trait for system reference (to avoid circular dependency) +/// Trait for system reference. 
#[async_trait::async_trait] pub trait ActorSystemRef: Send + Sync { - /// Get an actor reference by ID async fn actor_ref(&self, id: &ActorId) -> anyhow::Result; - /// Get the local node ID fn node_id(&self) -> NodeId; - /// Watch an actor - will receive a termination message (ActorId, StopReason) when the watched actor stops async fn watch(&self, watcher: &ActorId, target: &ActorId) -> anyhow::Result<()>; - /// Stop watching an actor async fn unwatch(&self, watcher: &ActorId, target: &ActorId) -> anyhow::Result<()>; - /// Get a local actor reference by name (for behavior-based actors) fn local_actor_ref_by_name(&self, name: &str) -> Option; } impl ActorContext { - /// Create a new context (for testing) pub fn new(actor_id: ActorId) -> Self { Self { actor_id, @@ -74,7 +54,6 @@ impl ActorContext { } } - /// Create context with system reference pub fn with_system( actor_id: ActorId, system: Arc, @@ -93,7 +72,6 @@ impl ActorContext { } } - /// Create context with system reference and named path pub fn with_system_and_name( actor_id: ActorId, system: Arc, @@ -113,44 +91,35 @@ impl ActorContext { } } - /// Get the named path (if this is a named actor) pub fn named_path(&self) -> Option<&str> { self.named_path.as_deref() } - /// Get a reference to the actor system (if available) pub fn system(&self) -> Option> { self.system.clone() } - /// Get the actor's ID pub fn id(&self) -> &ActorId { &self.actor_id } - /// Get the local node ID pub fn node_id(&self) -> Option<&NodeId> { self.node_id.as_ref() } - /// Get the cancellation token pub fn cancel_token(&self) -> &CancellationToken { &self.cancel_token } - /// Check if shutdown was requested pub fn is_cancelled(&self) -> bool { self.cancel_token.is_cancelled() } - /// Get an actor reference pub async fn actor_ref(&mut self, id: &ActorId) -> anyhow::Result { - // Check cache first if let Some(r) = self.actor_refs.get(id) { return Ok(r.clone()); } - // Get from system if let Some(ref system) = self.system { let r = 
system.actor_ref(id).await?; self.actor_refs.insert(*id, r.clone()); @@ -160,18 +129,7 @@ impl ActorContext { Err(anyhow::anyhow!("No system reference available")) } - /// Schedule a delayed message to self - /// - /// Sends a message to this actor after the specified delay. - /// The message is serialized and sent as a fire-and-forget (tell pattern). - /// - /// # Example - /// ```ignore - /// ctx.schedule_self(MyMessage { value: 42 }, Duration::from_secs(5)); - /// ``` - /// - /// # Panics - /// Returns an error if the actor context doesn't have a self sender (e.g., in tests). + /// Schedule a delayed message to self. pub fn schedule_self( &self, msg: M, @@ -181,10 +139,8 @@ impl ActorContext { anyhow::anyhow!("No self sender available (context not fully initialized)") })?; - // Serialize the message let message = Message::pack(&msg)?; - // Spawn a task that waits for the delay and then sends the message tokio::spawn(async move { tokio::time::sleep(delay).await; let envelope = Envelope::tell(message); @@ -196,7 +152,7 @@ impl ActorContext { Ok(()) } - /// Watch another actor - will receive a termination message (ActorId, StopReason) when it stops + /// Watch another actor. pub async fn watch(&self, target: &ActorId) -> anyhow::Result<()> { if let Some(ref system) = self.system { system.watch(&self.actor_id, target).await @@ -205,7 +161,7 @@ impl ActorContext { } } - /// Stop watching another actor + /// Stop watching another actor. pub async fn unwatch(&self, target: &ActorId) -> anyhow::Result<()> { if let Some(ref system) = self.system { system.unwatch(&self.actor_id, target).await diff --git a/crates/pulsing-actor/src/actor/mailbox.rs b/crates/pulsing-actor/src/actor/mailbox.rs index 9b072e30f..1c4d78930 100644 --- a/crates/pulsing-actor/src/actor/mailbox.rs +++ b/crates/pulsing-actor/src/actor/mailbox.rs @@ -1,16 +1,15 @@ -//! Actor mailbox - message envelope and queue +//! Actor mailbox - message envelope and queue. 
use super::traits::Message; use tokio::sync::{mpsc, oneshot}; -/// Response channel type +/// Response channel type. pub type ResponseChannel = oneshot::Sender>; -/// Responder - sends response back to caller (no-op for tell pattern) +/// Responder - sends response back to caller (no-op for tell pattern). pub struct Responder(Option); impl Responder { - /// Send response (no-op if this was a tell) pub fn send(self, result: anyhow::Result) { if let Some(tx) = self.0 { let _ = tx.send(result); @@ -18,14 +17,13 @@ impl Responder { } } -/// Message envelope with optional response channel +/// Message envelope with optional response channel. pub struct Envelope { message: Message, respond_to: Option, } impl Envelope { - /// Create envelope for fire-and-forget (tell pattern) pub fn tell(message: Message) -> Self { Self { message, @@ -33,7 +31,6 @@ impl Envelope { } } - /// Create envelope for request-response (ask pattern) pub fn ask(message: Message, respond_to: ResponseChannel) -> Self { Self { message, @@ -41,17 +38,14 @@ impl Envelope { } } - /// Get the message type pub fn msg_type(&self) -> &str { self.message.msg_type() } - /// Decompose into message and responder pub fn into_parts(self) -> (Message, Responder) { (self.message, Responder(self.respond_to)) } - /// Check if this envelope expects a response pub fn expects_response(&self) -> bool { self.respond_to.is_some() } @@ -66,42 +60,35 @@ impl std::fmt::Debug for Envelope { } } -/// Mailbox capacity +/// Mailbox capacity. pub const DEFAULT_MAILBOX_SIZE: usize = 256; -/// Actor mailbox +/// Actor mailbox. 
pub struct Mailbox { - /// Sender half (cloneable) sender: mpsc::Sender, - /// Receiver half receiver: mpsc::Receiver, } impl Mailbox { - /// Create a new mailbox with default capacity pub fn new() -> Self { Self::with_capacity(DEFAULT_MAILBOX_SIZE) } - /// Create a new mailbox with specified capacity pub fn with_capacity(capacity: usize) -> Self { let (sender, receiver) = mpsc::channel(capacity); Self { sender, receiver } } - /// Get a clone of the sender pub fn sender(&self) -> mpsc::Sender { self.sender.clone() } - /// Take the receiver (consumes it) pub fn take_receiver(&mut self) -> mpsc::Receiver { let (_, new_rx) = mpsc::channel(1); std::mem::replace(&mut self.receiver, new_rx) } - /// Split into sender and receiver pub fn split(self) -> (mpsc::Sender, mpsc::Receiver) { (self.sender, self.receiver) } @@ -113,7 +100,7 @@ impl Default for Mailbox { } } -/// Mailbox sender wrapper with backpressure handling +/// Mailbox sender wrapper with backpressure handling. #[derive(Clone)] pub struct MailboxSender { inner: mpsc::Sender, @@ -124,7 +111,6 @@ impl MailboxSender { Self { inner: sender } } - /// Send a message (blocking if full) pub async fn send(&self, envelope: Envelope) -> anyhow::Result<()> { self.inner .send(envelope) @@ -132,14 +118,12 @@ impl MailboxSender { .map_err(|_| anyhow::anyhow!("Mailbox closed")) } - /// Try to send without blocking pub fn try_send(&self, envelope: Envelope) -> anyhow::Result<()> { self.inner .try_send(envelope) .map_err(|e| anyhow::anyhow!("Mailbox send failed: {}", e)) } - /// Check if mailbox is closed pub fn is_closed(&self) -> bool { self.inner.is_closed() } diff --git a/crates/pulsing-actor/src/actor/mod.rs b/crates/pulsing-actor/src/actor/mod.rs index b1efe093f..f963206a1 100644 --- a/crates/pulsing-actor/src/actor/mod.rs +++ b/crates/pulsing-actor/src/actor/mod.rs @@ -1,4 +1,4 @@ -//! Actor module - core actor abstractions +//! Actor module - core abstractions. 
mod address; mod context; diff --git a/crates/pulsing-actor/src/actor/reference.rs b/crates/pulsing-actor/src/actor/reference.rs index 63e1dbc33..85c044271 100644 --- a/crates/pulsing-actor/src/actor/reference.rs +++ b/crates/pulsing-actor/src/actor/reference.rs @@ -1,4 +1,4 @@ -//! Actor reference - location-transparent handle to an actor +//! Actor reference - location-transparent handle to an actor. use super::address::ActorPath; use super::mailbox::Envelope; @@ -8,83 +8,56 @@ use std::net::SocketAddr; use std::sync::Arc; use tokio::sync::{mpsc, oneshot, RwLock}; -/// Actor reference - handle for sending messages to an actor -/// -/// ActorRef supports two resolution modes: -/// - **Direct**: Uses a fixed connection (for local actors or known remote actors) -/// - **Lazy**: Re-resolves the actor on each operation (for named actors that may migrate) +/// Actor reference - handle for sending messages to an actor. #[derive(Clone)] pub struct ActorRef { - /// The target actor's ID (may be placeholder for lazy refs) pub(crate) actor_id: ActorId, - /// Inner implementation pub(crate) inner: ActorRefInner, } -/// Inner actor reference - direct or lazy resolution +/// Inner actor reference. #[derive(Clone)] pub enum ActorRefInner { - /// Local actor - direct channel access Local(mpsc::Sender), - /// Remote actor - via network transport Remote(Arc), - /// Lazy resolution - re-resolve on each call (for named actors) Lazy(Arc), } -/// Remote actor reference +/// Remote actor reference. pub struct RemoteActorRef { - /// Remote node address pub node_addr: SocketAddr, - /// Transport client pub transport: Arc, } -/// Lazy actor reference - resolves actor path on each operation -/// -/// This ensures the reference is always up-to-date, even if the actor -/// migrates to a different node. -/// -/// Uses double-checked locking pattern to avoid resolution storms -/// when multiple concurrent requests find the cache expired. +/// Lazy actor reference. 
pub struct LazyActorRef { - /// The named actor path (e.g., "services/echo") pub path: ActorPath, - /// Resolver function pub resolver: Arc, - /// Cached ActorRef (with version for staleness check) cache: RwLock>, - /// Lock to ensure only one thread refreshes the cache at a time - /// Prevents resolution storms under high concurrency refresh_lock: tokio::sync::Mutex<()>, } -/// Cached reference with version for staleness detection struct CachedRef { actor_ref: ActorRef, - /// Cache timestamp for TTL-based invalidation cached_at: std::time::Instant, } -/// Cache TTL - references older than this are re-resolved const CACHE_TTL: std::time::Duration = std::time::Duration::from_secs(5); -/// Trait for resolving actor paths to ActorRefs +/// Trait for resolving actor paths to ActorRefs. #[async_trait::async_trait] pub trait ActorResolver: Send + Sync { - /// Resolve a named actor path to an ActorRef async fn resolve_path(&self, path: &ActorPath) -> anyhow::Result; } impl LazyActorRef { - /// Create a new lazy actor reference pub fn new(path: ActorPath, resolver: Arc) -> Self { Self { path, @@ -94,13 +67,7 @@ impl LazyActorRef { } } - /// Get the underlying ActorRef, resolving if necessary - /// - /// Uses double-checked locking to prevent resolution storms: - /// 1. Fast path: check cache with read lock - /// 2. 
Slow path: acquire refresh lock, check again, then resolve async fn get(&self) -> anyhow::Result { - // Fast path: check cache with read lock { let cache = self.cache.read().await; if let Some(ref cached) = *cache { @@ -110,10 +77,8 @@ impl LazyActorRef { } } - // Slow path: acquire refresh lock to prevent concurrent resolution let _refresh_guard = self.refresh_lock.lock().await; - // Double-check: another thread may have refreshed while we waited { let cache = self.cache.read().await; if let Some(ref cached) = *cache { @@ -123,7 +88,6 @@ impl LazyActorRef { } } - // Now we're the only thread refreshing - safe to resolve let resolved = self.resolver.resolve_path(&self.path).await?; { let mut cache = self.cache.write().await; @@ -135,17 +99,15 @@ impl LazyActorRef { Ok(resolved) } - /// Invalidate the cache (call when actor is known to have moved) pub async fn invalidate(&self) { let mut cache = self.cache.write().await; *cache = None; } } -/// Trait for remote transport (HTTP/2, TCP, etc.) +/// Trait for remote transport (HTTP/2, TCP, etc.). #[async_trait::async_trait] pub trait RemoteTransport: Send + Sync { - /// Send a request and wait for response (low-level) async fn request( &self, actor_id: &ActorId, @@ -153,7 +115,6 @@ pub trait RemoteTransport: Send + Sync { payload: Vec, ) -> anyhow::Result>; - /// Send a one-way message (low-level) async fn send( &self, actor_id: &ActorId, @@ -161,10 +122,7 @@ pub trait RemoteTransport: Send + Sync { payload: Vec, ) -> anyhow::Result<()>; - /// Send a message and receive response (unified interface) - /// - /// This is the primary method for communication. It automatically handles - /// both single and stream responses based on the server's response type. + /// Send a message and receive response (unified interface). 
async fn send_message(&self, actor_id: &ActorId, msg: Message) -> anyhow::Result { let Message::Single { msg_type, data } = msg else { return Err(anyhow::anyhow!("Streaming requests not yet supported")); @@ -173,7 +131,6 @@ pub trait RemoteTransport: Send + Sync { Ok(Message::single("", response)) } - /// Send a one-way message (unified interface) async fn send_oneway(&self, actor_id: &ActorId, msg: Message) -> anyhow::Result<()> { let Message::Single { msg_type, data } = msg else { return Err(anyhow::anyhow!( @@ -185,7 +142,6 @@ pub trait RemoteTransport: Send + Sync { } impl ActorRef { - /// Create a local actor reference pub fn local(actor_id: ActorId, sender: mpsc::Sender) -> Self { Self { actor_id, @@ -193,7 +149,6 @@ impl ActorRef { } } - /// Create a remote actor reference pub fn remote( actor_id: ActorId, node_addr: SocketAddr, diff --git a/crates/pulsing-actor/src/actor/traits.rs b/crates/pulsing-actor/src/actor/traits.rs index b26d1d12f..2dbcecd9c 100644 --- a/crates/pulsing-actor/src/actor/traits.rs +++ b/crates/pulsing-actor/src/actor/traits.rs @@ -1,4 +1,4 @@ -//! Core Actor traits and types +//! Core actor traits and types. use async_trait::async_trait; use futures::Stream; @@ -10,32 +10,23 @@ use std::pin::Pin; use thiserror::Error; use tokio::sync::mpsc; -// ============================================================================ -// Identifiers -// ============================================================================ - -/// Node identifier in the cluster (0 = local) +/// Node identifier in the cluster (0 = local). 
#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq, Serialize, Deserialize, Default)] pub struct NodeId(pub u64); impl NodeId { - /// Local node id (0) pub const LOCAL: NodeId = NodeId(0); - /// Generate a new unique NodeId using UUID pub fn generate() -> Self { let uuid = uuid::Uuid::new_v4(); - // Use the lower 64 bits of UUID, ensure non-zero (0 is reserved for LOCAL) let id = uuid.as_u128() as u64; Self(if id == 0 { 1 } else { id }) } - /// Create from u64 pub fn new(id: u64) -> Self { Self(id) } - /// Check if this is the local node pub fn is_local(&self) -> bool { self.0 == 0 } @@ -48,28 +39,23 @@ impl fmt::Display for NodeId { } } -/// Actor identifier (globally unique) -/// High 64 bits = node id, Low 64 bits = local actor id +/// Actor identifier (globally unique). #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq, Serialize, Deserialize, Default)] pub struct ActorId(pub u128); impl ActorId { - /// Create a new ActorId from node id and local id pub fn new(node: NodeId, local_id: u64) -> Self { Self(((node.0 as u128) << 64) | (local_id as u128)) } - /// Create a local actor id pub fn local(local_id: u64) -> Self { Self::new(NodeId::LOCAL, local_id) } - /// Get the node id pub fn node(&self) -> NodeId { NodeId((self.0 >> 64) as u64) } - /// Get the local actor id pub fn local_id(&self) -> u64 { self.0 as u64 } @@ -82,58 +68,35 @@ impl fmt::Display for ActorId { } } -// ============================================================================ -// Lifecycle -// ============================================================================ - -/// Reason why an actor stopped +/// Reason why an actor stopped. 
#[derive(Clone, Debug, Error, Serialize, Deserialize)] pub enum StopReason { - /// Normal shutdown (graceful stop) #[error("Normal")] Normal, - /// Actor panicked or encountered an unrecoverable error #[error("Failed: {0}")] Failed(String), - /// Actor was killed/aborted #[error("Killed")] Killed, - /// System is shutting down #[error("SystemShutdown")] SystemShutdown, } -// ============================================================================ -// Messaging -// ============================================================================ - -/// Message stream type (stream of Single messages) +/// Message stream type (stream of Single messages). pub type MessageStream = Pin> + Send>>; -/// Unified message type for both requests and responses -/// -/// Message is an enum with two variants: -/// - `Single`: for traditional request-response with a single data payload -/// - `Stream`: for streaming scenarios composed of Single messages +/// Unified message type for both requests and responses. pub enum Message { - /// Single data message Single { - /// Message type identifier (empty for responses) msg_type: String, - /// Message data data: Vec, }, - /// Streaming data message (stream of Single messages) Stream { - /// Default message type (used if chunk doesn't specify one) default_msg_type: String, - /// Stream of Single messages stream: MessageStream, }, } impl Message { - /// Create a single message with type and data pub fn single(msg_type: impl Into, data: impl Into>) -> Self { Message::Single { msg_type: msg_type.into(), @@ -141,9 +104,6 @@ impl Message { } } - /// Pack a serializable value into a message - /// - /// Uses `std::any::type_name` to automatically generate the message type. 
pub fn pack(msg: &M) -> anyhow::Result { Ok(Message::Single { msg_type: std::any::type_name::().to_string(), @@ -151,7 +111,6 @@ impl Message { }) } - /// Unpack (deserialize) the message data into a specific type pub fn unpack(self) -> anyhow::Result { match self { Message::Single { data, .. } => Ok(bincode::deserialize(&data)?), @@ -159,7 +118,6 @@ impl Message { } } - /// Create a streaming message from a channel receiver pub fn from_channel( default_msg_type: impl Into, rx: mpsc::Receiver>, @@ -171,7 +129,6 @@ impl Message { } } - /// Create a streaming message from a stream pub fn stream(default_msg_type: impl Into, stream: S) -> Self where S: Stream> + Send + 'static, @@ -182,7 +139,6 @@ impl Message { } } - /// Get message type (for Single) or default message type (for Stream) pub fn msg_type(&self) -> &str { match self { Message::Single { msg_type, .. } => msg_type, @@ -192,12 +148,10 @@ impl Message { } } - /// Check if this is a single message pub fn is_single(&self) -> bool { matches!(self, Message::Single { .. }) } - /// Check if this is a streaming message pub fn is_stream(&self) -> bool { matches!(self, Message::Stream { .. }) } diff --git a/crates/pulsing-actor/src/behavior/context.rs b/crates/pulsing-actor/src/behavior/context.rs index 93d4d606d..4e549a963 100644 --- a/crates/pulsing-actor/src/behavior/context.rs +++ b/crates/pulsing-actor/src/behavior/context.rs @@ -1,5 +1,3 @@ -//! Typed actor context for behavior-based actors - use super::reference::TypedRef; use crate::actor::ActorId; use crate::actor::ActorSystemRef; @@ -9,20 +7,12 @@ use std::sync::Arc; use std::time::Duration; use tokio_util::sync::CancellationToken; -/// Context provided to behavior handlers -/// -/// Unlike the traditional ActorContext, this is parameterized by message type, -/// providing type-safe self-references and scheduling. +/// Context provided to behavior handlers. 
pub struct BehaviorContext { - /// Actor's name actor_name: String, - /// Actor's unique identifier actor_id: ActorId, - /// Reference to the actor system system: Arc, - /// Typed self-reference for receiving messages self_ref: TypedRef, - /// Cancellation token for graceful shutdown cancel_token: CancellationToken, _marker: PhantomData, } @@ -31,7 +21,6 @@ impl BehaviorContext where M: Serialize + DeserializeOwned + Send + 'static, { - /// Create a new behavior context pub(crate) fn new( actor_name: String, actor_id: ActorId, @@ -49,34 +38,24 @@ where } } - /// Get the actor's unique identifier pub fn actor_id(&self) -> &ActorId { &self.actor_id } - /// Get the actor's name pub fn name(&self) -> &str { &self.actor_name } - /// Get a typed reference to self - /// - /// This can be passed to other actors for reply-to patterns + /// Get a typed reference to self. pub fn self_ref(&self) -> TypedRef { self.self_ref.clone() } - /// Check if the actor should stop pub fn is_cancelled(&self) -> bool { self.cancel_token.is_cancelled() } - /// Get a typed reference to another behavior-based actor by name - /// - /// # Type Safety - /// - /// The caller must ensure the target actor actually accepts messages of type N. - /// This is a runtime check - if types don't match, sends will fail at serialization. + /// Get a typed reference to another behavior-based actor by name. pub fn typed_ref(&self, name: &str) -> TypedRef where N: Serialize + DeserializeOwned + Send + 'static, @@ -84,14 +63,8 @@ where TypedRef::from_name(name, self.system.clone()) } - /// Schedule a message to be sent to self after a delay - /// - /// The message will be delivered to this actor after the specified duration. - /// If the actor is stopped before the delay expires, the message will not be sent. - /// - /// The message goes through the actor's normal mailbox, ensuring proper ordering. + /// Schedule a message to be sent to self after a delay. 
pub fn schedule_self(&self, msg: M, delay: Duration) { - // Resolve the ActorRef upfront (ActorRef is Send + Sync) let actor_ref = match self.self_ref.as_untyped() { Ok(r) => r, Err(e) => { @@ -104,7 +77,6 @@ where tokio::spawn(async move { tokio::select! { _ = tokio::time::sleep(delay) => { - // Only send if actor is still running if !cancel.is_cancelled() { if let Err(e) = actor_ref.tell(msg).await { tracing::warn!("Failed to deliver scheduled message: {}", e); @@ -112,19 +84,16 @@ where } } _ = cancel.cancelled() => { - // Actor stopped, don't send the message tracing::debug!("Scheduled message cancelled due to actor stop"); } } }); } - /// Get a reference to the underlying actor system pub fn system(&self) -> &Arc { &self.system } - /// Get the cancellation token for cooperative shutdown pub fn cancel_token(&self) -> CancellationToken { self.cancel_token.clone() } diff --git a/crates/pulsing-actor/src/behavior/core.rs b/crates/pulsing-actor/src/behavior/core.rs index 39feda4a7..fc89b63e5 100644 --- a/crates/pulsing-actor/src/behavior/core.rs +++ b/crates/pulsing-actor/src/behavior/core.rs @@ -1,5 +1,3 @@ -//! Behavior definitions and combinators - use super::context::BehaviorContext; use super::reference::TypedRef; use crate::actor::ActorSystemRef; @@ -11,44 +9,32 @@ use std::marker::PhantomData; use std::sync::Arc; use tokio::sync::Mutex; -/// Action returned by a behavior after processing a message +/// Action returned by a behavior after processing a message. 
pub enum BehaviorAction { - /// Keep the current behavior Same, - /// Switch to a new behavior (state machine transition) Become(Behavior), - /// Stop the actor gracefully with optional reason Stop(Option), - /// Actor is already stopped (internal use) - /// This is returned when messages arrive after Stop AlreadyStopped, } impl BehaviorAction { - /// Create a Stop action without reason pub fn stop() -> Self { Self::Stop(None) } - /// Create a Stop action with reason pub fn stop_with_reason(reason: impl Into) -> Self { Self::Stop(Some(reason.into())) } - /// Check if this action indicates the actor should stop pub fn is_stop(&self) -> bool { matches!(self, Self::Stop(_) | Self::AlreadyStopped) } } -/// The core behavior function type pub type BehaviorFn = Box) -> BoxFuture<'_, BehaviorAction> + Send>; -/// A behavior wraps a message-handling function -/// -/// Behaviors are the fundamental building block of this actor model. -/// An actor is simply a behavior that processes messages. +/// A behavior wraps a message-handling function. pub struct Behavior { inner: BehaviorFn, _marker: PhantomData, @@ -58,7 +44,6 @@ impl Behavior where M: Send + 'static, { - /// Create a new behavior from a function pub fn new(f: F) -> Self where F: FnMut(M, &mut BehaviorContext) -> BoxFuture<'_, BehaviorAction> + Send + 'static, @@ -69,19 +54,12 @@ where } } - /// Process a message with this behavior pub async fn receive(&mut self, msg: M, ctx: &mut BehaviorContext) -> BehaviorAction { (self.inner)(msg, ctx).await } } -/// IntoActor implementation for Behavior -/// -/// This allows Behavior to be passed directly to `spawn` and `spawn_named`: -/// ```rust,ignore -/// let counter = system.spawn(counter(0)).await?; -/// let counter = system.spawn_named("counter", counter(0)).await?; -/// ``` +/// IntoActor implementation for Behavior. 
impl IntoActor for Behavior where M: Serialize + DeserializeOwned + Send + Sync + 'static, @@ -93,25 +71,7 @@ where } } -/// A wrapper that allows Behavior to be used as an Actor -/// -/// This wrapper implements the Actor trait, allowing behaviors to be spawned -/// using the standard `system.spawn()` and `system.spawn_named()` methods. -/// -/// # Example -/// -/// ```rust,ignore -/// fn counter(init: i32) -> Behavior { -/// stateful(init, |count, n, _ctx| { -/// *count += n; -/// BehaviorAction::Same -/// }) -/// } -/// -/// // Use as Actor via IntoActor trait -/// let counter = system.spawn(counter(0)).await?; -/// let counter = system.spawn_named("counter", counter(0)).await?; -/// ``` +/// Wrapper that allows Behavior to be used as an Actor. pub struct BehaviorWrapper where M: Serialize + DeserializeOwned + Send + 'static, @@ -125,7 +85,6 @@ impl BehaviorWrapper where M: Serialize + DeserializeOwned + Send + 'static, { - /// Create a new BehaviorWrapper from a Behavior pub fn new(behavior: Behavior) -> Self { Self { behavior: Mutex::new(behavior), @@ -150,10 +109,8 @@ where M: Serialize + DeserializeOwned + Send + Sync + 'static, { async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { - // Deserialize the incoming message let typed_msg: M = msg.unpack()?; - // Get mutable access to behavior and context let mut behavior = self.behavior.lock().await; let mut ctx_guard = self.behavior_ctx.lock().await; @@ -161,10 +118,8 @@ where .as_mut() .ok_or_else(|| anyhow::anyhow!("BehaviorContext not initialized"))?; - // Process the message let action = behavior.receive(typed_msg, ctx).await; - // Handle the action match action { BehaviorAction::Same => Message::pack(&()), BehaviorAction::Become(new_behavior) => { diff --git a/crates/pulsing-actor/src/behavior/mod.rs b/crates/pulsing-actor/src/behavior/mod.rs index 08d859c41..e23cc3b39 100644 --- a/crates/pulsing-actor/src/behavior/mod.rs +++ b/crates/pulsing-actor/src/behavior/mod.rs @@ 
-1,50 +1,4 @@ -//! Behavior-based Actor Programming Model -//! -//! This module provides a type-safe, functional actor programming interface -//! inspired by Akka Typed. It shares the underlying ActorSystem infrastructure -//! with the traditional Actor trait but offers a completely new programming model. -//! -//! # Key Concepts -//! -//! - **Behavior**: An actor is defined as a message-handling function -//! - **TypedRef**: Type-safe actor reference with compile-time message checking -//! - **BehaviorAction**: Control actor lifecycle and state transitions -//! -//! # Example -//! -//! ```rust,ignore -//! use pulsing_actor::behavior::*; -//! use pulsing_actor::prelude::*; -//! -//! // Define message type -//! #[derive(Serialize, Deserialize)] -//! enum CounterMsg { -//! Increment(i32), -//! Get { reply_to: TypedRef }, -//! } -//! -//! // Define actor as a function returning Behavior -//! fn counter(initial: i32) -> Behavior { -//! stateful(initial, |value, msg, ctx| { -//! Box::pin(async move { -//! match msg { -//! CounterMsg::Increment(n) => { -//! *value += n; -//! BehaviorAction::Same -//! } -//! CounterMsg::Get { reply_to } => { -//! let _ = reply_to.tell(*value).await; -//! BehaviorAction::Same -//! } -//! } -//! }) -//! }) -//! } -//! -//! // Spawn using standard spawn/spawn_named - Behavior implements IntoActor -//! let actor_ref = system.spawn_named("actors/counter", counter(0)).await?; -//! actor_ref.tell(CounterMsg::Increment(5)).await?; -//! ``` +//! Behavior-based actor programming model. mod context; mod core; diff --git a/crates/pulsing-actor/src/behavior/reference.rs b/crates/pulsing-actor/src/behavior/reference.rs index 9d1755cc5..eb2f34bc2 100644 --- a/crates/pulsing-actor/src/behavior/reference.rs +++ b/crates/pulsing-actor/src/behavior/reference.rs @@ -1,5 +1,3 @@ -//! 
Typed actor references - use crate::actor::ActorRef; use crate::actor::ActorSystemRef; use serde::{de::DeserializeOwned, Serialize}; @@ -7,35 +5,13 @@ use std::marker::PhantomData; use std::sync::Arc; use std::time::Duration; -/// Resolution mode for TypedRef #[derive(Clone)] enum ResolutionMode { - /// Direct reference - always use this ActorRef Direct(ActorRef), - /// Dynamic resolution - resolve by name each time (no caching) Dynamic(Arc), } -/// A type-safe actor reference -/// -/// Unlike `ActorRef`, `TypedRef` knows the message type at compile time, -/// providing type-safe message sending. -/// -/// # Resolution Strategy -/// -/// - **Direct mode**: Created with `TypedRef::new()`, uses the provided ActorRef directly -/// - **Dynamic mode**: Created via spawn, resolves actor by name on each call (no stale cache) -/// -/// # Example -/// -/// ```rust,ignore -/// // Spawn behavior directly (Behavior implements IntoActor) -/// let counter = system.spawn_named("actors/counter", counter_behavior).await?; -/// -/// // Or wrap with TypedRef for type-safe sending -/// let counter: TypedRef = TypedRef::new("actors/counter", counter); -/// counter.tell(CounterMsg::Increment(5)).await?; -/// ``` +/// A type-safe actor reference. pub struct TypedRef { name: String, mode: ResolutionMode, @@ -71,10 +47,7 @@ impl TypedRef where M: Serialize + DeserializeOwned + Send + 'static, { - /// Create a typed reference wrapping an existing ActorRef (direct mode) - /// - /// The provided ActorRef is used directly without re-resolution. - /// Use this when you have a known, stable actor reference. + /// Create a typed reference wrapping an existing ActorRef. pub fn new(name: &str, inner: ActorRef) -> Self { Self { name: name.to_string(), @@ -83,10 +56,7 @@ where } } - /// Create a typed reference from a name (dynamic resolution mode) - /// - /// The actor is resolved by name on each operation, ensuring - /// the reference is always up-to-date (no stale cache). 
+ /// Create a typed reference from a name. pub(crate) fn from_name(name: &str, system: Arc) -> Self { Self { name: name.to_string(), @@ -95,15 +65,10 @@ where } } - /// Get the actor's name pub fn name(&self) -> &str { &self.name } - /// Resolve the underlying ActorRef - /// - /// - Direct mode: returns the stored ActorRef - /// - Dynamic mode: looks up actor by name (fresh each time) fn resolve(&self) -> anyhow::Result { match &self.mode { ResolutionMode::Direct(inner) => Ok(inner.clone()), @@ -113,23 +78,13 @@ where } } - /// Send a message without waiting for response (fire-and-forget) - /// - /// This is type-safe: only messages of type M can be sent. + /// Send a message without waiting for response. pub async fn tell(&self, msg: M) -> anyhow::Result<()> { let actor_ref = self.resolve()?; actor_ref.tell(msg).await } - /// Send a message and wait for a response - /// - /// # Type Parameters - /// - /// - `M`: The message type (input) - /// - `R`: The expected response type - /// - /// Note: The response type is not checked at compile time for the receiver. - /// Ensure the target actor returns the expected type. + /// Send a message and wait for a response. pub async fn ask(&self, msg: M) -> anyhow::Result where R: DeserializeOwned, @@ -138,7 +93,7 @@ where actor_ref.ask(msg).await } - /// Send a message and wait for a response with timeout + /// Send a message and wait for a response with timeout. pub async fn ask_timeout(&self, msg: M, timeout: Duration) -> anyhow::Result where R: DeserializeOwned, @@ -148,14 +103,11 @@ where .map_err(|_| anyhow::anyhow!("Ask timeout after {:?}", timeout))? } - /// Get the underlying untyped ActorRef - /// - /// Useful when you need to interact with APIs that expect ActorRef + /// Get the underlying untyped ActorRef. 
pub fn as_untyped(&self) -> anyhow::Result { self.resolve() } - /// Check if the referenced actor is currently alive pub fn is_alive(&self) -> bool { self.resolve().is_ok() } diff --git a/crates/pulsing-actor/src/cluster/backends/gossip.rs b/crates/pulsing-actor/src/cluster/backends/gossip.rs index 07531a81a..00827cf0e 100644 --- a/crates/pulsing-actor/src/cluster/backends/gossip.rs +++ b/crates/pulsing-actor/src/cluster/backends/gossip.rs @@ -1,6 +1,4 @@ -//! Gossip backend implementation -//! -//! Wraps the existing GossipCluster to implement the NamingBackend trait. +//! Gossip backend implementation. use crate::actor::{ActorId, ActorPath, NodeId, StopReason}; use crate::cluster::NamingBackend; @@ -15,16 +13,12 @@ use std::net::SocketAddr; use std::sync::Arc; use tokio_util::sync::CancellationToken; -/// Gossip-based naming backend -/// -/// This wraps the existing GossipCluster implementation to provide -/// the NamingBackend trait interface. +/// Gossip-based naming backend. pub struct GossipBackend { cluster: Arc, } impl GossipBackend { - /// Create a new GossipBackend pub fn new( local_node: NodeId, local_addr: SocketAddr, @@ -37,9 +31,7 @@ impl GossipBackend { } } - /// Get a reference to the inner GossipCluster - /// - /// This is needed for SystemMessageHandler to access handle_gossip method. + /// Get a reference to the inner GossipCluster. 
pub fn inner(&self) -> &GossipCluster { &self.cluster } @@ -47,10 +39,6 @@ impl GossipBackend { #[async_trait] impl NamingBackend for GossipBackend { - // ======================================================================== - // Node Management - // ======================================================================== - async fn join(&self, seeds: Vec) -> anyhow::Result<()> { self.cluster.join(seeds).await } @@ -71,10 +59,6 @@ impl NamingBackend for GossipBackend { self.cluster.get_member(node_id).await } - // ======================================================================== - // Named Actor Registration - // ======================================================================== - async fn register_named_actor(&self, path: ActorPath) { self.cluster.register_named_actor(path).await } @@ -100,10 +84,6 @@ impl NamingBackend for GossipBackend { .await } - // ======================================================================== - // Named Actor Queries - // ======================================================================== - async fn lookup_named_actor(&self, path: &ActorPath) -> Option { self.cluster.lookup_named_actor(path).await } @@ -127,10 +107,6 @@ impl NamingBackend for GossipBackend { self.cluster.all_named_actors().await } - // ======================================================================== - // Actor Registration - // ======================================================================== - async fn register_actor(&self, actor_id: ActorId) { self.cluster.register_actor(actor_id).await } @@ -143,10 +119,6 @@ impl NamingBackend for GossipBackend { self.cluster.lookup_actor(actor_id).await } - // ======================================================================== - // Lifecycle Management - // ======================================================================== - fn start(&self, cancel: CancellationToken) { self.cluster.start(cancel) } diff --git a/crates/pulsing-actor/src/cluster/backends/head.rs 
b/crates/pulsing-actor/src/cluster/backends/head.rs index 92597d31b..505e0c5bc 100644 --- a/crates/pulsing-actor/src/cluster/backends/head.rs +++ b/crates/pulsing-actor/src/cluster/backends/head.rs @@ -1,8 +1,4 @@ -//! Head node backend implementation -//! -//! Implements a centralized naming backend where: -//! - Head node: Maintains global registry of nodes and named actors -//! - Worker nodes: Sync with head node via HTTP/2 +//! Head node backend implementation. use crate::actor::{ActorId, ActorPath, NodeId, StopReason}; use crate::cluster::{ @@ -19,18 +15,11 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; -// ============================================================================ -// Configuration -// ============================================================================ - -/// Configuration for head node backend +/// Configuration for head node backend. #[derive(Clone, Debug)] pub struct HeadNodeConfig { - /// Sync interval for worker nodes (default: 5s) pub sync_interval: Duration, - /// Heartbeat interval for worker nodes (default: 10s) pub heartbeat_interval: Duration, - /// Heartbeat timeout for head node (default: 30s) pub heartbeat_timeout: Duration, } @@ -44,23 +33,12 @@ impl Default for HeadNodeConfig { } } -// ============================================================================ -// Node Mode -// ============================================================================ - #[derive(Clone, Debug)] enum NodeMode { - /// Head node mode Head, - /// Worker node mode with head node address Worker { head_addr: SocketAddr }, } -// ============================================================================ -// Head Node State -// ============================================================================ - -/// Node registration information #[derive(Clone, Debug, Serialize, Deserialize)] struct NodeRegistration { node_id: NodeId, @@ -68,13 +46,9 @@ struct 
NodeRegistration { last_heartbeat: u64, // milliseconds since epoch } -/// Head node state (only used in head mode) struct HeadNodeState { - /// Registered nodes nodes: HashMap, - /// Named actors registry named_actors: HashMap, - /// Actor registry actors: HashMap, } @@ -87,7 +61,6 @@ impl HeadNodeState { } } - /// Register or update a node fn register_node(&mut self, node_id: NodeId, addr: SocketAddr) { let now = HeadNodeBackend::now_millis(); self.nodes.insert( @@ -100,7 +73,6 @@ impl HeadNodeState { ); } - /// Update node heartbeat fn update_heartbeat(&mut self, node_id: &NodeId) -> bool { if let Some(reg) = self.nodes.get_mut(node_id) { reg.last_heartbeat = HeadNodeBackend::now_millis(); @@ -110,7 +82,6 @@ impl HeadNodeState { } } - /// Remove stale nodes (heartbeat timeout) fn remove_stale_nodes(&mut self, timeout_ms: u64) -> Vec { let now = HeadNodeBackend::now_millis(); let mut removed = Vec::new(); @@ -124,7 +95,6 @@ impl HeadNodeState { } }); - // Clean up named actors and actors for removed nodes for node_id in &removed { self.named_actors.values_mut().for_each(|info| { info.remove_instance(node_id); @@ -136,7 +106,6 @@ impl HeadNodeState { removed } - /// Register a named actor fn register_named_actor( &mut self, path: ActorPath, @@ -159,7 +128,6 @@ impl HeadNodeState { } } - /// Unregister a named actor fn unregister_named_actor(&mut self, path: &ActorPath, node_id: &NodeId) { let key = path.as_str().to_string(); if let Some(info) = self.named_actors.get_mut(&key) { @@ -170,17 +138,14 @@ impl HeadNodeState { } } - /// Register an actor fn register_actor(&mut self, actor_id: ActorId, node_id: NodeId) { self.actors.insert(actor_id, node_id); } - /// Unregister an actor fn unregister_actor(&mut self, actor_id: &ActorId) { self.actors.remove(actor_id); } - /// Get all members as MemberInfo fn all_members(&self) -> Vec { self.nodes .values() @@ -195,7 +160,6 @@ impl HeadNodeState { .collect() } - /// Get all named actors fn all_named_actors(&self) -> Vec { 
self.named_actors.values().cloned().collect() } diff --git a/crates/pulsing-actor/src/cluster/backends/mod.rs b/crates/pulsing-actor/src/cluster/backends/mod.rs index 40fbb31f9..39154e5f5 100644 --- a/crates/pulsing-actor/src/cluster/backends/mod.rs +++ b/crates/pulsing-actor/src/cluster/backends/mod.rs @@ -1,4 +1,4 @@ -//! Naming backend implementations +//! Naming backend implementations. mod gossip; mod head; diff --git a/crates/pulsing-actor/src/cluster/gossip.rs b/crates/pulsing-actor/src/cluster/gossip.rs index 6034ecff7..0f98a09ee 100644 --- a/crates/pulsing-actor/src/cluster/gossip.rs +++ b/crates/pulsing-actor/src/cluster/gossip.rs @@ -1,10 +1,4 @@ -//! Gossip protocol for cluster membership and actor discovery -//! -//! Implements a Redis Cluster-style gossip protocol with: -//! - MEET/PING/PONG message exchange -//! - Configuration epoch for conflict resolution -//! - Partial view propagation to reduce message size -//! - PFail/Fail failure detection +//! Gossip protocol for cluster membership and actor discovery. use super::member::{ ActorLocation, ClusterNode, FailureInfo, MemberInfo, MemberStatus, NamedActorInfo, @@ -23,11 +17,7 @@ use std::time::Duration; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; -// ============================================================================ -// Utility Functions -// ============================================================================ - -/// Get current timestamp in milliseconds since UNIX epoch +/// Get current timestamp in milliseconds since UNIX epoch. fn now_millis() -> u64 { std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -35,7 +25,7 @@ fn now_millis() -> u64 { .as_millis() as u64 } -/// Fix 0.0.0.0 addresses using peer's actual IP +/// Fix 0.0.0.0 addresses using peer's actual IP. 
fn fix_addr(addr: SocketAddr, peer_ip: std::net::IpAddr) -> SocketAddr { if addr.ip().is_unspecified() { SocketAddr::new(peer_ip, addr.port()) @@ -44,35 +34,18 @@ fn fix_addr(addr: SocketAddr, peer_ip: std::net::IpAddr) -> SocketAddr { } } -// ============================================================================ -// Configuration -// ============================================================================ - -/// Gossip protocol configuration +/// Gossip protocol configuration. #[derive(Clone, Debug)] pub struct GossipConfig { - /// Interval between gossip rounds pub gossip_interval: Duration, - /// Number of nodes to gossip with per round (fanout) pub fanout: usize, - /// Number of times to probe each seed node on startup pub seed_probe_count: usize, - /// Delay between seed probes pub seed_probe_interval: Duration, - /// Interval for periodic seed re-probing (None to disable) pub seed_rejoin_interval: Option, - /// Timeout before marking a node as PFail pub failure_timeout: Duration, - /// Grace period before marking Fail nodes as Tombstone pub cleanup_grace_period: Duration, - /// Tombstone retention period before final removal - /// Tombstoned nodes won't be removed immediately, allowing them to recover - /// and rejoin without causing routing churn pub tombstone_retention: Duration, - /// Observation window: if a Fail node is seen alive within this window, - /// it will be recovered instead of tombstoned pub recovery_observation_window: Duration, - /// SWIM config (for ping interval and suspicion timeout) pub swim: SwimConfig, } @@ -84,37 +57,24 @@ impl Default for GossipConfig { seed_probe_count: 3, seed_probe_interval: Duration::from_millis(100), seed_rejoin_interval: Some(Duration::from_secs(15)), - // Increased from 5s to 15s for better tolerance in high-load scenarios - // In large-scale stress tests, gossip messages may be delayed due to high load failure_timeout: Duration::from_secs(15), - // Grace period before tombstoning (increased for 
stability) cleanup_grace_period: Duration::from_secs(60), - // Tombstone retention: 5 minutes before final removal - // This allows nodes to recover from longer network partitions tombstone_retention: Duration::from_secs(300), - // Recovery observation window: 30 seconds - // If we see the node alive within this window, recover it recovery_observation_window: Duration::from_secs(30), swim: SwimConfig::default(), } } } -// ============================================================================ -// Messages -// ============================================================================ - -/// Gossip protocol messages +/// Gossip protocol messages. #[derive(Debug, Clone, Serialize, Deserialize)] pub enum GossipMessage { - /// MEET: Invite a new node to join the cluster Meet { from: NodeId, from_addr: SocketAddr, current_epoch: u64, }, - /// PING: Periodic probe with partial cluster view Ping { from: NodeId, current_epoch: u64, @@ -123,7 +83,6 @@ pub enum GossipMessage { named_actors: Option>, }, - /// PONG: Response to PING/MEET Pong { from: NodeId, current_epoch: u64, @@ -132,7 +91,6 @@ pub enum GossipMessage { named_actors: Option>, }, - // Legacy actor messages (kept for compatibility) ActorRegistered { location: ActorLocation, }, @@ -142,10 +100,8 @@ pub enum GossipMessage { NamedActorRegistered { path: ActorPath, node_id: NodeId, - /// Actor ID (optional for backward compatibility) #[serde(default)] actor_id: Option, - /// Metadata (e.g., Python class, module, file path) #[serde(default)] metadata: std::collections::HashMap, }, @@ -160,11 +116,7 @@ pub enum GossipMessage { }, } -// ============================================================================ -// Shared State -// ============================================================================ - -/// Shared cluster state (used by both GossipCluster and background tasks) +/// Shared cluster state. 
struct ClusterState { local_node: NodeId, local_addr: SocketAddr, diff --git a/crates/pulsing-actor/src/cluster/member.rs b/crates/pulsing-actor/src/cluster/member.rs index 1403c8c37..9351fb590 100644 --- a/crates/pulsing-actor/src/cluster/member.rs +++ b/crates/pulsing-actor/src/cluster/member.rs @@ -1,4 +1,4 @@ -//! Cluster member types +//! Cluster member types. use crate::actor::{ActorId, ActorPath, NodeId}; use serde::{Deserialize, Serialize}; @@ -6,23 +6,13 @@ use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use std::time::Instant; -// ============================================================================ -// New Gossip Protocol (Redis Cluster style) -// ============================================================================ - -/// Node status in the new gossip protocol (Redis Cluster style) +/// Node status in the new gossip protocol. #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] pub enum NodeStatus { - /// Node is online and healthy Online = 0, - /// Node is possibly failed (local detection, not confirmed) PFail = 1, - /// Node is confirmed failed (majority of nodes agree) Fail = 2, - /// Node is in handshake (new node joining) Handshake = 3, - /// Node is tombstoned (failed + grace period expired) - /// Named actors are cleared but node info is retained for recovery Tombstone = 4, } @@ -44,18 +34,13 @@ impl NodeStatus { } } -/// Cluster node information (new format) +/// Cluster node information. 
#[derive(Clone, Debug, Serialize, Deserialize)] pub struct ClusterNode { - /// Node identifier pub node_id: NodeId, - /// Network address pub addr: SocketAddr, - /// Current status pub status: NodeStatus, - /// Configuration epoch (for conflict resolution) pub epoch: u64, - /// Last seen timestamp (milliseconds since epoch) pub last_seen: u64, } @@ -73,44 +58,29 @@ impl ClusterNode { } } - /// Check if this node info supersedes another (based on epoch) pub fn supersedes(&self, other: &ClusterNode) -> bool { - // Higher epoch always wins if self.epoch != other.epoch { return self.epoch > other.epoch; } - // Same epoch: Fail > PFail > Online self.status > other.status } } -/// Failure information to propagate via gossip +/// Failure information to propagate via gossip. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct FailureInfo { - /// Node that failed pub node_id: NodeId, - /// Failure status (PFail or Fail) pub status: NodeStatus, - /// Epoch when failure was detected pub epoch: u64, - /// Node that reported the failure pub reported_by: NodeId, } -// ============================================================================ -// Legacy types (kept for backward compatibility) -// ============================================================================ - -/// Member status in the cluster +/// Member status in the cluster. #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum MemberStatus { - /// Member is alive and healthy Alive, - /// Member is suspected to be down (not responding to pings) Suspect, - /// Member is confirmed dead Dead, - /// Member is leaving the cluster gracefully Leaving, } @@ -124,32 +94,19 @@ impl MemberStatus { } } -/// Information about a cluster member +/// Information about a cluster member. 
#[derive(Clone, Debug, Serialize, Deserialize)] pub struct MemberInfo { - /// Node identifier pub node_id: NodeId, - - /// Network address (for TCP communication) pub addr: SocketAddr, - - /// Gossip address (for UDP gossip) pub gossip_addr: SocketAddr, - - /// Current status pub status: MemberStatus, - - /// Incarnation number (for conflict resolution) - /// Higher incarnation wins in case of conflicting information pub incarnation: u64, - - /// Timestamp of last update (not serialized, local only) #[serde(skip)] pub last_update: Option, } impl MemberInfo { - /// Create a new member info pub fn new(node_id: NodeId, addr: SocketAddr, gossip_addr: SocketAddr) -> Self { Self { node_id, @@ -161,14 +118,12 @@ impl MemberInfo { } } - /// Update incarnation number (used when refuting suspicion) pub fn refute(&mut self) { self.incarnation += 1; self.status = MemberStatus::Alive; self.last_update = Some(Instant::now()); } - /// Mark as suspect pub fn suspect(&mut self) { if self.status == MemberStatus::Alive { self.status = MemberStatus::Suspect; @@ -176,20 +131,15 @@ impl MemberInfo { } } - /// Mark as dead pub fn mark_dead(&mut self) { self.status = MemberStatus::Dead; self.last_update = Some(Instant::now()); } - /// Check if this info supersedes another (based on incarnation) pub fn supersedes(&self, other: &MemberInfo) -> bool { - // Higher incarnation always wins if self.incarnation != other.incarnation { return self.incarnation > other.incarnation; } - - // Same incarnation: Dead > Suspect > Alive matches!( (&self.status, &other.status), (MemberStatus::Dead, _) | (MemberStatus::Suspect, MemberStatus::Alive) diff --git a/crates/pulsing-actor/src/cluster/mod.rs b/crates/pulsing-actor/src/cluster/mod.rs index 9ea0f10c5..a3428448c 100644 --- a/crates/pulsing-actor/src/cluster/mod.rs +++ b/crates/pulsing-actor/src/cluster/mod.rs @@ -1,9 +1,4 @@ -//! Cluster module - Gossip-based service discovery -//! -//! Implements a SWIM-like protocol for: -//! 
- Cluster membership management -//! - Actor location discovery (named actors with multi-instance support) -//! - Failure detection +//! Cluster module. mod gossip; mod member; @@ -20,7 +15,5 @@ pub use member::{ pub use naming::NamingBackend; pub use swim::{SwimConfig, SwimDetector, SwimMessage}; -// Re-export backends for convenience pub use backends::GossipBackend; -// Re-export head node backend types (via backends module's re-exports) pub use backends::{HeadNodeBackend, HeadNodeConfig}; diff --git a/crates/pulsing-actor/src/cluster/naming.rs b/crates/pulsing-actor/src/cluster/naming.rs index dcb937000..0c4f30207 100644 --- a/crates/pulsing-actor/src/cluster/naming.rs +++ b/crates/pulsing-actor/src/cluster/naming.rs @@ -1,9 +1,4 @@ -//! Naming backend trait for abstracting different naming service implementations -//! -//! This trait provides a unified interface for: -//! - Node discovery and membership management -//! - Named actor registration and discovery -//! - Actor location queries +//! Naming backend trait. use crate::actor::{ActorId, ActorPath, NodeId, StopReason}; use crate::cluster::member::{MemberInfo, NamedActorInfo, NamedActorInstance}; @@ -12,36 +7,21 @@ use std::collections::HashMap; use std::net::SocketAddr; use tokio_util::sync::CancellationToken; -/// Trait for naming backends that provide cluster membership and actor discovery +/// Trait for naming backends that provide cluster membership and actor discovery. 
#[async_trait] pub trait NamingBackend: Send + Sync { - // ======================================================================== - // Node Management - // ======================================================================== - - /// Join the cluster via seed nodes async fn join(&self, seeds: Vec) -> anyhow::Result<()>; - /// Leave the cluster gracefully async fn leave(&self) -> anyhow::Result<()>; - /// Get all cluster members async fn all_members(&self) -> Vec; - /// Get only alive cluster members async fn alive_members(&self) -> Vec; - /// Get member information for a specific node async fn get_member(&self, node_id: &NodeId) -> Option; - // ======================================================================== - // Named Actor Registration - // ======================================================================== - - /// Register a named actor (legacy, without actor_id) async fn register_named_actor(&self, path: ActorPath); - /// Register a named actor with full details (actor_id and metadata) async fn register_named_actor_full( &self, path: ActorPath, @@ -49,60 +29,34 @@ pub trait NamingBackend: Send + Sync { metadata: HashMap, ); - /// Unregister a named actor async fn unregister_named_actor(&self, path: &ActorPath); - /// Broadcast that a named actor has failed async fn broadcast_named_actor_failed(&self, path: &ActorPath, reason: &StopReason); - // ======================================================================== - // Named Actor Queries - // ======================================================================== - - /// Lookup named actor information async fn lookup_named_actor(&self, path: &ActorPath) -> Option; - /// Select a named actor instance (for load balancing) async fn select_named_actor_instance(&self, path: &ActorPath) -> Option; - /// Get all instances of a named actor async fn get_named_actor_instances(&self, path: &ActorPath) -> Vec; - /// Get detailed instance information for a named actor async fn 
get_named_actor_instances_detailed( &self, path: &ActorPath, ) -> Vec<(MemberInfo, Option)>; - /// Get all named actors in the cluster async fn all_named_actors(&self) -> Vec; - // ======================================================================== - // Actor Registration (optional, some backends may not support) - // ======================================================================== - - /// Register an actor (for non-named actors) async fn register_actor(&self, actor_id: ActorId); - /// Unregister an actor async fn unregister_actor(&self, actor_id: &ActorId); - /// Lookup actor location async fn lookup_actor(&self, actor_id: &ActorId) -> Option; - // ======================================================================== - // Lifecycle Management - // ======================================================================== - - /// Start the backend (e.g., start background tasks) fn start(&self, cancel: CancellationToken); - /// Get the local node ID fn local_node(&self) -> &NodeId; - /// Get the local node address fn local_addr(&self) -> SocketAddr; - /// Get as Any for downcasting fn as_any(&self) -> &dyn std::any::Any; } diff --git a/crates/pulsing-actor/src/cluster/swim.rs b/crates/pulsing-actor/src/cluster/swim.rs index 50dc76380..799610cc9 100644 --- a/crates/pulsing-actor/src/cluster/swim.rs +++ b/crates/pulsing-actor/src/cluster/swim.rs @@ -1,7 +1,4 @@ -//! SWIM failure detection protocol -//! -//! Implements the SWIM (Scalable Weakly-consistent Infection-style Membership) protocol -//! for detecting failed nodes in the cluster. +//! SWIM failure detection protocol. use crate::actor::NodeId; use serde::{Deserialize, Serialize}; @@ -11,19 +8,12 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use tokio::sync::RwLock; -/// SWIM configuration +/// SWIM configuration. 
#[derive(Clone, Debug)] pub struct SwimConfig { - /// Ping interval pub ping_interval: Duration, - - /// Ping timeout before indirect probe pub ping_timeout: Duration, - - /// Number of indirect probes pub indirect_probes: usize, - - /// Suspicion timeout before marking dead pub suspicion_timeout: Duration, } @@ -33,31 +23,28 @@ impl Default for SwimConfig { ping_interval: Duration::from_millis(500), ping_timeout: Duration::from_secs(2), indirect_probes: 3, - // Increased from 5s to 15s for better tolerance in high-load scenarios - // This gives nodes more time to respond before being marked as failed suspicion_timeout: Duration::from_secs(15), } } } -/// SWIM protocol messages +/// SWIM protocol messages. #[derive(Debug, Clone, Serialize, Deserialize)] pub enum SwimMessage { - /// Direct ping - Ping { seq: u64, from: NodeId }, - - /// Ping acknowledgment - Ack { seq: u64, from: NodeId }, - - /// Indirect ping request + Ping { + seq: u64, + from: NodeId, + }, + Ack { + seq: u64, + from: NodeId, + }, PingReq { seq: u64, from: NodeId, target: NodeId, target_addr: SocketAddr, }, - - /// Indirect ping acknowledgment PingReqAck { seq: u64, from: NodeId, @@ -65,13 +52,12 @@ pub enum SwimMessage { }, } -/// Pending ping state struct PendingPing { target: NodeId, sent_at: Instant, } -/// SWIM failure detector +/// SWIM failure detector. 
pub struct SwimDetector { local_node: NodeId, config: SwimConfig, @@ -91,7 +77,6 @@ impl Clone for SwimDetector { } impl SwimDetector { - /// Create a new SWIM detector pub fn new(local_node: NodeId, config: SwimConfig) -> Self { Self { local_node, @@ -101,12 +86,10 @@ impl SwimDetector { } } - /// Get ping interval pub fn ping_interval(&self) -> Duration { self.config.ping_interval } - /// Create a new ping message pub fn create_ping(&self) -> (u64, SwimMessage) { let seq = self.seq.fetch_add(1, Ordering::SeqCst); let ping = SwimMessage::Ping { @@ -116,7 +99,6 @@ impl SwimDetector { (seq, ping) } - /// Create an ack message pub fn create_ack(&self, seq: u64) -> SwimMessage { SwimMessage::Ack { seq, @@ -124,7 +106,6 @@ impl SwimDetector { } } - /// Record that a ping was sent pub async fn ping_sent(&self, seq: u64, target: NodeId) { let mut pending = self.pending_pings.write().await; pending.insert( @@ -136,14 +117,12 @@ impl SwimDetector { ); } - /// Record that an ack was received pub async fn ack_received(&self, seq: u64) { let mut pending = self.pending_pings.write().await; pending.remove(&seq); } - /// Check for ping timeouts - /// Returns (node_id, should_suspect) + /// Check for ping timeouts. pub async fn check_timeouts(&self) -> Vec<(NodeId, bool)> { let mut pending = self.pending_pings.write().await; let now = Instant::now(); diff --git a/crates/pulsing-actor/src/error.rs b/crates/pulsing-actor/src/error.rs new file mode 100644 index 000000000..0d247c8bb --- /dev/null +++ b/crates/pulsing-actor/src/error.rs @@ -0,0 +1,388 @@ +//! Unified error types for the actor system. + +use thiserror::Error; + +/// Unified error type for the Pulsing actor system +/// +/// This enum encompasses all error categories in the system. +/// It implements `From` for each sub-error type for easy conversion. 
+#[derive(Error, Debug)] +pub enum PulsingError { + /// Actor-related errors + #[error("Actor error: {0}")] + Actor(#[from] ActorError), + + /// Transport layer errors + #[error("Transport error: {0}")] + Transport(#[from] TransportError), + + /// Cluster-related errors + #[error("Cluster error: {0}")] + Cluster(#[from] ClusterError), + + /// Configuration errors + #[error("Configuration error: {0}")] + Config(#[from] ConfigError), + + /// I/O errors + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + /// Serialization/deserialization errors + #[error("Serialization error: {0}")] + Serialization(String), + + /// Timeout errors + #[error("Timeout: {0}")] + Timeout(String), + + /// Generic errors (for cases not covered by specific types) + #[error("{0}")] + Other(String), +} + +impl PulsingError { + /// Create a generic error from a message + pub fn other(msg: impl Into) -> Self { + Self::Other(msg.into()) + } + + /// Create a timeout error + pub fn timeout(msg: impl Into) -> Self { + Self::Timeout(msg.into()) + } + + /// Create a serialization error + pub fn serialization(msg: impl Into) -> Self { + Self::Serialization(msg.into()) + } +} + +impl From for PulsingError { + fn from(err: anyhow::Error) -> Self { + // Try to downcast to known error types + if let Some(actor_err) = err.downcast_ref::() { + return Self::Actor(actor_err.clone()); + } + if let Some(transport_err) = err.downcast_ref::() { + return Self::Transport(transport_err.clone()); + } + if let Some(cluster_err) = err.downcast_ref::() { + return Self::Cluster(cluster_err.clone()); + } + if let Some(config_err) = err.downcast_ref::() { + return Self::Config(config_err.clone()); + } + Self::Other(err.to_string()) + } +} + +/// Actor-related errors +#[derive(Error, Debug, Clone, PartialEq, Eq)] +pub enum ActorError { + /// Actor not found by name or ID + #[error("Actor not found: {name}")] + NotFound { name: String }, + + /// Actor already exists with the given name + #[error("Actor already 
exists: {name}")] + AlreadyExists { name: String }, + + /// Actor is not local to this node + #[error("Actor is not local: {name}")] + NotLocal { name: String }, + + /// Actor has stopped and cannot process messages + #[error("Actor stopped: {name}")] + Stopped { name: String }, + + /// Actor mailbox is full + #[error("Actor mailbox full: {name}")] + MailboxFull { name: String }, + + /// Invalid actor path format + #[error("Invalid actor path: {path}")] + InvalidPath { path: String }, + + /// Message type mismatch + #[error("Message type mismatch: expected {expected}, got {actual}")] + MessageTypeMismatch { expected: String, actual: String }, + + /// Actor spawn failed + #[error("Failed to spawn actor: {reason}")] + SpawnFailed { reason: String }, +} + +impl ActorError { + /// Create a "not found" error + pub fn not_found(name: impl Into) -> Self { + Self::NotFound { name: name.into() } + } + + /// Create an "already exists" error + pub fn already_exists(name: impl Into) -> Self { + Self::AlreadyExists { name: name.into() } + } + + /// Create a "mailbox full" error + pub fn mailbox_full(name: impl Into) -> Self { + Self::MailboxFull { name: name.into() } + } + + /// Create an "invalid path" error + pub fn invalid_path(path: impl Into) -> Self { + Self::InvalidPath { path: path.into() } + } + + /// Create a "spawn failed" error + pub fn spawn_failed(reason: impl Into) -> Self { + Self::SpawnFailed { + reason: reason.into(), + } + } +} + +/// Transport layer errors +#[derive(Error, Debug, Clone, PartialEq, Eq)] +pub enum TransportError { + /// Connection failed + #[error("Connection failed to {addr}: {reason}")] + ConnectionFailed { addr: String, reason: String }, + + /// Connection closed unexpectedly + #[error("Connection closed: {reason}")] + ConnectionClosed { reason: String }, + + /// Request timed out + #[error("Request timeout after {timeout_ms}ms")] + RequestTimeout { timeout_ms: u64 }, + + /// Invalid response from remote + #[error("Invalid response: 
{reason}")] + InvalidResponse { reason: String }, + + /// TLS error + #[error("TLS error: {reason}")] + TlsError { reason: String }, + + /// Protocol error (HTTP/2) + #[error("Protocol error: {reason}")] + ProtocolError { reason: String }, +} + +impl TransportError { + /// Create a connection failed error + pub fn connection_failed(addr: impl Into, reason: impl Into) -> Self { + Self::ConnectionFailed { + addr: addr.into(), + reason: reason.into(), + } + } + + /// Create a request timeout error + pub fn request_timeout(timeout_ms: u64) -> Self { + Self::RequestTimeout { timeout_ms } + } + + /// Create a TLS error + pub fn tls_error(reason: impl Into) -> Self { + Self::TlsError { + reason: reason.into(), + } + } +} + +/// Cluster-related errors +#[derive(Error, Debug, Clone, PartialEq, Eq)] +pub enum ClusterError { + /// Cluster not initialized + #[error("Cluster not initialized")] + NotInitialized, + + /// Node not found in cluster + #[error("Node not found: {node_id}")] + NodeNotFound { node_id: String }, + + /// Named actor not found + #[error("Named actor not found: {path}")] + NamedActorNotFound { path: String }, + + /// No healthy instances available + #[error("No healthy instances for: {path}")] + NoHealthyInstances { path: String }, + + /// Join failed + #[error("Failed to join cluster: {reason}")] + JoinFailed { reason: String }, + + /// Gossip protocol error + #[error("Gossip error: {reason}")] + GossipError { reason: String }, +} + +impl ClusterError { + /// Create a "not initialized" error + pub fn not_initialized() -> Self { + Self::NotInitialized + } + + /// Create a "node not found" error + pub fn node_not_found(node_id: impl Into) -> Self { + Self::NodeNotFound { + node_id: node_id.into(), + } + } + + /// Create a "named actor not found" error + pub fn named_actor_not_found(path: impl Into) -> Self { + Self::NamedActorNotFound { path: path.into() } + } + + /// Create a "no healthy instances" error + pub fn no_healthy_instances(path: impl Into) -> 
Self { + Self::NoHealthyInstances { path: path.into() } + } +} + +/// Configuration-related errors +#[derive(Error, Debug, Clone, PartialEq, Eq)] +pub enum ConfigError { + /// Invalid configuration value + #[error("Invalid configuration: {field} = {value} ({reason})")] + InvalidValue { + field: String, + value: String, + reason: String, + }, + + /// Missing required configuration + #[error("Missing required configuration: {field}")] + MissingRequired { field: String }, + + /// Conflicting configuration options + #[error("Conflicting configuration: {reason}")] + Conflicting { reason: String }, + + /// Address parsing error + #[error("Invalid address '{addr}': {reason}")] + InvalidAddress { addr: String, reason: String }, +} + +impl ConfigError { + /// Create an "invalid value" error + pub fn invalid_value( + field: impl Into, + value: impl Into, + reason: impl Into, + ) -> Self { + Self::InvalidValue { + field: field.into(), + value: value.into(), + reason: reason.into(), + } + } + + /// Create a "missing required" error + pub fn missing_required(field: impl Into) -> Self { + Self::MissingRequired { + field: field.into(), + } + } + + /// Create a "conflicting" error + pub fn conflicting(reason: impl Into) -> Self { + Self::Conflicting { + reason: reason.into(), + } + } + + /// Create an "invalid address" error + pub fn invalid_address(addr: impl Into, reason: impl Into) -> Self { + Self::InvalidAddress { + addr: addr.into(), + reason: reason.into(), + } + } +} + +/// Convenience type alias for results using PulsingError +pub type Result = std::result::Result; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_actor_error_display() { + let err = ActorError::not_found("my-actor"); + assert!(err.to_string().contains("my-actor")); + + let err = ActorError::already_exists("existing-actor"); + assert!(err.to_string().contains("existing-actor")); + } + + #[test] + fn test_transport_error_display() { + let err = 
TransportError::connection_failed("127.0.0.1:8000", "connection refused"); + assert!(err.to_string().contains("127.0.0.1:8000")); + assert!(err.to_string().contains("refused")); + + let err = TransportError::request_timeout(5000); + assert!(err.to_string().contains("5000")); + } + + #[test] + fn test_cluster_error_display() { + let err = ClusterError::not_initialized(); + assert!(err.to_string().contains("not initialized")); + + let err = ClusterError::named_actor_not_found("services/echo"); + assert!(err.to_string().contains("services/echo")); + } + + #[test] + fn test_config_error_display() { + let err = ConfigError::invalid_value("mailbox_capacity", "0", "must be > 0"); + assert!(err.to_string().contains("mailbox_capacity")); + + let err = ConfigError::conflicting("cannot be both head node and worker"); + assert!(err.to_string().contains("head node")); + } + + #[test] + fn test_pulsing_error_from_actor_error() { + let actor_err = ActorError::not_found("test"); + let pulsing_err: PulsingError = actor_err.into(); + + assert!(matches!(pulsing_err, PulsingError::Actor(_))); + assert!(pulsing_err.to_string().contains("test")); + } + + #[test] + fn test_pulsing_error_from_transport_error() { + let transport_err = TransportError::request_timeout(3000); + let pulsing_err: PulsingError = transport_err.into(); + + assert!(matches!(pulsing_err, PulsingError::Transport(_))); + assert!(pulsing_err.to_string().contains("3000")); + } + + #[test] + fn test_pulsing_error_helpers() { + let err = PulsingError::other("something went wrong"); + assert!(err.to_string().contains("wrong")); + + let err = PulsingError::timeout("operation timed out"); + assert!(err.to_string().contains("timed out")); + } + + #[test] + fn test_error_equality() { + let err1 = ActorError::not_found("test"); + let err2 = ActorError::not_found("test"); + let err3 = ActorError::not_found("other"); + + assert_eq!(err1, err2); + assert_ne!(err1, err3); + } +} diff --git a/crates/pulsing-actor/src/lib.rs 
b/crates/pulsing-actor/src/lib.rs index 176ddd487..6ac2e6519 100644 --- a/crates/pulsing-actor/src/lib.rs +++ b/crates/pulsing-actor/src/lib.rs @@ -2,59 +2,25 @@ //! #![cfg_attr(coverage_nightly, feature(coverage_attribute))] //! -//! A lightweight, zero-external-dependency distributed actor framework. -//! -//! ## Features -//! -//! - **Zero external dependencies**: No etcd, nats, or redis required -//! - **Gossip-based discovery**: Automatic cluster membership using SWIM protocol -//! - **Location-transparent ActorRef**: Same API for local and remote actors -//! - **Async/await native**: Built on tokio -//! -//! ## Architecture -//! -//! ```text -//! ┌─────────────────────────────────────────────────────────────────┐ -//! │ ActorSystem │ -//! │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ -//! │ │ Actor 1 │ │ Actor 2 │ │ Cluster Module │ │ -//! │ │ ┌───────┐ │ │ ┌───────┐ │ │ ┌───────────────────┐ │ │ -//! │ │ │Mailbox│ │ │ │Mailbox│ │ │ │ Gossip Protocol │ │ │ -//! │ │ └───────┘ │ │ └───────┘ │ │ │ (SWIM-like) │ │ │ -//! │ └─────────────┘ └─────────────┘ │ └───────────────────┘ │ │ -//! │ ↑ ↑ │ ↑ │ │ -//! │ └───────┬───────┘ │ │ │ │ -//! │ │ │ │ │ │ -//! │ ┌────────┮────────┐ │ ┌────────┮────────┐ │ │ -//! │ │ Actor Registry │←────────┌──│ Member Registry │ │ │ -//! │ └─────────────────┘ │ └─────────────────┘ │ │ -//! │ └─────────────────────────┘ │ -//! │ ↕ TCP Transport │ -//! └─────────────────────────────────────────────────────────────────┘ -//! ``` +//! Lightweight distributed actor framework (gossip discovery, HTTP/2 transport). //! //! ## Quick Start //! //! ```rust,ignore //! use pulsing_actor::prelude::*; //! -//! // Define messages //! #[derive(Serialize, Deserialize)] //! struct Ping { value: i32 } -//! //! #[derive(Serialize, Deserialize)] //! struct Pong { result: i32 } //! -//! // Define an actor - no boilerplate! //! struct Counter { count: i32 } //! //! #[async_trait] //! impl Actor for Counter { -//! async fn receive( -//! 
&mut self, -//! msg: Message, -//! ctx: &mut ActorContext, -//! ) -> anyhow::Result { +//! async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) +//! -> anyhow::Result +//! { //! if msg.msg_type().ends_with("Ping") { //! let ping: Ping = msg.unpack()?; //! self.count += ping.value; @@ -66,47 +32,20 @@ //! //! #[tokio::main] //! async fn main() -> anyhow::Result<()> { -//! let system = ActorSystem::new(SystemConfig::standalone()).await?; -//! -//! // Spawn with a name - system assigns the ID -//! let actor_ref = system.spawn("counter", Counter { count: 0 }).await?; -//! -//! // Send message and get response +//! let system = ActorSystem::builder().build().await?; +//! let actor_ref = system.spawn_named("services/counter", Counter { count: 0 }).await?; //! let pong: Pong = actor_ref.ask(Ping { value: 42 }).await?; //! println!("Result: {}", pong.result); -//! //! system.shutdown().await?; //! Ok(()) //! } //! ``` -//! -//! ## Cluster Mode -//! -//! ```rust,ignore -//! // Node 1 - Start first node -//! let config = SystemConfig::with_addrs( -//! "0.0.0.0:8000".parse()?, // TCP -//! "0.0.0.0:7000".parse()?, // Gossip -//! ); -//! let system1 = ActorSystem::new(config).await?; -//! -//! // Node 2 - Join existing cluster -//! let config = SystemConfig::with_addrs( -//! "0.0.0.0:8001".parse()?, -//! "0.0.0.0:7001".parse()?, -//! ).with_seeds(vec!["192.168.1.100:7000".parse()?]); -//! -//! let system2 = ActorSystem::new(config).await?; -//! -//! // Get reference to actor on another node -//! let remote_ref = system2.actor_ref(&actor_id).await?; -//! let result: Pong = remote_ref.ask(Ping { value: 10 }).await?; -//! 
``` pub mod actor; pub mod behavior; pub mod circuit_breaker; pub mod cluster; +pub mod error; pub mod metrics; pub mod policies; pub mod supervision; @@ -116,6 +55,28 @@ pub mod tracing; pub mod transport; pub mod watch; +/// Test helpers and macros for writing actor system tests +/// +/// This module provides reusable test infrastructure including: +/// - Common test messages (TestPing, TestPong, etc.) +/// - Common test actors (TestEchoActor, TestAccumulatorActor) +/// - Helper functions for test setup +/// - Macros for standardized test patterns +/// +/// # Example +/// ```rust,ignore +/// use pulsing_actor::test_helper::*; +/// use pulsing_actor::actor_test; +/// +/// actor_test!(test_echo, system, { +/// let echo = spawn_echo_actor(&system, "test/echo").await; +/// let response: TestPong = echo.ask(TestPing { value: 21 }).await.unwrap(); +/// assert_eq!(response.result, 42); +/// }); +/// ``` +#[cfg(any(test, feature = "test-helper"))] +pub mod test_helper; + /// Prelude - commonly used types /// /// Import with: `use pulsing_actor::prelude::*;` diff --git a/crates/pulsing-actor/src/system/config.rs b/crates/pulsing-actor/src/system/config.rs index 94e0152f8..de9f891c4 100644 --- a/crates/pulsing-actor/src/system/config.rs +++ b/crates/pulsing-actor/src/system/config.rs @@ -1,4 +1,10 @@ -//! Configuration types for the Actor System +//! Configuration types for the Actor System. +//! +//! Core types: +//! - [`SystemConfig`] +//! - [`ActorSystemBuilder`] +//! - [`SpawnOptions`] +//! 
- [`ResolveOptions`] use crate::actor::{NodeId, DEFAULT_MAILBOX_SIZE}; use crate::cluster::{GossipConfig, HeadNodeConfig}; @@ -9,7 +15,22 @@ use std::collections::HashMap; use std::net::SocketAddr; use std::sync::Arc; +/// Minimum mailbox capacity (prevents performance issues) +const MIN_MAILBOX_CAPACITY: usize = 16; + +/// Maximum mailbox capacity (prevents memory exhaustion) +const MAX_MAILBOX_CAPACITY: usize = 1_000_000; + /// Actor System configuration +/// +/// This struct holds all configuration options for the actor system. +/// Use the builder pattern via [`ActorSystem::builder()`](crate::system::ActorSystem::builder) +/// for a more ergonomic API. +/// +/// # Validation +/// +/// Call [`validate()`](Self::validate) to check configuration validity before use. +/// The builder automatically validates during `build()`. #[derive(Clone, Debug)] pub struct SystemConfig { /// HTTP/2 address for all communication (actors + gossip) @@ -117,34 +138,92 @@ impl SystemConfig { self.head_node_config = Some(config); self } + + /// Validate the configuration + /// + /// Returns a list of validation errors, or an empty list if valid. + /// The builder calls this automatically during `build()`. 
+ /// + /// # Validation Rules + /// - Mailbox capacity must be between 16 and 1,000,000 + /// - Cannot be both head node and have head_addr set + /// - Seed nodes should not be empty when head_addr is not set (for cluster mode) + pub fn validate(&self) -> Vec { + let mut errors = Vec::new(); + + // Validate mailbox capacity + if self.default_mailbox_capacity < MIN_MAILBOX_CAPACITY { + errors.push(ConfigValidationError::MailboxTooSmall { + value: self.default_mailbox_capacity, + min: MIN_MAILBOX_CAPACITY, + }); + } + if self.default_mailbox_capacity > MAX_MAILBOX_CAPACITY { + errors.push(ConfigValidationError::MailboxTooLarge { + value: self.default_mailbox_capacity, + max: MAX_MAILBOX_CAPACITY, + }); + } + + // Validate head node configuration + if self.is_head_node && self.head_addr.is_some() { + errors.push(ConfigValidationError::ConflictingHeadNodeConfig); + } + + errors + } + + /// Check if configuration is valid + pub fn is_valid(&self) -> bool { + self.validate().is_empty() + } } +/// Configuration validation error +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ConfigValidationError { + /// Mailbox capacity is too small + MailboxTooSmall { value: usize, min: usize }, + /// Mailbox capacity is too large + MailboxTooLarge { value: usize, max: usize }, + /// Conflicting head node configuration (both is_head_node and head_addr set) + ConflictingHeadNodeConfig, +} + +impl std::fmt::Display for ConfigValidationError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::MailboxTooSmall { value, min } => { + write!( + f, + "Mailbox capacity {} is too small (minimum: {})", + value, min + ) + } + Self::MailboxTooLarge { value, max } => { + write!( + f, + "Mailbox capacity {} is too large (maximum: {})", + value, max + ) + } + Self::ConflictingHeadNodeConfig => { + write!( + f, + "Cannot set both head_node mode and head_addr (conflicting options)" + ) + } + } + } +} + +impl std::error::Error for ConfigValidationError {} + // 
============================================================================ // ActorSystem Builder // ============================================================================ /// Builder for creating ActorSystem with fluent API -/// -/// # Example -/// -/// ```rust,ignore -/// // Standalone mode (simplest) -/// let system = ActorSystem::builder().build().await?; -/// -/// // With custom address -/// let system = ActorSystem::builder() -/// .addr("0.0.0.0:8000") -/// .build() -/// .await?; -/// -/// // Cluster mode with seeds -/// let system = ActorSystem::builder() -/// .addr("0.0.0.0:8000") -/// .seeds(["192.168.1.1:8000", "192.168.1.2:8000"]) -/// .mailbox_capacity(512) -/// .build() -/// .await?; -/// ``` #[derive(Default)] pub struct ActorSystemBuilder { /// Bind address (stored as Result for deferred error handling) @@ -238,53 +317,200 @@ impl ActorSystemBuilder { /// Build the ActorSystem /// - /// Returns an error if any address parsing failed. + /// Returns an error if any address parsing or validation failed. 
pub async fn build(self) -> anyhow::Result> { - // Parse bind address (use default if not specified) - let addr = match self.addr { - Some(Ok(addr)) => addr, - Some(Err(invalid)) => { - return Err(anyhow::anyhow!("Invalid bind address: {}", invalid)); - } - None => DEFAULT_BIND_ADDR, + let addr = + Self::parse_optional_addr("bind address", self.addr)?.unwrap_or(DEFAULT_BIND_ADDR); + + let seed_nodes = Self::parse_addr_list("seed address", self.seeds)?; + + let head_addr = Self::parse_optional_addr("head node address", self.head_addr)?; + + let config = SystemConfig { + addr, + seed_nodes, + gossip_config: self.gossip_config.unwrap_or_default(), + http2_config: self.http2_config.unwrap_or_default(), + default_mailbox_capacity: self.mailbox_capacity.unwrap_or(DEFAULT_MAILBOX_SIZE), + is_head_node: self.is_head_node, + head_addr, + head_node_config: self.head_node_config, }; - // Parse seed nodes - let mut seed_nodes = Vec::with_capacity(self.seeds.len()); - for (i, seed) in self.seeds.into_iter().enumerate() { + Self::validate_config(&config)?; + + crate::system::ActorSystem::new(config).await + } + + fn parse_optional_addr( + label: &str, + input: Option>, + ) -> anyhow::Result> { + match input { + Some(Ok(addr)) => Ok(Some(addr)), + Some(Err(invalid)) => Err(anyhow::anyhow!("Invalid {}: {}", label, invalid)), + None => Ok(None), + } + } + + fn parse_addr_list( + label: &str, + seeds: Vec>, + ) -> anyhow::Result> { + let mut addrs = Vec::with_capacity(seeds.len()); + for (i, seed) in seeds.into_iter().enumerate() { match seed { - Ok(addr) => seed_nodes.push(addr), + Ok(addr) => addrs.push(addr), Err(invalid) => { return Err(anyhow::anyhow!( - "Invalid seed address at index {}: {}", + "Invalid {} at index {}: {}", + label, i, invalid )); } } } + Ok(addrs) + } - // Parse head node address if specified - let head_addr = match self.head_addr { - Some(Ok(addr)) => Some(addr), - Some(Err(invalid)) => { - return Err(anyhow::anyhow!("Invalid head node address: {}", 
invalid)); - } - None => None, + fn validate_config(config: &SystemConfig) -> anyhow::Result<()> { + let errors = config.validate(); + if errors.is_empty() { + return Ok(()); + } + let error_msgs: Vec = errors.iter().map(|e| e.to_string()).collect(); + Err(anyhow::anyhow!( + "Configuration validation failed:\n - {}", + error_msgs.join("\n - ") + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_system_config_default() { + let config = SystemConfig::default(); + assert!(config.is_valid()); + assert_eq!(config.default_mailbox_capacity, DEFAULT_MAILBOX_SIZE); + assert!(!config.is_head_node); + assert!(config.head_addr.is_none()); + } + + #[test] + fn test_system_config_standalone() { + let config = SystemConfig::standalone(); + assert!(config.is_valid()); + } + + #[test] + fn test_system_config_with_addr() { + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + let config = SystemConfig::with_addr(addr); + assert_eq!(config.addr, addr); + assert!(config.is_valid()); + } + + #[test] + fn test_system_config_validation_mailbox_too_small() { + let config = SystemConfig { + default_mailbox_capacity: 1, + ..Default::default() }; + let errors = config.validate(); + assert_eq!(errors.len(), 1); + assert!(matches!( + errors[0], + ConfigValidationError::MailboxTooSmall { .. } + )); + } + #[test] + fn test_system_config_validation_mailbox_too_large() { let config = SystemConfig { - addr, - seed_nodes, - gossip_config: self.gossip_config.unwrap_or_default(), - http2_config: self.http2_config.unwrap_or_default(), - default_mailbox_capacity: self.mailbox_capacity.unwrap_or(DEFAULT_MAILBOX_SIZE), - is_head_node: self.is_head_node, - head_addr, - head_node_config: self.head_node_config, + default_mailbox_capacity: 10_000_000, + ..Default::default() }; + let errors = config.validate(); + assert_eq!(errors.len(), 1); + assert!(matches!( + errors[0], + ConfigValidationError::MailboxTooLarge { .. 
} + )); + } - crate::system::ActorSystem::new(config).await + #[test] + fn test_system_config_validation_conflicting_head_node() { + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + let config = SystemConfig { + is_head_node: true, + head_addr: Some(addr), + ..Default::default() + }; + let errors = config.validate(); + assert_eq!(errors.len(), 1); + assert!(matches!( + errors[0], + ConfigValidationError::ConflictingHeadNodeConfig + )); + } + + #[test] + fn test_system_config_builder_methods() { + let config = SystemConfig::standalone() + .with_mailbox_capacity(256) + .with_head_node(); + + assert_eq!(config.default_mailbox_capacity, 256); + assert!(config.is_head_node); + assert!(config.is_valid()); + } + + #[test] + fn test_spawn_options_default() { + let options = SpawnOptions::new(); + assert!(options.mailbox_capacity.is_none()); + assert!(options.metadata.is_empty()); + } + + #[test] + fn test_spawn_options_builder() { + let options = SpawnOptions::new() + .mailbox_capacity(512) + .metadata([("key".to_string(), "value".to_string())].into()); + + assert_eq!(options.mailbox_capacity, Some(512)); + assert_eq!(options.metadata.get("key"), Some(&"value".to_string())); + } + + #[test] + fn test_resolve_options_default() { + let options = ResolveOptions::new(); + assert!(options.node_id.is_none()); + assert!(options.policy.is_none()); + assert!(options.filter_alive); + } + + #[test] + fn test_resolve_options_builder() { + let node_id = NodeId::new(123); + let options = ResolveOptions::new().node_id(node_id).filter_alive(false); + + assert_eq!(options.node_id, Some(node_id)); + assert!(!options.filter_alive); + } + + #[test] + fn test_config_validation_error_display() { + let err = ConfigValidationError::MailboxTooSmall { value: 5, min: 16 }; + assert!(err.to_string().contains("5")); + assert!(err.to_string().contains("16")); + + let err = ConfigValidationError::ConflictingHeadNodeConfig; + assert!(err.to_string().contains("head_node")); } } diff --git 
a/crates/pulsing-actor/src/system/lifecycle.rs b/crates/pulsing-actor/src/system/lifecycle.rs new file mode 100644 index 000000000..3d1b5fd36 --- /dev/null +++ b/crates/pulsing-actor/src/system/lifecycle.rs @@ -0,0 +1,222 @@ +//! Actor lifecycle management +//! +//! This module contains the implementation of actor stop and shutdown methods +//! for graceful lifecycle management. + +use crate::actor::{ActorPath, StopReason}; +use crate::system::ActorSystem; +use std::time::Duration; +use tokio_util::sync::CancellationToken; + +impl ActorSystem { + /// Default timeout for graceful actor shutdown (30 seconds) + pub(crate) const GRACEFUL_STOP_TIMEOUT: Duration = Duration::from_secs(30); + + /// Stop an actor gracefully + /// + /// This method first signals the actor to stop via its cancellation token, + /// waits for it to finish (with timeout), then performs cleanup. + /// If the actor doesn't stop within the timeout, it will be forcefully aborted. + pub async fn stop(&self, name: impl AsRef) -> anyhow::Result<()> { + self.stop_with_reason(name, StopReason::Killed).await + } + + /// Stop an actor with a specific reason + /// + /// Note: If the name doesn't contain a "/" and no actor is found with the exact name, + /// it will try with the "actors/" prefix (for Python compatibility). 
+ pub async fn stop_with_reason( + &self, + name: impl AsRef, + reason: StopReason, + ) -> anyhow::Result<()> { + let name = name.as_ref(); + + let actual_name = if self.actor_names.contains_key(name) { + name.to_string() + } else if !name.contains('/') { + let prefixed = format!("actors/{}", name); + if self.actor_names.contains_key(&prefixed) { + prefixed + } else { + name.to_string() + } + } else { + name.to_string() + }; + + if let Some((_, local_id)) = self.actor_names.remove(&actual_name) { + if let Some((_, handle)) = self.local_actors.remove(&local_id) { + let named_path = handle.named_path.clone(); + self.stop_local_actor( + &actual_name, + handle, + named_path, + reason, + Self::GRACEFUL_STOP_TIMEOUT, + ) + .await; + } + } + + Ok(()) + } + + /// Stop a named actor by path + pub async fn stop_named(&self, path: &crate::actor::ActorPath) -> anyhow::Result<()> { + self.stop_named_with_reason(path, StopReason::Killed).await + } + + /// Stop a named actor by path with a specific reason + pub async fn stop_named_with_reason( + &self, + path: &crate::actor::ActorPath, + reason: StopReason, + ) -> anyhow::Result<()> { + let path_key = path.as_str(); + + if let Some(actor_name_ref) = self.named_actor_paths.get(&path_key) { + let actor_name = actor_name_ref.clone(); + drop(actor_name_ref); + + if let Some((_, local_id)) = self.actor_names.remove(&actor_name) { + if let Some((_, handle)) = self.local_actors.remove(&local_id) { + self.stop_local_actor( + &actor_name, + handle, + Some(path.clone()), + reason, + Self::GRACEFUL_STOP_TIMEOUT, + ) + .await; + } + } + } + + Ok(()) + } + + /// Shutdown the entire actor system + /// + pub async fn shutdown(&self) -> anyhow::Result<()> { + tracing::info!("Shutting down actor system"); + + self.cancel_token.cancel(); + + tokio::time::sleep(Duration::from_millis(100)).await; + + let actor_entries: Vec<_> = self + .local_actors + .iter() + .map(|entry| { + let local_id = *entry.key(); + let actor_id = entry.actor_id; + let 
named_path = entry.named_path.clone(); + let name = self + .actor_names + .iter() + .find(|e| *e.value() == local_id) + .map(|e| e.key().clone()) + .unwrap_or_else(|| actor_id.to_string()); + (local_id, actor_id, name, named_path) + }) + .collect(); + + for (local_id, _actor_id, actor_name, named_path) in actor_entries { + self.actor_names.remove(&actor_name); + + if let Some((_, handle)) = self.local_actors.remove(&local_id) { + self.stop_local_actor( + &actor_name, + handle, + named_path, + StopReason::SystemShutdown, + Duration::from_secs(5), + ) + .await; + } + } + + self.local_actors.clear(); + self.actor_names.clear(); + + self.node_load.clear(); + + self.lifecycle.clear().await; + + { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + cluster.leave().await?; + } + } + + tracing::info!("Actor system shutdown complete"); + Ok(()) + } + + /// Get cancellation token + pub fn cancel_token(&self) -> CancellationToken { + self.cancel_token.clone() + } + + async fn stop_local_actor( + &self, + actor_name: &str, + handle: super::handle::LocalActorHandle, + named_path: Option, + reason: StopReason, + timeout: Duration, + ) { + // 1. Signal the actor to stop gracefully + handle.cancel_token.cancel(); + + // 2. Wait for the actor to finish with timeout + match tokio::time::timeout(timeout, handle.join_handle).await { + Ok(_) => { + if let Some(path) = named_path.as_ref() { + tracing::debug!( + actor = %actor_name, + path = %path, + "Actor stopped gracefully" + ); + } else { + tracing::debug!(actor = %actor_name, "Actor stopped gracefully"); + } + } + Err(_) => { + if let Some(path) = named_path.as_ref() { + tracing::warn!( + actor = %actor_name, + path = %path, + "Actor didn't stop gracefully within timeout" + ); + } else { + tracing::warn!( + actor = %actor_name, + "Actor didn't stop gracefully within timeout" + ); + } + } + } + + // 3. 
Handle lifecycle cleanup + let actor_names = self.actor_names.clone(); + let local_actors = self.local_actors.clone(); + self.lifecycle + .handle_termination( + &handle.actor_id, + actor_name, + named_path, + reason, + &self.named_actor_paths, + &self.cluster, + |name| { + actor_names + .get(name) + .and_then(|id| local_actors.get(id.value()).map(|h| h.sender.clone())) + }, + ) + .await; + } +} diff --git a/crates/pulsing-actor/src/system/load_balancer.rs b/crates/pulsing-actor/src/system/load_balancer.rs new file mode 100644 index 000000000..b7ecce819 --- /dev/null +++ b/crates/pulsing-actor/src/system/load_balancer.rs @@ -0,0 +1,239 @@ +//! Load balancing utilities for the actor system +//! +//! This module provides per-node load tracking and worker adapters for +//! integrating with the load balancing policies. + +use crate::cluster::MemberInfo; +use crate::cluster::MemberStatus; +use crate::policies::Worker; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +/// Per-node load tracking with activity timestamp for cleanup +/// +/// Tracks the current load (in-flight requests) and total processed requests +/// for a remote node. Includes timestamp tracking for stale entry cleanup. 
+#[derive(Debug)] +pub struct NodeLoadTracker { + /// Current in-flight requests to this node + load: AtomicUsize, + /// Total requests processed + processed: AtomicU64, + /// Last activity timestamp (Unix millis) for stale entry cleanup + last_activity_millis: AtomicU64, +} + +impl Default for NodeLoadTracker { + fn default() -> Self { + Self { + load: AtomicUsize::new(0), + processed: AtomicU64::new(0), + last_activity_millis: AtomicU64::new(Self::current_millis()), + } + } +} + +impl NodeLoadTracker { + /// Create a new load tracker + pub fn new() -> Self { + Self::default() + } + + /// Get current timestamp in milliseconds + fn current_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64 + } + + /// Update the last activity timestamp + fn touch(&self) { + self.last_activity_millis + .store(Self::current_millis(), Ordering::Relaxed); + } + + /// Get current load (in-flight requests) + pub fn load(&self) -> usize { + self.load.load(Ordering::Relaxed) + } + + /// Increment the load counter + pub fn increment(&self) { + self.load.fetch_add(1, Ordering::Relaxed); + self.touch(); + } + + /// Decrement the load counter + pub fn decrement(&self) { + self.load.fetch_sub(1, Ordering::Relaxed); + self.touch(); + } + + /// Get total processed requests + pub fn processed(&self) -> u64 { + self.processed.load(Ordering::Relaxed) + } + + /// Increment the processed counter + pub fn increment_processed(&self) { + self.processed.fetch_add(1, Ordering::Relaxed); + self.touch(); + } + + /// Returns elapsed time since last activity + pub fn last_activity_elapsed(&self) -> Duration { + let last = self.last_activity_millis.load(Ordering::Relaxed); + let now = Self::current_millis(); + Duration::from_millis(now.saturating_sub(last)) + } + + /// Returns true if this tracker has been inactive for longer than the threshold + pub fn is_stale(&self, threshold: Duration) -> bool { + 
self.last_activity_elapsed() > threshold + } +} + +/// Wrapper to adapt MemberInfo to the Worker trait for load balancing +/// +/// This allows cluster member information to be used with the load balancing +/// policies defined in the `policies` module. +#[derive(Debug)] +pub struct MemberWorker { + url: String, + is_alive: bool, + /// Shared load tracker for this node + load_tracker: Arc, +} + +impl MemberWorker { + /// Create a new MemberWorker from a MemberInfo + pub fn new(member: &MemberInfo, load_tracker: Arc) -> Self { + Self { + url: member.addr.to_string(), + is_alive: member.status == MemberStatus::Alive, + load_tracker, + } + } +} + +impl Worker for MemberWorker { + fn url(&self) -> &str { + &self.url + } + + fn is_healthy(&self) -> bool { + self.is_alive + } + + fn set_healthy(&mut self, healthy: bool) { + self.is_alive = healthy; + } + + fn load(&self) -> usize { + self.load_tracker.load() + } + + fn increment_load(&self) { + self.load_tracker.increment(); + } + + fn decrement_load(&self) { + self.load_tracker.decrement(); + } + + fn increment_processed(&self) { + self.load_tracker.increment_processed(); + } + + fn processed(&self) -> u64 { + self.load_tracker.processed() + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_node_load_tracker_default() { + let tracker = NodeLoadTracker::new(); + assert_eq!(tracker.load(), 0); + assert_eq!(tracker.processed(), 0); + } + + #[test] + fn test_node_load_tracker_increment_decrement() { + let tracker = NodeLoadTracker::new(); + + tracker.increment(); + tracker.increment(); + assert_eq!(tracker.load(), 2); + + tracker.decrement(); + assert_eq!(tracker.load(), 1); + } + + #[test] + fn test_node_load_tracker_processed() { + let tracker = NodeLoadTracker::new(); + + tracker.increment_processed(); + tracker.increment_processed(); + tracker.increment_processed(); + assert_eq!(tracker.processed(), 3); + } + + #[test] + fn 
test_node_load_tracker_staleness() { + let tracker = NodeLoadTracker::new(); + + // Just created, should not be stale + assert!(!tracker.is_stale(Duration::from_secs(1))); + + // Elapsed time should be very small + let elapsed = tracker.last_activity_elapsed(); + assert!(elapsed < Duration::from_secs(1)); + } + + #[test] + fn test_member_worker() { + let addr: std::net::SocketAddr = "127.0.0.1:8000".parse().unwrap(); + let member = MemberInfo { + node_id: crate::actor::NodeId::new(1), + addr, + gossip_addr: addr, + status: MemberStatus::Alive, + incarnation: 0, + last_update: None, + }; + let tracker = Arc::new(NodeLoadTracker::new()); + let worker = MemberWorker::new(&member, tracker); + + assert!(worker.is_healthy()); + assert_eq!(worker.url(), "127.0.0.1:8000"); + assert_eq!(worker.load(), 0); + } + + #[test] + fn test_member_worker_dead_node() { + let addr: std::net::SocketAddr = "127.0.0.1:8000".parse().unwrap(); + let member = MemberInfo { + node_id: crate::actor::NodeId::new(1), + addr, + gossip_addr: addr, + status: MemberStatus::Dead, + incarnation: 0, + last_update: None, + }; + let tracker = Arc::new(NodeLoadTracker::new()); + let worker = MemberWorker::new(&member, tracker); + + assert!(!worker.is_healthy()); + } +} diff --git a/crates/pulsing-actor/src/system/mod.rs b/crates/pulsing-actor/src/system/mod.rs index 6593e000a..6d761bb37 100644 --- a/crates/pulsing-actor/src/system/mod.rs +++ b/crates/pulsing-actor/src/system/mod.rs @@ -9,207 +9,76 @@ mod config; mod handle; mod handler; +mod lifecycle; +mod load_balancer; +mod resolve; mod runtime; +mod spawn; mod traits; -pub use config::{ActorSystemBuilder, ResolveOptions, SpawnOptions, SystemConfig}; +pub use config::{ + ActorSystemBuilder, ConfigValidationError, ResolveOptions, SpawnOptions, SystemConfig, +}; pub use handle::ActorStats; +pub use load_balancer::NodeLoadTracker; pub use traits::{ActorSystemAdvancedExt, ActorSystemCoreExt, ActorSystemOpsExt}; -use crate::actor::{ - Actor, ActorAddress, 
ActorContext, ActorId, ActorPath, ActorRef, ActorResolver, ActorSystemRef, - Envelope, IntoActor, IntoActorPath, Mailbox, NodeId, StopReason, -}; -use crate::cluster::{ - GossipBackend, HeadNodeBackend, MemberInfo, MemberStatus, NamedActorInfo, NamingBackend, -}; -use crate::policies::{LoadBalancingPolicy, RoundRobinPolicy, Worker}; +use crate::actor::{ActorId, ActorPath, ActorRef, ActorResolver, ActorSystemRef, Envelope, NodeId}; +use crate::cluster::{GossipBackend, HeadNodeBackend, NamingBackend}; +use crate::policies::{LoadBalancingPolicy, RoundRobinPolicy}; use crate::system_actor::{BoxedActorFactory, SystemActor, SystemRef, SYSTEM_ACTOR_PATH}; -use crate::transport::{Http2RemoteTransport, Http2Transport}; +use crate::transport::Http2Transport; use crate::watch::ActorLifecycle; use dashmap::DashMap; use handle::LocalActorHandle; use handler::SystemMessageHandler; -use runtime::{run_actor_instance, run_supervision_loop}; use std::net::SocketAddr; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::atomic::AtomicU64; use std::sync::Arc; -use std::time::Duration; use tokio::sync::mpsc; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; -/// Per-node load tracking with activity timestamp for cleanup -#[derive(Debug)] -pub struct NodeLoadTracker { - /// Current in-flight requests to this node - load: AtomicUsize, - /// Total requests processed - processed: AtomicU64, - /// Last activity timestamp (Unix millis) for stale entry cleanup - last_activity_millis: AtomicU64, -} - -impl Default for NodeLoadTracker { - fn default() -> Self { - Self { - load: AtomicUsize::new(0), - processed: AtomicU64::new(0), - last_activity_millis: AtomicU64::new(Self::current_millis()), - } - } -} - -impl NodeLoadTracker { - pub fn new() -> Self { - Self::default() - } - - fn current_millis() -> u64 { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_millis() as u64 - } - - fn touch(&self) { - 
self.last_activity_millis - .store(Self::current_millis(), Ordering::Relaxed); - } - - pub fn load(&self) -> usize { - self.load.load(Ordering::Relaxed) - } - - pub fn increment(&self) { - self.load.fetch_add(1, Ordering::Relaxed); - self.touch(); - } - - pub fn decrement(&self) { - self.load.fetch_sub(1, Ordering::Relaxed); - self.touch(); - } - - pub fn processed(&self) -> u64 { - self.processed.load(Ordering::Relaxed) - } - - pub fn increment_processed(&self) { - self.processed.fetch_add(1, Ordering::Relaxed); - self.touch(); - } - - /// Returns elapsed time since last activity - pub fn last_activity_elapsed(&self) -> Duration { - let last = self.last_activity_millis.load(Ordering::Relaxed); - let now = Self::current_millis(); - Duration::from_millis(now.saturating_sub(last)) - } - - /// Returns true if this tracker has been inactive for longer than the threshold - pub fn is_stale(&self, threshold: Duration) -> bool { - self.last_activity_elapsed() > threshold - } -} - -/// Wrapper to adapt MemberInfo to the Worker trait for load balancing -#[derive(Debug)] -struct MemberWorker { - url: String, - is_alive: bool, - /// Shared load tracker for this node - load_tracker: Arc, -} - -impl MemberWorker { - fn new(member: &MemberInfo, load_tracker: Arc) -> Self { - Self { - url: member.addr.to_string(), - is_alive: member.status == MemberStatus::Alive, - load_tracker, - } - } -} - -impl Worker for MemberWorker { - fn url(&self) -> &str { - &self.url - } - - fn is_healthy(&self) -> bool { - self.is_alive - } - - fn set_healthy(&mut self, healthy: bool) { - self.is_alive = healthy; - } - - fn load(&self) -> usize { - self.load_tracker.load() - } - - fn increment_load(&self) { - self.load_tracker.increment(); - } - - fn decrement_load(&self) { - self.load_tracker.decrement(); - } - - fn increment_processed(&self) { - self.load_tracker.increment_processed(); - } - - fn processed(&self) -> u64 { - self.load_tracker.processed() - } - - fn as_any(&self) -> &dyn std::any::Any { 
- self - } -} - /// The Actor System - manages actors and cluster membership pub struct ActorSystem { /// Local node ID - node_id: NodeId, + pub(crate) node_id: NodeId, /// HTTP/2 address - addr: SocketAddr, + pub(crate) addr: SocketAddr, /// Default mailbox capacity for actors - default_mailbox_capacity: usize, + pub(crate) default_mailbox_capacity: usize, /// Local actors indexed by local_id (O(1) lookup by ActorId) - local_actors: Arc>, + pub(crate) local_actors: Arc>, /// Actor name to local_id mapping (for name-based lookups) - actor_names: Arc>, + pub(crate) actor_names: Arc>, /// Named actor path to local actor name mapping (path_string -> actor_name) - named_actor_paths: Arc>, + pub(crate) named_actor_paths: Arc>, /// Naming backend (for discovery) - cluster: Arc>>>, + pub(crate) cluster: Arc>>>, /// HTTP/2 transport - transport: Arc, + pub(crate) transport: Arc, /// Cancellation token - cancel_token: CancellationToken, + pub(crate) cancel_token: CancellationToken, /// Actor lifecycle manager (watch, termination handling) - lifecycle: Arc, + pub(crate) lifecycle: Arc, /// Actor ID counter (for generating unique local IDs) - actor_id_counter: AtomicU64, + pub(crate) actor_id_counter: AtomicU64, /// Default load balancing policy - default_lb_policy: Arc, + pub(crate) default_lb_policy: Arc, /// Per-node load tracking for remote nodes - node_load: Arc>>, + pub(crate) node_load: Arc>>, } impl ActorSystem { @@ -300,53 +169,42 @@ impl ActorSystem { node_id, addr: actual_addr, default_mailbox_capacity: config.default_mailbox_capacity, - local_actors: local_actors.clone(), - actor_names: actor_names.clone(), - named_actor_paths: named_actor_paths.clone(), + local_actors, + actor_names, + named_actor_paths, cluster: cluster_holder, transport, - cancel_token: cancel_token.clone(), + cancel_token, lifecycle, - actor_id_counter: AtomicU64::new(1), + actor_id_counter: AtomicU64::new(1), // Start from 1 (0 reserved for system) default_lb_policy: 
Arc::new(RoundRobinPolicy::new()), node_load: Arc::new(DashMap::new()), }); - // Start the builtin SystemActor with path "system" - system - .start_system_actor(actor_names, named_actor_paths) - .await?; - - tracing::info!( - node_id = %node_id, - addr = %actual_addr, - "Actor system started" - ); + // Start SystemActor + system.start_system_actor().await?; Ok(system) } - /// Start the builtin SystemActor - async fn start_system_actor( - self: &Arc, - actor_names: Arc>, - named_actor_paths: Arc>, - ) -> anyhow::Result<()> { - // Create SystemRef for SystemActor - // Note: SystemRef uses a simplified DashMap for sending messages - let local_actors_ref = self.local_actors.clone(); - - // Build a name -> sender mapping for SystemRef + /// Start SystemActor (internal, called during system creation) + async fn start_system_actor(self: &Arc) -> anyhow::Result<()> { + // Create senders snapshot for SystemRef let local_actor_senders: Arc>> = Arc::new(DashMap::new()); - for entry in actor_names.iter() { - let name = entry.key().clone(); - let local_id = *entry.value(); - if let Some(handle) = local_actors_ref.get(&local_id) { - local_actor_senders.insert(name, handle.sender.clone()); + for entry in self.local_actors.iter() { + // Find name for this actor (reverse lookup from actor_names) + if let Some(name_entry) = self.actor_names.iter().find(|e| *e.value() == *entry.key()) { + local_actor_senders.insert(name_entry.key().clone(), entry.sender.clone()); } } + // Create named_actor_paths snapshot + let named_actor_paths: Arc> = Arc::new(DashMap::new()); + for entry in self.named_actor_paths.iter() { + named_actor_paths.insert(entry.key().clone(), entry.value().clone()); + } + let system_ref = Arc::new(SystemRef { node_id: self.node_id, addr: self.addr, @@ -433,846 +291,6 @@ impl ActorSystem { .map(|handle| ActorRef::local(handle.actor_id, handle.sender.clone())) }) } - - /// Generate a new unique local actor ID - fn next_actor_id(&self) -> ActorId { - let local_id = 
self.actor_id_counter.fetch_add(1, Ordering::Relaxed); - ActorId::new(self.node_id, local_id) - } - - // ========== Spawn Methods ========== - - /// Create a once-use factory from an actor instance - fn once_factory(actor: A) -> impl FnMut() -> anyhow::Result { - let mut actor_opt = Some(actor); - move || { - actor_opt - .take() - .ok_or_else(|| anyhow::anyhow!("Actor cannot be restarted (spawned as instance)")) - } - } - - /// Spawn an anonymous actor (no name, only accessible via ActorRef) - /// - /// Note: Anonymous actors do not support supervision/restart because they have - /// no stable identity for re-resolution. Use `spawn_named_factory` for actors - /// that need supervision. - pub async fn spawn_anonymous(self: &Arc, actor: A) -> anyhow::Result - where - A: IntoActor, - { - self.spawn_anonymous_with_options(actor.into_actor(), SpawnOptions::default()) - .await - } - - /// Spawn an anonymous actor with custom options - pub async fn spawn_anonymous_with_options( - self: &Arc, - actor: A, - options: SpawnOptions, - ) -> anyhow::Result - where - A: IntoActor, - { - let actor = actor.into_actor(); - let actor_id = self.next_actor_id(); - - // Use configured mailbox capacity - let capacity = options - .mailbox_capacity - .unwrap_or(self.default_mailbox_capacity); - let mailbox = Mailbox::with_capacity(capacity); - let (sender, receiver) = mailbox.split(); - - let stats = Arc::new(ActorStats::default()); - - // Create a child cancellation token for this specific actor - // When system shuts down, parent token cancels all children - // When stopping individual actor, only this child token is cancelled - let actor_cancel = self.cancel_token.child_token(); - - // Create context with system reference - let ctx = ActorContext::with_system( - actor_id, - self.clone() as Arc, - actor_cancel.clone(), - sender.clone(), - ); - - // Spawn actor loop (no supervision for anonymous actors, they can't restart without a factory) - let stats_clone = stats.clone(); - let cancel 
= actor_cancel.clone(); - let actor_id_for_log = actor_id; - - let join_handle = tokio::spawn(async move { - let mut receiver = receiver; - let mut ctx = ctx; - let reason = - run_actor_instance(actor, &mut receiver, &mut ctx, cancel, stats_clone).await; - tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Anonymous actor stopped"); - }); - - // Register using local_id as key (O(1) lookup by ActorId) - let local_id = actor_id.local_id(); - let handle = LocalActorHandle { - sender: sender.clone(), - join_handle, - cancel_token: actor_cancel, - stats: stats.clone(), - metadata: options.metadata.clone(), - named_path: None, - actor_id, - }; - - // Use local_id as primary key - self.local_actors.insert(local_id, handle); - // Anonymous actors use their local_id as "name" for internal tracking - self.actor_names.insert(actor_id.to_string(), local_id); - - Ok(ActorRef::local(actor_id, sender)) - } - - /// Spawn a named actor (resolvable by name across the cluster) - /// - /// # Example - /// ```rust,ignore - /// // Name is used as both path (for resolution) and local name - /// system.spawn_named("services/echo", MyActor).await?; - /// ``` - pub async fn spawn_named(self: &Arc, name: P, actor: A) -> anyhow::Result - where - P: IntoActorPath, - A: IntoActor, - { - let path = name.into_actor_path()?; - self.spawn_named_factory( - path, - Self::once_factory(actor.into_actor()), - SpawnOptions::default(), - ) - .await - } - - /// Spawn a named actor with custom options - pub async fn spawn_named_with_options( - self: &Arc, - name: P, - actor: A, - options: SpawnOptions, - ) -> anyhow::Result - where - P: IntoActorPath, - A: IntoActor, - { - let path = name.into_actor_path()?; - self.spawn_named_factory(path, Self::once_factory(actor.into_actor()), options) - .await - } - - /// Spawn a named actor using a factory function - pub async fn spawn_named_factory( - self: &Arc, - name: P, - factory: F, - options: SpawnOptions, - ) -> anyhow::Result - where - P: 
IntoActorPath, - F: FnMut() -> anyhow::Result + Send + 'static, - A: Actor, - { - let path = name.into_actor_path()?; - let name_str = path.as_str(); - - // Check for duplicate name - if self.actor_names.contains_key(&name_str.to_string()) { - return Err(anyhow::anyhow!("Actor already exists: {}", name_str)); - } - - // Check for duplicate named path - if self.named_actor_paths.contains_key(&name_str.to_string()) { - return Err(anyhow::anyhow!( - "Named path already registered: {}", - name_str - )); - } - - let actor_id = self.next_actor_id(); - let local_id = actor_id.local_id(); - - // Use configured mailbox capacity - let capacity = options - .mailbox_capacity - .unwrap_or(self.default_mailbox_capacity); - let mailbox = Mailbox::with_capacity(capacity); - let (sender, receiver) = mailbox.split(); - - let stats = Arc::new(ActorStats::default()); - let metadata = options.metadata.clone(); - - // Create a child cancellation token for this specific actor - let actor_cancel = self.cancel_token.child_token(); - - // Create context with system reference and named path - let ctx = ActorContext::with_system_and_name( - actor_id, - self.clone() as Arc, - actor_cancel.clone(), - sender.clone(), - Some(name_str.to_string()), - ); - - // Spawn actor loop - let stats_clone = stats.clone(); - let cancel = actor_cancel.clone(); - let actor_id_for_log = actor_id; - let supervision = options.supervision.clone(); - - let join_handle = tokio::spawn(async move { - let reason = - run_supervision_loop(factory, receiver, ctx, cancel, stats_clone, supervision) - .await; - tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Actor stopped"); - }); - - // Register actor using local_id as primary key - let handle = LocalActorHandle { - sender: sender.clone(), - join_handle, - cancel_token: actor_cancel, - stats: stats.clone(), - metadata: metadata.clone(), - named_path: Some(path.clone()), - actor_id, - }; - - self.local_actors.insert(local_id, handle); - 
self.actor_names.insert(name_str.to_string(), local_id); - self.named_actor_paths - .insert(name_str.to_string(), name_str.to_string()); - - // Register in cluster with full details - if let Some(cluster) = self.cluster.read().await.as_ref() { - if metadata.is_empty() { - cluster.register_named_actor(path.clone()).await; - } else { - cluster - .register_named_actor_full(path.clone(), actor_id, metadata) - .await; - } - } - - // Create ActorRef - Ok(ActorRef::local(actor_id, sender)) - } - - // ========== Resolve Methods ========== - - /// Get ActorRef for a local or remote actor by ID - /// - /// This is an O(1) operation for local actors using local_id indexing. - pub async fn actor_ref(&self, id: &ActorId) -> anyhow::Result { - // Check if local - if id.node() == self.node_id || id.node().is_local() { - // O(1) lookup by local_id - let handle = self - .local_actors - .get(&id.local_id()) - .ok_or_else(|| anyhow::anyhow!("Local actor not found: {}", id))?; - return Ok(ActorRef::local(handle.actor_id, handle.sender.clone())); - } - - // Remote actor - get address from cluster - let cluster_guard = self.cluster.read().await; - let cluster = cluster_guard - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Cluster not initialized"))?; - - let member = cluster - .get_member(&id.node()) - .await - .ok_or_else(|| anyhow::anyhow!("Node not found in cluster: {}", id.node()))?; - - // Create remote transport using actor id - let transport = Http2RemoteTransport::new_by_id(self.transport.client(), member.addr, *id); - - Ok(ActorRef::remote(*id, member.addr, Arc::new(transport))) - } - - /// Resolve a named actor by path (direct resolution) - /// - /// Returns an ActorRef that points to the current location of the named actor. - /// Note: If the actor migrates, this reference may become stale. - /// For actors that may migrate, consider using `resolve_named_lazy`. 
- /// - /// # Example - /// ```rust,ignore - /// let actor = system.resolve_named("services/echo", None).await?; - /// ``` - pub async fn resolve_named

( - &self, - path: P, - node_id: Option<&NodeId>, - ) -> anyhow::Result - where - P: IntoActorPath, - { - let path = path.into_actor_path()?; - let options = if let Some(nid) = node_id { - ResolveOptions::new().node_id(*nid) - } else { - ResolveOptions::new() - }; - self.resolve_named_with_options(&path, options).await - } - - /// Resolve a named actor with lazy resolution (re-resolves after cache expires) - /// - /// Returns an ActorRef that automatically re-resolves after 5 seconds. - /// This is useful for named actors that may migrate between nodes. - /// - /// # Example - /// ```rust,ignore - /// let actor = system.resolve_named_lazy("services/echo").await?; - /// // Even if the actor migrates, this ref will find it after cache expires - /// ``` - pub fn resolve_named_lazy

(self: &Arc, path: P) -> anyhow::Result - where - P: IntoActorPath, - { - let path = path.into_actor_path()?; - Ok(ActorRef::lazy(path, self.clone() as Arc)) - } - - /// Internal: Direct resolution for ActorResolver trait - async fn resolve_named_direct( - &self, - path: &ActorPath, - node_id: Option<&NodeId>, - ) -> anyhow::Result { - let options = if let Some(nid) = node_id { - ResolveOptions::new().node_id(*nid) - } else { - ResolveOptions::new() - }; - self.resolve_named_with_options(path, options).await - } - - /// Resolve a named actor with custom options (load balancing, health filtering) - pub async fn resolve_named_with_options( - &self, - path: &ActorPath, - options: ResolveOptions, - ) -> anyhow::Result { - let cluster_guard = self.cluster.read().await; - let cluster = cluster_guard - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Cluster not initialized"))?; - - let instances = cluster.get_named_actor_instances(path).await; - - if instances.is_empty() { - return Err(anyhow::anyhow!("Named actor not found: {}", path.as_str())); - } - - // Health filtering: only select Alive nodes - let healthy_instances: Vec<_> = if options.filter_alive { - instances - .into_iter() - .filter(|i| i.status == MemberStatus::Alive) - .collect() - } else { - instances - }; - - if healthy_instances.is_empty() { - return Err(anyhow::anyhow!( - "No healthy instances for named actor: {}", - path.as_str() - )); - } - - // Select target instance - let target = if let Some(nid) = options.node_id { - // If node_id specified, find that specific instance - healthy_instances - .iter() - .find(|i| i.node_id == nid) - .ok_or_else(|| anyhow::anyhow!("Actor instance not found on node: {}", nid))? 
- } else { - // Use load balancing policy - let policy = options.policy.as_ref().unwrap_or(&self.default_lb_policy); - self.select_instance(&healthy_instances, policy.as_ref()) - }; - - // If local, get local ref - if target.node_id == self.node_id { - let actor_name = self - .named_actor_paths - .get(&path.as_str()) - .ok_or_else(|| anyhow::anyhow!("Named actor not found locally"))? - .clone(); - - // Look up local_id from actor_names, then get handle - let local_id = self - .actor_names - .get(&actor_name) - .ok_or_else(|| anyhow::anyhow!("Actor not found: {}", actor_name))?; - - let handle = self - .local_actors - .get(local_id.value()) - .ok_or_else(|| anyhow::anyhow!("Actor handle not found: {}", actor_name))?; - - return Ok(ActorRef::local(handle.actor_id, handle.sender.clone())); - } - - // Remote actor - let transport = - Http2RemoteTransport::new_named(self.transport.client(), target.addr, path.clone()); - - let actor_id = ActorId::new(target.node_id, 0); - Ok(ActorRef::remote(actor_id, target.addr, Arc::new(transport))) - } - - /// Select an instance using load balancing policy - fn select_instance<'a>( - &self, - instances: &'a [MemberInfo], - policy: &dyn LoadBalancingPolicy, - ) -> &'a MemberInfo { - // Convert MemberInfo to Worker wrappers with load tracking - let workers: Vec> = instances - .iter() - .map(|m| { - // Get or create load tracker for this node - let tracker = self - .node_load - .entry(m.addr) - .or_insert_with(|| Arc::new(NodeLoadTracker::new())) - .clone(); - Arc::new(MemberWorker::new(m, tracker)) as Arc - }) - .collect(); - - // Use the policy to select a worker index - let idx = policy.select_worker(&workers, None).unwrap_or(0); - - // Increment load for selected node (caller should decrement after request completes) - if let Some(tracker) = self.node_load.get(&instances[idx].addr) { - tracker.increment(); - } - - &instances[idx] - } - - /// Get load tracker for a node address - pub fn get_node_load_tracker(&self, addr: &SocketAddr) 
-> Option> { - self.node_load.get(addr).map(|r| r.clone()) - } - - /// Decrement load after a request completes - pub fn decrement_node_load(&self, addr: &SocketAddr) { - if let Some(tracker) = self.node_load.get(addr) { - tracker.decrement(); - tracker.increment_processed(); - } - } - - /// Clean up stale node load trackers to prevent memory leaks - /// - /// Removes entries for nodes that have not been active for longer than the threshold. - /// Call this periodically (e.g., every few minutes) in long-running systems. - /// - /// # Arguments - /// * `stale_threshold` - Remove trackers inactive for longer than this duration - /// - /// # Returns - /// Number of entries removed - pub fn cleanup_stale_node_trackers(&self, stale_threshold: Duration) -> usize { - let before = self.node_load.len(); - self.node_load.retain(|_addr, tracker| { - // Keep entries that are still active OR have in-flight requests - !tracker.is_stale(stale_threshold) || tracker.load() > 0 - }); - let removed = before - self.node_load.len(); - if removed > 0 { - tracing::debug!( - removed = removed, - remaining = self.node_load.len(), - "Cleaned up stale node load trackers" - ); - } - removed - } - - /// Get the number of tracked nodes - pub fn tracked_node_count(&self) -> usize { - self.node_load.len() - } - - /// Resolve an actor address and get an ActorRef - pub async fn resolve(&self, address: &ActorAddress) -> anyhow::Result { - match address { - ActorAddress::Named { path, instance } => { - self.resolve_named(path, instance.as_ref()).await - } - ActorAddress::Global { node_id, actor_id } => { - let id = ActorId::new(*node_id, *actor_id); - self.actor_ref(&id).await - } - } - } - - /// Get all instances of a named actor across the cluster - pub async fn get_named_instances(&self, path: &ActorPath) -> Vec { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.get_named_actor_instances(path).await - } else { - Vec::new() - } - } - - /// 
Resolve all instances of a named actor as ActorRefs - pub async fn resolve_all_instances( - &self, - path: &ActorPath, - filter_alive: bool, - ) -> anyhow::Result> { - let cluster_guard = self.cluster.read().await; - let cluster = cluster_guard - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Cluster not initialized"))?; - - let instances = cluster.get_named_actor_instances_detailed(path).await; - - let mut refs = Vec::new(); - for (member, instance_opt) in instances { - // Filter by alive status if requested - if filter_alive && member.status != MemberStatus::Alive { - continue; - } - - // Get actor_id from instance info - if let Some(instance) = instance_opt { - let actor_ref = self.actor_ref(&instance.actor_id).await?; - refs.push(actor_ref); - } - } - - Ok(refs) - } - - /// Get detailed instances with actor_id and metadata - pub async fn get_named_instances_detailed( - &self, - path: &ActorPath, - ) -> Vec<(MemberInfo, Option)> { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.get_named_actor_instances_detailed(path).await - } else { - Vec::new() - } - } - - /// Lookup named actor information - pub async fn lookup_named(&self, path: &ActorPath) -> Option { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.lookup_named_actor(path).await - } else { - None - } - } - - /// Get cluster member information - pub async fn members(&self) -> Vec { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.all_members().await - } else { - Vec::new() - } - } - - /// Get all named actors in the cluster - pub async fn all_named_actors(&self) -> Vec { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.all_named_actors().await - } else { - Vec::new() - } - } - - // ========== Stop Methods ========== - - /// Default timeout for graceful actor shutdown (30 
seconds) - const GRACEFUL_STOP_TIMEOUT: Duration = Duration::from_secs(30); - - /// Stop an actor gracefully - /// - /// This method first signals the actor to stop via its cancellation token, - /// waits for it to finish (with timeout), then performs cleanup. - /// If the actor doesn't stop within the timeout, it will be forcefully aborted. - pub async fn stop(&self, name: impl AsRef) -> anyhow::Result<()> { - self.stop_with_reason(name, StopReason::Killed).await - } - - /// Stop an actor with a specific reason - /// - /// Performs graceful shutdown: - /// 1. Cancels the actor's cancellation token (triggers `on_stop()`) - /// 2. Waits for the actor to finish (with 30s timeout) - /// 3. If timeout, forcefully aborts the actor task - /// 4. Handles lifecycle cleanup (watch notifications, cluster broadcast, etc.) - /// - /// Note: If the name doesn't contain a "/" and no actor is found with the exact name, - /// it will try with the "actors/" prefix (for Python compatibility). - pub async fn stop_with_reason( - &self, - name: impl AsRef, - reason: StopReason, - ) -> anyhow::Result<()> { - let name = name.as_ref(); - - // Try exact name first, then normalized name with "actors/" prefix - let actual_name = if self.actor_names.contains_key(name) { - name.to_string() - } else if !name.contains('/') { - // Try with "actors/" prefix (Python API compatibility) - let prefixed = format!("actors/{}", name); - if self.actor_names.contains_key(&prefixed) { - prefixed - } else { - name.to_string() - } - } else { - name.to_string() - }; - - // Get local_id from actor_names, then remove from local_actors - if let Some((_, local_id)) = self.actor_names.remove(&actual_name) { - if let Some((_, handle)) = self.local_actors.remove(&local_id) { - // 1. Signal the actor to stop gracefully - handle.cancel_token.cancel(); - - // 2. 
Wait for the actor to finish with timeout - match tokio::time::timeout(Self::GRACEFUL_STOP_TIMEOUT, handle.join_handle).await { - Ok(_) => { - // Actor stopped gracefully - tracing::debug!(actor = %actual_name, "Actor stopped gracefully"); - } - Err(_) => { - // Timeout - actor didn't respond to cancel signal - // This shouldn't happen normally, but we log a warning - tracing::warn!( - actor = %actual_name, - "Actor didn't stop gracefully within timeout, already aborted by tokio" - ); - } - } - - // 3. Handle lifecycle cleanup - let actor_names = self.actor_names.clone(); - let local_actors = self.local_actors.clone(); - self.lifecycle - .handle_termination( - &handle.actor_id, - &actual_name, - handle.named_path.clone(), - reason, - &self.named_actor_paths, - &self.cluster, - |n| { - actor_names.get(n).and_then(|id| { - local_actors.get(id.value()).map(|h| h.sender.clone()) - }) - }, - ) - .await; - } - } - - Ok(()) - } - - /// Stop a named actor by path - pub async fn stop_named(&self, path: &ActorPath) -> anyhow::Result<()> { - self.stop_named_with_reason(path, StopReason::Killed).await - } - - /// Stop a named actor by path with a specific reason - pub async fn stop_named_with_reason( - &self, - path: &ActorPath, - reason: StopReason, - ) -> anyhow::Result<()> { - let path_key = path.as_str(); - - // Find the local actor name for this path - if let Some(actor_name_ref) = self.named_actor_paths.get(&path_key) { - let actor_name = actor_name_ref.clone(); - drop(actor_name_ref); - - // Get local_id from actor_names, then remove from local_actors - if let Some((_, local_id)) = self.actor_names.remove(&actor_name) { - if let Some((_, handle)) = self.local_actors.remove(&local_id) { - // 1. Signal the actor to stop gracefully - handle.cancel_token.cancel(); - - // 2. 
Wait for the actor to finish with timeout - match tokio::time::timeout(Self::GRACEFUL_STOP_TIMEOUT, handle.join_handle) - .await - { - Ok(_) => { - tracing::debug!(actor = %actor_name, path = %path_key, "Actor stopped gracefully"); - } - Err(_) => { - tracing::warn!( - actor = %actor_name, - path = %path_key, - "Actor didn't stop gracefully within timeout" - ); - } - } - - // 3. Handle lifecycle cleanup - let actor_names = self.actor_names.clone(); - let local_actors = self.local_actors.clone(); - self.lifecycle - .handle_termination( - &handle.actor_id, - &actor_name, - Some(path.clone()), - reason, - &self.named_actor_paths, - &self.cluster, - |name| { - actor_names.get(name).and_then(|id| { - local_actors.get(id.value()).map(|h| h.sender.clone()) - }) - }, - ) - .await; - } - } - } - - Ok(()) - } - - /// Shutdown the entire actor system - /// - /// This method performs a graceful shutdown: - /// 1. Signals cancellation to all actors (via parent cancel token, which cancels all child tokens) - /// 2. Waits for actors to stop gracefully (with timeout) - /// 3. Triggers lifecycle cleanup for each actor (watch notifications, cluster broadcast, etc.) - /// 4. Leaves the cluster gracefully - /// 5. 
Clears all actors and watch relationships - pub async fn shutdown(&self) -> anyhow::Result<()> { - tracing::info!("Shutting down actor system"); - - // Signal cancellation first - this cancels the parent token, - // which automatically cancels all child tokens (individual actor tokens) - // This triggers the `cancel.cancelled()` branch in each actor's message loop, - // allowing them to call `on_stop()` gracefully - self.cancel_token.cancel(); - - // Give actors a short time to process the cancellation signal - // Since all actors share the same parent token, they should all start stopping - tokio::time::sleep(Duration::from_millis(100)).await; - - // Collect all actor info (local_id, actor_id, name, named_path) - let actor_entries: Vec<_> = self - .local_actors - .iter() - .map(|entry| { - let local_id = *entry.key(); - let actor_id = entry.actor_id; - let named_path = entry.named_path.clone(); - // Find name from actor_names (reverse lookup) - let name = self - .actor_names - .iter() - .find(|e| *e.value() == local_id) - .map(|e| e.key().clone()) - .unwrap_or_else(|| actor_id.to_string()); - (local_id, actor_id, name, named_path) - }) - .collect(); - - // Process each actor's termination - for (local_id, actor_id, actor_name, named_path) in actor_entries { - // Remove from actor_names first - self.actor_names.remove(&actor_name); - - // Remove and get ownership of the handle - if let Some((_, handle)) = self.local_actors.remove(&local_id) { - // Wait briefly for graceful shutdown (actor should already be stopping due to parent cancel) - // Use a shorter timeout since we already signaled cancellation - match tokio::time::timeout(Duration::from_secs(5), handle.join_handle).await { - Ok(_) => { - tracing::debug!(actor = %actor_name, "Actor stopped gracefully during shutdown"); - } - Err(_) => { - // Timeout - this shouldn't happen normally since cancel was already called - tracing::warn!( - actor = %actor_name, - "Actor didn't stop within timeout during shutdown" - 
); - } - } - - // Trigger lifecycle cleanup (watch notifications, cluster broadcast, routing cleanup) - let actor_names = self.actor_names.clone(); - let local_actors = self.local_actors.clone(); - self.lifecycle - .handle_termination( - &actor_id, - &actor_name, - named_path, - StopReason::SystemShutdown, - &self.named_actor_paths, - &self.cluster, - |name| { - actor_names.get(name).and_then(|id| { - local_actors.get(id.value()).map(|h| h.sender.clone()) - }) - }, - ) - .await; - } - } - - // Clear all actors (should already be empty, but just in case) - self.local_actors.clear(); - self.actor_names.clear(); - - // Clear node load trackers - self.node_load.clear(); - - // Clear all watch relationships - self.lifecycle.clear().await; - - // Leave cluster gracefully - { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.leave().await?; - } - } - - tracing::info!("Actor system shutdown complete"); - Ok(()) - } - - /// Get cancellation token - pub fn cancel_token(&self) -> CancellationToken { - self.cancel_token.clone() - } } #[async_trait::async_trait] diff --git a/crates/pulsing-actor/src/system/resolve.rs b/crates/pulsing-actor/src/system/resolve.rs new file mode 100644 index 000000000..40959aa1c --- /dev/null +++ b/crates/pulsing-actor/src/system/resolve.rs @@ -0,0 +1,338 @@ +//! Actor resolution logic +//! +//! This module contains the implementation of actor resolution methods +//! that are used by the ActorSystem for locating actors by ID or name. 
+ +use crate::actor::{ + ActorAddress, ActorId, ActorPath, ActorRef, ActorResolver, IntoActorPath, NodeId, +}; +use crate::cluster::{MemberInfo, MemberStatus, NamedActorInfo}; +use crate::policies::LoadBalancingPolicy; +use crate::system::config::ResolveOptions; +use crate::system::load_balancer::{MemberWorker, NodeLoadTracker}; +use crate::system::ActorSystem; +use crate::transport::Http2RemoteTransport; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + +impl ActorSystem { + async fn cluster_opt(&self) -> Option> { + self.cluster.read().await.as_ref().cloned() + } + + async fn cluster_or_err(&self) -> anyhow::Result> { + self.cluster_opt() + .await + .ok_or_else(|| anyhow::anyhow!("Cluster not initialized")) + } + + /// Get ActorRef for a local or remote actor by ID + /// + /// This is an O(1) operation for local actors using local_id indexing. + pub async fn actor_ref(&self, id: &ActorId) -> anyhow::Result { + // Check if local + if id.node() == self.node_id || id.node().is_local() { + // O(1) lookup by local_id + let handle = self + .local_actors + .get(&id.local_id()) + .ok_or_else(|| anyhow::anyhow!("Local actor not found: {}", id))?; + return Ok(ActorRef::local(handle.actor_id, handle.sender.clone())); + } + + // Remote actor - get address from cluster + let cluster = self.cluster_or_err().await?; + + let member = cluster + .get_member(&id.node()) + .await + .ok_or_else(|| anyhow::anyhow!("Node not found in cluster: {}", id.node()))?; + + // Create remote transport using actor id + let transport = Http2RemoteTransport::new_by_id(self.transport.client(), member.addr, *id); + + Ok(ActorRef::remote(*id, member.addr, Arc::new(transport))) + } + + /// Resolve a named actor by path (direct resolution) + /// + /// Returns an ActorRef that points to the current location of the named actor. + /// Note: If the actor migrates, this reference may become stale. + /// For actors that may migrate, consider using `resolve_named_lazy`. 
+ /// + /// # Example + /// ```rust,ignore + /// let actor = system.resolve_named("services/echo", None).await?; + /// ``` + pub async fn resolve_named

( + &self, + path: P, + node_id: Option<&NodeId>, + ) -> anyhow::Result + where + P: IntoActorPath, + { + let path = path.into_actor_path()?; + let options = if let Some(nid) = node_id { + ResolveOptions::new().node_id(*nid) + } else { + ResolveOptions::new() + }; + self.resolve_named_with_options(&path, options).await + } + + /// Resolve a named actor with lazy resolution (re-resolves after cache expires) + /// + /// Returns an ActorRef that automatically re-resolves after 5 seconds. + /// This is useful for named actors that may migrate between nodes. + /// + /// # Example + /// ```rust,ignore + /// let actor = system.resolve_named_lazy("services/echo").await?; + /// // Even if the actor migrates, this ref will find it after cache expires + /// ``` + pub fn resolve_named_lazy

(self: &Arc, path: P) -> anyhow::Result + where + P: IntoActorPath, + { + let path = path.into_actor_path()?; + Ok(ActorRef::lazy(path, self.clone() as Arc)) + } + + /// Internal: Direct resolution for ActorResolver trait + pub(crate) async fn resolve_named_direct( + &self, + path: &ActorPath, + node_id: Option<&NodeId>, + ) -> anyhow::Result { + let options = if let Some(nid) = node_id { + ResolveOptions::new().node_id(*nid) + } else { + ResolveOptions::new() + }; + self.resolve_named_with_options(path, options).await + } + + /// Resolve a named actor with custom options (load balancing, health filtering) + pub async fn resolve_named_with_options( + &self, + path: &ActorPath, + options: ResolveOptions, + ) -> anyhow::Result { + let cluster = self.cluster_or_err().await?; + + let instances = cluster.get_named_actor_instances(path).await; + + if instances.is_empty() { + return Err(anyhow::anyhow!("Named actor not found: {}", path.as_str())); + } + + let healthy_instances: Vec<_> = if options.filter_alive { + instances + .into_iter() + .filter(|i| i.status == MemberStatus::Alive) + .collect() + } else { + instances + }; + + if healthy_instances.is_empty() { + return Err(anyhow::anyhow!( + "No healthy instances for named actor: {}", + path.as_str() + )); + } + + let target = if let Some(nid) = options.node_id { + healthy_instances + .iter() + .find(|i| i.node_id == nid) + .ok_or_else(|| anyhow::anyhow!("Actor instance not found on node: {}", nid))? + } else { + let policy = options.policy.as_ref().unwrap_or(&self.default_lb_policy); + self.select_instance(&healthy_instances, policy.as_ref()) + }; + + if target.node_id == self.node_id { + let actor_name = self + .named_actor_paths + .get(&path.as_str()) + .ok_or_else(|| anyhow::anyhow!("Named actor not found locally"))? 
+ .clone(); + + let local_id = self + .actor_names + .get(&actor_name) + .ok_or_else(|| anyhow::anyhow!("Actor not found: {}", actor_name))?; + + let handle = self + .local_actors + .get(local_id.value()) + .ok_or_else(|| anyhow::anyhow!("Actor handle not found: {}", actor_name))?; + + return Ok(ActorRef::local(handle.actor_id, handle.sender.clone())); + } + + let transport = + Http2RemoteTransport::new_named(self.transport.client(), target.addr, path.clone()); + + let actor_id = ActorId::new(target.node_id, 0); + Ok(ActorRef::remote(actor_id, target.addr, Arc::new(transport))) + } + + /// Select an instance using load balancing policy + pub(crate) fn select_instance<'a>( + &self, + instances: &'a [MemberInfo], + policy: &dyn LoadBalancingPolicy, + ) -> &'a MemberInfo { + let workers: Vec> = instances + .iter() + .map(|m| { + let tracker = self + .node_load + .entry(m.addr) + .or_insert_with(|| Arc::new(NodeLoadTracker::new())) + .clone(); + Arc::new(MemberWorker::new(m, tracker)) as Arc + }) + .collect(); + + let idx = policy.select_worker(&workers, None).unwrap_or(0); + + if let Some(tracker) = self.node_load.get(&instances[idx].addr) { + tracker.increment(); + } + + &instances[idx] + } + + /// Get load tracker for a node address + pub fn get_node_load_tracker(&self, addr: &SocketAddr) -> Option> { + self.node_load.get(addr).map(|r| r.clone()) + } + + /// Decrement load after a request completes + pub fn decrement_node_load(&self, addr: &SocketAddr) { + if let Some(tracker) = self.node_load.get(addr) { + tracker.decrement(); + tracker.increment_processed(); + } + } + + /// Clean up stale node load trackers to prevent memory leaks + /// + /// Removes entries for nodes that have not been active for longer than the threshold. + /// Call this periodically (e.g., every few minutes) in long-running systems. 
+ /// + /// # Arguments + /// * `stale_threshold` - Remove trackers inactive for longer than this duration + /// + /// # Returns + /// Number of entries removed + pub fn cleanup_stale_node_trackers(&self, stale_threshold: Duration) -> usize { + let before = self.node_load.len(); + self.node_load.retain(|_addr, tracker| { + // Keep entries that are still active OR have in-flight requests + !tracker.is_stale(stale_threshold) || tracker.load() > 0 + }); + let removed = before - self.node_load.len(); + if removed > 0 { + tracing::debug!( + removed = removed, + remaining = self.node_load.len(), + "Cleaned up stale node load trackers" + ); + } + removed + } + + /// Get the number of tracked nodes + pub fn tracked_node_count(&self) -> usize { + self.node_load.len() + } + + /// Resolve an actor address and get an ActorRef + pub async fn resolve(&self, address: &ActorAddress) -> anyhow::Result { + match address { + ActorAddress::Named { path, instance } => { + self.resolve_named(path, instance.as_ref()).await + } + ActorAddress::Global { node_id, actor_id } => { + let id = ActorId::new(*node_id, *actor_id); + self.actor_ref(&id).await + } + } + } + + /// Get all instances of a named actor across the cluster + pub async fn get_named_instances(&self, path: &ActorPath) -> Vec { + match self.cluster_opt().await { + Some(cluster) => cluster.get_named_actor_instances(path).await, + None => Vec::new(), + } + } + + /// Resolve all instances of a named actor as ActorRefs + pub async fn resolve_all_instances( + &self, + path: &ActorPath, + filter_alive: bool, + ) -> anyhow::Result> { + let cluster = self.cluster_or_err().await?; + + let instances = cluster.get_named_actor_instances_detailed(path).await; + + let mut refs = Vec::new(); + for (member, instance_opt) in instances { + // Filter by alive status if requested + if filter_alive && member.status != MemberStatus::Alive { + continue; + } + + // Get actor_id from instance info + if let Some(instance) = instance_opt { + let 
actor_ref = self.actor_ref(&instance.actor_id).await?; + refs.push(actor_ref); + } + } + + Ok(refs) + } + + /// Get detailed instances with actor_id and metadata + pub async fn get_named_instances_detailed( + &self, + path: &ActorPath, + ) -> Vec<(MemberInfo, Option)> { + match self.cluster_opt().await { + Some(cluster) => cluster.get_named_actor_instances_detailed(path).await, + None => Vec::new(), + } + } + + /// Lookup named actor information + pub async fn lookup_named(&self, path: &ActorPath) -> Option { + match self.cluster_opt().await { + Some(cluster) => cluster.lookup_named_actor(path).await, + None => None, + } + } + + /// Get cluster member information + pub async fn members(&self) -> Vec { + match self.cluster_opt().await { + Some(cluster) => cluster.all_members().await, + None => Vec::new(), + } + } + + /// Get all named actors in the cluster + pub async fn all_named_actors(&self) -> Vec { + match self.cluster_opt().await { + Some(cluster) => cluster.all_named_actors().await, + None => Vec::new(), + } + } +} diff --git a/crates/pulsing-actor/src/system/spawn.rs b/crates/pulsing-actor/src/system/spawn.rs new file mode 100644 index 000000000..4709fd7aa --- /dev/null +++ b/crates/pulsing-actor/src/system/spawn.rs @@ -0,0 +1,247 @@ +//! Actor spawning logic +//! +//! This module contains the implementation of actor spawning methods +//! that are used by the ActorSystem. 
+ +use crate::actor::{ + Actor, ActorContext, ActorId, ActorRef, ActorSystemRef, IntoActor, IntoActorPath, Mailbox, +}; +use crate::system::config::SpawnOptions; +use crate::system::handle::{ActorStats, LocalActorHandle}; +use crate::system::runtime::{run_actor_instance, run_supervision_loop}; +use crate::system::ActorSystem; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +impl ActorSystem { + /// Create a once-use factory from an actor instance + pub(crate) fn once_factory(actor: A) -> impl FnMut() -> anyhow::Result { + let mut actor_opt = Some(actor); + move || { + actor_opt + .take() + .ok_or_else(|| anyhow::anyhow!("Actor cannot be restarted (spawned as instance)")) + } + } + + /// Spawn an anonymous actor (no name, only accessible via ActorRef) + /// + /// Note: Anonymous actors do not support supervision/restart because they have + /// no stable identity for re-resolution. Use `spawn_named_factory` for actors + /// that need supervision. + pub async fn spawn_anonymous(self: &Arc, actor: A) -> anyhow::Result + where + A: IntoActor, + { + self.spawn_anonymous_with_options(actor.into_actor(), SpawnOptions::default()) + .await + } + + /// Spawn an anonymous actor with custom options + pub async fn spawn_anonymous_with_options( + self: &Arc, + actor: A, + options: SpawnOptions, + ) -> anyhow::Result + where + A: IntoActor, + { + let actor = actor.into_actor(); + let actor_id = self.next_actor_id(); + + let mailbox = Mailbox::with_capacity(self.mailbox_capacity(&options)); + let (sender, receiver) = mailbox.split(); + + let stats = Arc::new(ActorStats::default()); + + let actor_cancel = self.cancel_token.child_token(); + + let ctx = Self::build_context(self, actor_id, &sender, &actor_cancel, None); + + let stats_clone = stats.clone(); + let cancel = actor_cancel.clone(); + let actor_id_for_log = actor_id; + + let join_handle = tokio::spawn(async move { + let mut receiver = receiver; + let mut ctx = ctx; + let reason = + run_actor_instance(actor, &mut 
receiver, &mut ctx, cancel, stats_clone).await; + tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Anonymous actor stopped"); + }); + + let local_id = actor_id.local_id(); + let handle = LocalActorHandle { + sender: sender.clone(), + join_handle, + cancel_token: actor_cancel, + stats: stats.clone(), + metadata: options.metadata.clone(), + named_path: None, + actor_id, + }; + + self.local_actors.insert(local_id, handle); + self.actor_names.insert(actor_id.to_string(), local_id); + + Ok(ActorRef::local(actor_id, sender)) + } + + /// Spawn a named actor (resolvable by name across the cluster) + /// + /// # Example + /// ```rust,ignore + /// // Name is used as both path (for resolution) and local name + /// system.spawn_named("services/echo", MyActor).await?; + /// ``` + pub async fn spawn_named(self: &Arc, name: P, actor: A) -> anyhow::Result + where + P: IntoActorPath, + A: IntoActor, + { + let path = name.into_actor_path()?; + self.spawn_named_factory( + path, + Self::once_factory(actor.into_actor()), + SpawnOptions::default(), + ) + .await + } + + /// Spawn a named actor with custom options + pub async fn spawn_named_with_options( + self: &Arc, + name: P, + actor: A, + options: SpawnOptions, + ) -> anyhow::Result + where + P: IntoActorPath, + A: IntoActor, + { + let path = name.into_actor_path()?; + self.spawn_named_factory(path, Self::once_factory(actor.into_actor()), options) + .await + } + + /// Spawn a named actor using a factory function + pub async fn spawn_named_factory( + self: &Arc, + name: P, + factory: F, + options: SpawnOptions, + ) -> anyhow::Result + where + P: IntoActorPath, + F: FnMut() -> anyhow::Result + Send + 'static, + A: Actor, + { + let path = name.into_actor_path()?; + let name_str = path.as_str(); + + if self.actor_names.contains_key(&name_str.to_string()) { + return Err(anyhow::anyhow!("Actor already exists: {}", name_str)); + } + + if self.named_actor_paths.contains_key(&name_str.to_string()) { + return 
Err(anyhow::anyhow!( + "Named path already registered: {}", + name_str + )); + } + + let actor_id = self.next_actor_id(); + let local_id = actor_id.local_id(); + + let mailbox = Mailbox::with_capacity(self.mailbox_capacity(&options)); + let (sender, receiver) = mailbox.split(); + + let stats = Arc::new(ActorStats::default()); + let metadata = options.metadata.clone(); + + let actor_cancel = self.cancel_token.child_token(); + + let ctx = Self::build_context( + self, + actor_id, + &sender, + &actor_cancel, + Some(name_str.to_string()), + ); + + let stats_clone = stats.clone(); + let cancel = actor_cancel.clone(); + let actor_id_for_log = actor_id; + let supervision = options.supervision.clone(); + + let join_handle = tokio::spawn(async move { + let reason = + run_supervision_loop(factory, receiver, ctx, cancel, stats_clone, supervision) + .await; + tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Actor stopped"); + }); + + let handle = LocalActorHandle { + sender: sender.clone(), + join_handle, + cancel_token: actor_cancel, + stats: stats.clone(), + metadata: metadata.clone(), + named_path: Some(path.clone()), + actor_id, + }; + + self.local_actors.insert(local_id, handle); + self.actor_names.insert(name_str.to_string(), local_id); + self.named_actor_paths + .insert(name_str.to_string(), name_str.to_string()); + + if let Some(cluster) = self.cluster.read().await.as_ref() { + if metadata.is_empty() { + cluster.register_named_actor(path.clone()).await; + } else { + cluster + .register_named_actor_full(path.clone(), actor_id, metadata) + .await; + } + } + + Ok(ActorRef::local(actor_id, sender)) + } + + /// Generate a new unique local actor ID + pub(crate) fn next_actor_id(&self) -> ActorId { + let local_id = self.actor_id_counter.fetch_add(1, Ordering::Relaxed); + ActorId::new(self.node_id, local_id) + } + + fn mailbox_capacity(&self, options: &SpawnOptions) -> usize { + options + .mailbox_capacity + .unwrap_or(self.default_mailbox_capacity) + } + + fn 
build_context( + system: &Arc, + actor_id: ActorId, + sender: &tokio::sync::mpsc::Sender, + cancel: &tokio_util::sync::CancellationToken, + name: Option, + ) -> ActorContext { + match name { + Some(name) => ActorContext::with_system_and_name( + actor_id, + system.clone() as Arc, + cancel.clone(), + sender.clone(), + Some(name), + ), + None => ActorContext::with_system( + actor_id, + system.clone() as Arc, + cancel.clone(), + sender.clone(), + ), + } + } +} diff --git a/crates/pulsing-actor/src/system/traits.rs b/crates/pulsing-actor/src/system/traits.rs index 15780810e..67355b908 100644 --- a/crates/pulsing-actor/src/system/traits.rs +++ b/crates/pulsing-actor/src/system/traits.rs @@ -1,9 +1,4 @@ -//! Actor System Extension Traits -//! -//! This module defines the public API surface for ActorSystem through traits: -//! - [`ActorSystemCoreExt`] - Core spawn and resolve operations (primary API) -//! - [`ActorSystemAdvancedExt`] - Factory-based spawning for supervision/restart -//! - [`ActorSystemOpsExt`] - Operations, introspection, and lifecycle management +//! ActorSystem extension traits. use std::collections::HashMap; use std::net::SocketAddr; @@ -20,49 +15,7 @@ use crate::policies::LoadBalancingPolicy; use tokio_util::sync::CancellationToken; -// ============================================================================= -// Core Trait: Spawn + Resolve (Primary API) -// ============================================================================= - /// Core API for spawning and resolving actors. -/// -/// This trait defines the primary interface for creating and locating actors. -/// It is automatically implemented for `Arc` and re-exported in prelude. 
-/// -/// # Spawn Methods -/// - [`spawn`](Self::spawn) - Spawn an anonymous actor (not resolvable by name) -/// - [`spawn_named`](Self::spawn_named) - Spawn a named actor (resolvable by name) -/// - [`spawning`](Self::spawning) - Get a builder for advanced spawn options -/// -/// # Resolve Methods -/// - [`actor_ref`](Self::actor_ref) - Get ActorRef by ActorId -/// - [`resolve`](Self::resolve) - Resolve a named actor by name -/// - [`resolve_with_options`](Self::resolve_with_options) - Resolve with load balancing/filtering -/// - [`resolve_lazy`](Self::resolve_lazy) - Lazy resolution with auto-refresh -/// -/// # Example -/// ```rust,ignore -/// use pulsing_actor::prelude::*; -/// -/// let system = ActorSystem::builder().build().await?; -/// -/// // Spawn an anonymous actor (only accessible via ActorRef) -/// let worker = system.spawn(Worker::new()).await?; -/// -/// // Spawn a named actor (resolvable by name) -/// let echo = system.spawn_named("services/echo", EchoService).await?; -/// -/// // Spawn with builder for advanced options -/// let counter = system.spawning() -/// .name("services/counter") -/// .supervision(SupervisionSpec::on_failure().max_restarts(3)) -/// .mailbox_capacity(256) -/// .spawn(Counter::new()) -/// .await?; -/// -/// // Resolve by name -/// let echo_ref = system.resolve("services/echo").await?; -/// ``` #[async_trait::async_trait] pub trait ActorSystemCoreExt: Sized { /// Spawn an anonymous actor (not resolvable by name, only accessible via ActorRef) @@ -93,27 +46,13 @@ pub trait ActorSystemCoreExt: Sized { where A: IntoActor; - /// Get a builder for spawning actors with advanced options - /// - /// # Example - /// ```rust,ignore - /// let actor = system.spawning() - /// .name("services/worker") - /// .supervision(SupervisionSpec::on_failure().max_restarts(3)) - /// .mailbox_capacity(1024) - /// .spawn(Worker::new()) - /// .await?; - /// ``` + /// Get a builder for spawning actors with advanced options. 
fn spawning(&self) -> SpawnBuilder<'_>; /// Get ActorRef for a local or remote actor by ID async fn actor_ref(&self, id: &ActorId) -> anyhow::Result; - /// Resolve a named actor by name - /// - /// Returns an ActorRef that points to the current location of the named actor. - /// Note: If the actor migrates, this reference may become stale. - /// For actors that may migrate, consider using [`resolve_lazy`](Self::resolve_lazy). + /// Resolve a named actor by name. async fn resolve

(&self, name: P) -> anyhow::Result where P: IntoActorPath + Send; @@ -125,48 +64,11 @@ pub trait ActorSystemCoreExt: Sized { options: ResolveOptions, ) -> anyhow::Result; - /// Get a builder for resolving actors with advanced options - /// - /// # Example - /// ```rust,ignore - /// // With load balancing - /// let actor = system.resolving() - /// .policy(RoundRobinPolicy::new()) - /// .resolve("services/worker").await?; - /// - /// // List all instances - /// let actors = system.resolving() - /// .list("services/worker").await?; - /// - /// // Lazy resolve - /// let actor = system.resolving() - /// .lazy("services/worker")?; - /// ``` + /// Get a builder for resolving actors with advanced options. fn resolving(&self) -> ResolveBuilder<'_>; } -// ============================================================================= -// SpawnBuilder: Fluent API for spawning actors -// ============================================================================= - /// Builder for spawning actors with advanced options. -/// -/// # Example -/// ```rust,ignore -/// // Anonymous actor with supervision -/// let worker = system.spawning() -/// .supervision(SupervisionSpec::on_failure().max_restarts(3)) -/// .spawn(Worker::new()) -/// .await?; -/// -/// // Named actor with full options -/// let service = system.spawning() -/// .name("services/counter") -/// .supervision(SupervisionSpec::on_failure().max_restarts(5)) -/// .mailbox_capacity(512) -/// .spawn(CounterService::new()) -/// .await?; -/// ``` pub struct SpawnBuilder<'a> { system: &'a Arc, name: Option, @@ -239,30 +141,7 @@ impl<'a> SpawnBuilder<'a> { } } -// ============================================================================= -// ResolveBuilder: Fluent API for resolving actors -// ============================================================================= - /// Builder for resolving actors with advanced options. 
-/// -/// # Example -/// ```rust,ignore -/// // Simple resolve -/// let actor = system.resolve("services/counter").await?; -/// -/// // With load balancing policy -/// let actor = system.resolving() -/// .policy(RoundRobinPolicy::new()) -/// .resolve("services/counter").await?; -/// -/// // Get all instances -/// let actors = system.resolving() -/// .list("services/counter").await?; -/// -/// // Lazy resolve (auto re-resolves on stale) -/// let actor = system.resolving() -/// .lazy("services/counter")?; -/// ``` pub struct ResolveBuilder<'a> { system: &'a Arc, node_id: Option, @@ -353,20 +232,7 @@ impl<'a> ResolveBuilder<'a> { /// cannot be restarted. Use `spawn_named_factory` if you need supervision with /// restart capability. Anonymous actors do not support supervision. /// -/// # Example -/// ```rust,ignore -/// use pulsing_actor::prelude::*; -/// -/// let system = ActorSystem::builder().build().await?; /// -/// let options = SpawnOptions::new() -/// .supervision(SupervisionSpec::new() -/// .restart_policy(RestartPolicy::OnFailure) -/// .max_restarts(3)); -/// -/// // Spawn named actor with factory (only named actors support supervision) -/// let named = system.spawn_named_factory("services/worker", || Ok(Worker::new()), options).await?; -/// ``` #[async_trait::async_trait] pub trait ActorSystemAdvancedExt { /// Spawn a named actor using a factory function (enables supervision restarts) @@ -385,36 +251,7 @@ pub trait ActorSystemAdvancedExt { A: Actor; } -// ============================================================================= -// Ops Trait: Operations, Introspection, Lifecycle -// ============================================================================= - /// Operations, introspection, and lifecycle management API. -/// -/// This trait provides: -/// - System information (node_id, addr, etc.) 
-/// - Actor listing and lookup -/// - Cluster membership information -/// - Actor stop and system shutdown -/// -/// # Example -/// ```rust,ignore -/// use pulsing_actor::prelude::*; -/// -/// let system = ActorSystem::builder().build().await?; -/// -/// // Get system info -/// println!("Node ID: {}", system.node_id()); -/// println!("Address: {}", system.addr()); -/// -/// // List cluster members -/// for member in system.members().await { -/// println!("Member: {} at {}", member.node_id, member.addr); -/// } -/// -/// // Shutdown -/// system.shutdown().await?; -/// ``` #[async_trait::async_trait] pub trait ActorSystemOpsExt { /// Get SystemActor reference diff --git a/crates/pulsing-actor/src/test_helper.rs b/crates/pulsing-actor/src/test_helper.rs new file mode 100644 index 000000000..1c6ebec79 --- /dev/null +++ b/crates/pulsing-actor/src/test_helper.rs @@ -0,0 +1,395 @@ +//! Test helper macros and utilities +//! +//! This module provides reusable test infrastructure to eliminate +//! repetitive test patterns across the codebase. +//! +//! # Example +//! +//! ```rust,ignore +//! use pulsing_actor::test_helper::*; +//! use pulsing_actor::actor_test; +//! +//! actor_test!(test_basic_echo, system, { +//! let echo = spawn_echo_actor(&system, "test/echo").await; +//! let response: TestPong = echo.ask(TestPing { value: 21 }).await.unwrap(); +//! assert_eq!(response.result, 42); +//! }); +//! 
``` + +use crate::actor::{Actor, ActorContext, ActorRef, Message}; +use crate::system::{ActorSystem, SystemConfig}; +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +// ============================================================================ +// Common Test Messages +// ============================================================================ + +/// Simple ping message for testing +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct TestPing { + pub value: i32, +} + +/// Simple pong response for testing +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct TestPong { + pub result: i32, +} + +/// Accumulate message for stateful actor testing +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TestAccumulate { + pub amount: i32, +} + +/// Get total message for stateful actor testing +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TestGetTotal; + +/// Total response for stateful actor testing +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TestTotalResponse { + pub total: i32, +} + +// ============================================================================ +// Common Test Actors +// ============================================================================ + +/// Echo actor that doubles the input value +/// +/// Useful for basic request-response testing +pub struct TestEchoActor { + /// Counter to track how many messages were processed + pub echo_count: Arc, +} + +impl TestEchoActor { + /// Create a new echo actor with an internal counter + pub fn new() -> Self { + Self { + echo_count: Arc::new(AtomicUsize::new(0)), + } + } + + /// Create a new echo actor with a shared counter + pub fn with_counter(counter: Arc) -> Self { + Self { + echo_count: counter, + } + } + + /// Get the number of messages processed + pub fn count(&self) -> usize { + self.echo_count.load(Ordering::SeqCst) + } +} + +impl 
Default for TestEchoActor { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl Actor for TestEchoActor { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + if msg.msg_type().ends_with("TestPing") { + let ping: TestPing = msg.unpack()?; + self.echo_count.fetch_add(1, Ordering::SeqCst); + return Message::pack(&TestPong { + result: ping.value * 2, + }); + } + Err(anyhow::anyhow!("Unknown message type: {}", msg.msg_type())) + } +} + +/// Accumulator actor that maintains a running total +/// +/// Useful for testing stateful actors +pub struct TestAccumulatorActor { + pub total: i32, +} + +impl TestAccumulatorActor { + /// Create a new accumulator with initial value + pub fn new(initial: i32) -> Self { + Self { total: initial } + } +} + +impl Default for TestAccumulatorActor { + fn default() -> Self { + Self::new(0) + } +} + +#[async_trait] +impl Actor for TestAccumulatorActor { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + let msg_type = msg.msg_type(); + if msg_type.ends_with("TestAccumulate") { + let acc: TestAccumulate = msg.unpack()?; + self.total += acc.amount; + return Message::pack(&TestTotalResponse { total: self.total }); + } + if msg_type.ends_with("TestGetTotal") { + return Message::pack(&TestTotalResponse { total: self.total }); + } + Err(anyhow::anyhow!("Unknown message type: {}", msg_type)) + } +} + +// ============================================================================ +// Test Setup Helpers +// ============================================================================ + +/// Create a standalone actor system for testing +/// +/// This creates a system configured for single-node testing without cluster features. 
+pub async fn create_test_system() -> Arc { + ActorSystem::new(SystemConfig::standalone()) + .await + .expect("Failed to create test actor system") +} + +/// Spawn an echo actor with the given name +pub async fn spawn_echo_actor(system: &Arc, name: &str) -> ActorRef { + system + .spawn_named(name, TestEchoActor::new()) + .await + .expect("Failed to spawn echo actor") +} + +/// Spawn an echo actor with a shared counter +pub async fn spawn_echo_actor_with_counter( + system: &Arc, + name: &str, + counter: Arc, +) -> ActorRef { + system + .spawn_named(name, TestEchoActor::with_counter(counter)) + .await + .expect("Failed to spawn echo actor") +} + +/// Spawn an accumulator actor with the given name +pub async fn spawn_accumulator_actor(system: &Arc, name: &str) -> ActorRef { + system + .spawn_named(name, TestAccumulatorActor::default()) + .await + .expect("Failed to spawn accumulator actor") +} + +/// Spawn an accumulator actor with initial value +pub async fn spawn_accumulator_actor_with_initial( + system: &Arc, + name: &str, + initial: i32, +) -> ActorRef { + system + .spawn_named(name, TestAccumulatorActor::new(initial)) + .await + .expect("Failed to spawn accumulator actor") +} + +// ============================================================================ +// Test Macros +// ============================================================================ + +/// Macro for creating actor system tests with automatic setup and teardown +/// +/// This macro handles: +/// - Creating a standalone ActorSystem +/// - Running the test body +/// - Shutting down the system +/// +/// # Example +/// +/// ```rust,ignore +/// actor_test!(test_echo_actor, system, { +/// let echo = spawn_echo_actor(&system, "test/echo").await; +/// let response: TestPong = echo.ask(TestPing { value: 21 }).await.unwrap(); +/// assert_eq!(response.result, 42); +/// }); +/// ``` +#[macro_export] +macro_rules! 
actor_test { + ($test_name:ident, $system:ident, $test_body:block) => { + #[tokio::test] + async fn $test_name() { + let $system = $crate::test_helper::create_test_system().await; + // Execute the test body + $test_body + // Shutdown the system + $system.shutdown().await.expect("Failed to shutdown system"); + } + }; +} + +/// Macro for creating actor system tests that return a Result +/// +/// Similar to `actor_test!` but allows the test body to return a Result for +/// more ergonomic error handling with `?`. +/// +/// # Example +/// +/// ```rust,ignore +/// actor_test_result!(test_echo_actor, system, { +/// let echo = spawn_echo_actor(&system, "test/echo").await; +/// let response: TestPong = echo.ask(TestPing { value: 21 }).await?; +/// assert_eq!(response.result, 42); +/// Ok(()) +/// }); +/// ``` +#[macro_export] +macro_rules! actor_test_result { + ($test_name:ident, $system:ident, $test_body:block) => { + #[tokio::test] + async fn $test_name() -> anyhow::Result<()> { + let $system = $crate::test_helper::create_test_system().await; + // Execute the test body + let test_result: anyhow::Result<()> = $test_body; + // Shutdown the system regardless of test result + $system.shutdown().await?; + test_result + } + }; +} + +/// Macro for creating tests with multiple actors +/// +/// This is a convenience macro for spawning multiple named actors at once. +/// +/// # Example +/// +/// ```rust,ignore +/// spawn_test_actors!(system, { +/// "test/echo1" => TestEchoActor::new(), +/// "test/echo2" => TestEchoActor::new(), +/// "test/acc" => TestAccumulatorActor::default(), +/// }); +/// ``` +#[macro_export] +macro_rules! spawn_test_actors { + ($system:expr, { $($name:expr => $actor:expr),* $(,)? 
}) => { + { + let mut refs = Vec::new(); + $( + refs.push( + $system + .spawn_named($name, $actor) + .await + .expect(concat!("Failed to spawn actor: ", $name)) + ); + )* + refs + } + }; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_create_test_system() { + let system = create_test_system().await; + assert!(!system.local_actor_names().is_empty()); // At least SystemActor + system.shutdown().await.unwrap(); + } + + #[tokio::test] + async fn test_echo_actor() { + let system = create_test_system().await; + let echo = spawn_echo_actor(&system, "test/echo").await; + + let response: TestPong = echo.ask(TestPing { value: 21 }).await.unwrap(); + assert_eq!(response.result, 42); + + system.shutdown().await.unwrap(); + } + + #[tokio::test] + async fn test_echo_actor_with_counter() { + let counter = Arc::new(AtomicUsize::new(0)); + let system = create_test_system().await; + let echo = spawn_echo_actor_with_counter(&system, "test/echo", counter.clone()).await; + + let _: TestPong = echo.ask(TestPing { value: 1 }).await.unwrap(); + let _: TestPong = echo.ask(TestPing { value: 2 }).await.unwrap(); + let _: TestPong = echo.ask(TestPing { value: 3 }).await.unwrap(); + + assert_eq!(counter.load(Ordering::SeqCst), 3); + system.shutdown().await.unwrap(); + } + + #[tokio::test] + async fn test_accumulator_actor() { + let system = create_test_system().await; + let acc = spawn_accumulator_actor(&system, "test/acc").await; + + let r1: TestTotalResponse = acc.ask(TestAccumulate { amount: 10 }).await.unwrap(); + assert_eq!(r1.total, 10); + + let r2: TestTotalResponse = acc.ask(TestAccumulate { amount: 5 }).await.unwrap(); + assert_eq!(r2.total, 15); + + let r3: TestTotalResponse = acc.ask(TestGetTotal).await.unwrap(); + assert_eq!(r3.total, 15); + + system.shutdown().await.unwrap(); + } + + // Test the actor_test! 
macro + actor_test!(test_macro_basic, system, { + let echo = spawn_echo_actor(&system, "test/echo").await; + let response: TestPong = echo.ask(TestPing { value: 10 }).await.unwrap(); + assert_eq!(response.result, 20); + }); + + // Test the actor_test_result! macro + actor_test_result!(test_macro_result, system, { + let echo = spawn_echo_actor(&system, "test/echo").await; + let response: TestPong = echo.ask(TestPing { value: 5 }).await?; + assert_eq!(response.result, 10); + Ok(()) + }); + + // Test macros with multiple actors + actor_test!(test_macro_multiple_actors, system, { + let echo1 = spawn_echo_actor(&system, "test/echo1").await; + let echo2 = spawn_echo_actor(&system, "test/echo2").await; + + let r1: TestPong = echo1.ask(TestPing { value: 1 }).await.unwrap(); + let r2: TestPong = echo2.ask(TestPing { value: 2 }).await.unwrap(); + + assert_eq!(r1.result, 2); + assert_eq!(r2.result, 4); + }); + + // Test the spawn_test_actors! macro + #[tokio::test] + async fn test_spawn_test_actors_macro() { + let system = create_test_system().await; + + let refs = spawn_test_actors!(system, { + "test/echo1" => TestEchoActor::new(), + "test/echo2" => TestEchoActor::new(), + }); + + assert_eq!(refs.len(), 2); + + let r1: TestPong = refs[0].ask(TestPing { value: 1 }).await.unwrap(); + let r2: TestPong = refs[1].ask(TestPing { value: 2 }).await.unwrap(); + + assert_eq!(r1.result, 2); + assert_eq!(r2.result, 4); + + system.shutdown().await.unwrap(); + } +} diff --git a/crates/pulsing-actor/src/transport/http2/client.rs b/crates/pulsing-actor/src/transport/http2/client.rs index c189b8282..058fd0523 100644 --- a/crates/pulsing-actor/src/transport/http2/client.rs +++ b/crates/pulsing-actor/src/transport/http2/client.rs @@ -1,11 +1,4 @@ -//! HTTP/2 Client implementation -//! -//! Supports h2c (HTTP/2 over cleartext) with: -//! - Advanced connection pooling -//! - Retry strategies with exponential backoff -//! - Timeout management -//! - Streaming support -//! 
- Distributed tracing (W3C Trace Context) +//! HTTP/2 client implementation. use super::config::Http2Config; use super::pool::{ConnectionPool, PoolConfig}; @@ -26,20 +19,15 @@ use std::time::Duration; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; -/// HTTP/2 Client with connection pooling, retry, and timeout support +/// HTTP/2 client with connection pooling, retry, and timeout support. pub struct Http2Client { - /// Connection pool pool: Arc, - /// HTTP/2 configuration config: Http2Config, - /// Retry configuration retry_config: RetryConfig, - /// Global cancellation token cancel: CancellationToken, } impl Http2Client { - /// Create a new HTTP/2 client with default configuration pub fn new(config: Http2Config) -> Self { Self { pool: Arc::new(ConnectionPool::new(config.clone())), @@ -49,7 +37,6 @@ impl Http2Client { } } - /// Create a new HTTP/2 client with custom configurations pub fn with_configs( http2_config: Http2Config, pool_config: PoolConfig, @@ -66,33 +53,27 @@ impl Http2Client { } } - /// Create client with retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; self } - /// Get the connection pool (for diagnostics) pub fn pool(&self) -> &Arc { &self.pool } - /// Get pool statistics pub fn stats(&self) -> &Arc { self.pool.stats() } - /// Start background maintenance tasks pub fn start_background_tasks(&self) { self.pool.start_cleanup_task(self.cancel.clone()); } - /// Shutdown the client pub fn shutdown(&self) { self.cancel.cancel(); } - /// Send an ask (request-response) message with retry pub async fn ask( &self, addr: SocketAddr, @@ -102,8 +83,6 @@ impl Http2Client { ) -> anyhow::Result> { let executor = RetryExecutor::new(self.retry_config.clone()); - // ask is idempotent if the message handler is idempotent - // We treat read operations as idempotent by default executor .execute(true, || { self.ask_once(addr, path, msg_type, payload.clone()) @@ -111,7 +90,6 @@ impl 
Http2Client { .await } - /// Send an ask without retry async fn ask_once( &self, addr: SocketAddr, @@ -125,7 +103,6 @@ impl Http2Client { let status = response.status(); - // Read response body with timeout let body = tokio::time::timeout(self.config.request_timeout, response.collect()) .await .map_err(|_| anyhow::anyhow!("Response body read timeout"))? @@ -144,7 +121,6 @@ impl Http2Client { Ok(body.to_vec()) } - /// Send a tell (fire-and-forget) message with retry pub async fn tell( &self, addr: SocketAddr, @@ -154,7 +130,6 @@ impl Http2Client { ) -> anyhow::Result<()> { let executor = RetryExecutor::new(self.retry_config.clone()); - // tell is NOT idempotent by default (could have side effects) executor .execute(false, || { self.tell_once(addr, path, msg_type, payload.clone()) @@ -162,7 +137,6 @@ impl Http2Client { .await } - /// Send a tell without retry async fn tell_once( &self, addr: SocketAddr, @@ -190,9 +164,7 @@ impl Http2Client { Ok(()) } - /// Send a stream request and receive streaming response as StreamFrame - /// - /// Note: Streaming requests are NOT retried (they are not idempotent) + /// Send a stream request and receive streaming response as StreamFrame. pub async fn ask_stream( &self, addr: SocketAddr, diff --git a/crates/pulsing-actor/src/transport/http2/config.rs b/crates/pulsing-actor/src/transport/http2/config.rs index a748c3bea..698411c53 100644 --- a/crates/pulsing-actor/src/transport/http2/config.rs +++ b/crates/pulsing-actor/src/transport/http2/config.rs @@ -1,75 +1,29 @@ -//! HTTP/2 Transport Configuration -//! -//! Provides comprehensive configuration options for HTTP/2 transport including: -//! - Server settings (concurrent streams, window sizes, etc.) -//! - Client settings (timeouts, connection pooling) -//! - Retry policies -//! - TLS settings (when `tls` feature is enabled) +//! HTTP/2 transport configuration. 
use std::time::Duration; #[cfg(feature = "tls")] use super::tls::TlsConfig; -/// HTTP/2 transport configuration +/// HTTP/2 transport configuration. #[derive(Debug, Clone)] pub struct Http2Config { - // ========== Server Configuration ========== - /// Maximum number of concurrent streams per connection (default: 100) pub max_concurrent_streams: u32, - - /// Initial window size for flow control (default: 65535 bytes) pub initial_window_size: u32, - - /// Connection-level window size (default: 1MB) pub initial_connection_window_size: u32, - - /// Maximum frame size (default: 16KB) pub max_frame_size: u32, - - /// Maximum header list size (default: 16KB) pub max_header_list_size: u32, - - // ========== Client Configuration ========== - /// Connection timeout (default: 5s) pub connect_timeout: Duration, - - /// Request timeout for non-streaming requests (default: 30s) pub request_timeout: Duration, - - /// Timeout for streaming requests (default: 5min) pub stream_timeout: Duration, - - /// Maximum connections per host (default: 10) pub max_connections_per_host: usize, - - // ========== Common Configuration ========== - /// Keep-alive ping interval (default: 30s, None to disable) pub keepalive_interval: Option, - - /// Keep-alive timeout (default: 10s) pub keepalive_timeout: Duration, - - /// Enable HTTP/2 prior knowledge mode (default: true) - /// When true, client sends HTTP/2 preface directly without upgrade pub http2_prior_knowledge: bool, - - // ========== Retry Configuration ========== - /// Maximum number of retry attempts (default: 3) pub max_retries: u32, - - /// Initial retry delay (default: 100ms) pub retry_initial_delay: Duration, - - /// Maximum retry delay (default: 10s) pub retry_max_delay: Duration, - - /// Whether to use jitter in retry delays (default: true) pub retry_use_jitter: bool, - - // ========== TLS Configuration (requires `tls` feature) ========== - /// TLS configuration for encrypted transport (default: None) - /// When set, all connections 
will use TLS with mutual authentication #[cfg(feature = "tls")] pub tls: Option, } @@ -77,31 +31,23 @@ pub struct Http2Config { impl Default for Http2Config { fn default() -> Self { Self { - // Server defaults max_concurrent_streams: 100, initial_window_size: 65535, - initial_connection_window_size: 1024 * 1024, // 1MB - max_frame_size: 16 * 1024, // 16KB - max_header_list_size: 16 * 1024, // 16KB + initial_connection_window_size: 1024 * 1024, + max_frame_size: 16 * 1024, + max_header_list_size: 16 * 1024, - // Client defaults connect_timeout: Duration::from_secs(5), request_timeout: Duration::from_secs(30), - stream_timeout: Duration::from_secs(300), // 5 minutes + stream_timeout: Duration::from_secs(300), max_connections_per_host: 10, - - // Common defaults keepalive_interval: Some(Duration::from_secs(30)), keepalive_timeout: Duration::from_secs(10), http2_prior_knowledge: true, - - // Retry defaults max_retries: 3, retry_initial_delay: Duration::from_millis(100), retry_max_delay: Duration::from_secs(10), retry_use_jitter: true, - - // TLS defaults #[cfg(feature = "tls")] tls: None, } @@ -109,12 +55,10 @@ impl Default for Http2Config { } impl Http2Config { - /// Create a new configuration with default values pub fn new() -> Self { Self::default() } - /// Create a configuration optimized for low-latency workloads pub fn low_latency() -> Self { Self { connect_timeout: Duration::from_secs(2), @@ -127,74 +71,62 @@ impl Http2Config { } } - /// Create a configuration optimized for high-throughput workloads pub fn high_throughput() -> Self { Self { max_concurrent_streams: 200, - initial_window_size: 256 * 1024, // 256KB - initial_connection_window_size: 4 * 1024 * 1024, // 4MB - max_frame_size: 64 * 1024, // 64KB + initial_window_size: 256 * 1024, + initial_connection_window_size: 4 * 1024 * 1024, + max_frame_size: 64 * 1024, max_connections_per_host: 20, ..Default::default() } } - /// Create a configuration optimized for streaming workloads (e.g., LLM inference) pub fn 
streaming() -> Self { Self { - stream_timeout: Duration::from_secs(600), // 10 minutes + stream_timeout: Duration::from_secs(600), max_concurrent_streams: 50, - initial_window_size: 128 * 1024, // 128KB + initial_window_size: 128 * 1024, keepalive_interval: Some(Duration::from_secs(60)), ..Default::default() } } - // ========== Builder Methods ========== - - /// Set maximum concurrent streams pub fn max_concurrent_streams(mut self, n: u32) -> Self { self.max_concurrent_streams = n; self } - /// Set initial window size pub fn initial_window_size(mut self, size: u32) -> Self { self.initial_window_size = size; self } - /// Set connection-level window size pub fn initial_connection_window_size(mut self, size: u32) -> Self { self.initial_connection_window_size = size; self } - /// Set maximum frame size pub fn max_frame_size(mut self, size: u32) -> Self { self.max_frame_size = size; self } - /// Set connection timeout pub fn connect_timeout(mut self, timeout: Duration) -> Self { self.connect_timeout = timeout; self } - /// Set request timeout pub fn request_timeout(mut self, timeout: Duration) -> Self { self.request_timeout = timeout; self } - /// Set stream timeout pub fn stream_timeout(mut self, timeout: Duration) -> Self { self.stream_timeout = timeout; self } - /// Set maximum connections per host pub fn max_connections_per_host(mut self, n: usize) -> Self { self.max_connections_per_host = n; self diff --git a/crates/pulsing-actor/src/transport/http2/mod.rs b/crates/pulsing-actor/src/transport/http2/mod.rs index e15ea953a..c3cb78b7c 100644 --- a/crates/pulsing-actor/src/transport/http2/mod.rs +++ b/crates/pulsing-actor/src/transport/http2/mod.rs @@ -1,61 +1,4 @@ -//! HTTP/2 Transport Layer -//! -//! Provides HTTP/2 (h2c - cleartext) transport for actor communication with -//! bidirectional streaming support using a high-performance binary protocol. -//! -//! ## Features -//! -//! - HTTP/2 over cleartext (h2c) - no TLS required -//! 
- **Bidirectional streaming** - both requests and responses can be streams -//! - **Binary frame protocol** - ~56% smaller than JSON, zero-copy friendly -//! - Connection multiplexing with advanced pooling -//! - Retry strategies with exponential backoff -//! - Timeout management at multiple levels -//! - Built-in flow control (backpressure) -//! -//! ## Protocol -//! -//! ### Message Modes -//! -//! - `ask`: Request-response pattern (single or stream) -//! - `tell`: Fire-and-forget pattern (single only) -//! - `stream`: Explicit streaming response request -//! -//! ### Request Types -//! -//! - `single`: Regular request body -//! - `stream`: Length-prefixed binary frames -//! -//! ### Headers -//! -//! - `x-message-mode`: ask | tell | stream -//! - `x-message-type`: Message type identifier -//! - `x-request-type`: single | stream -//! - `x-response-type`: single | stream -//! - `x-request-id`: Optional request ID for tracing -//! -//! ## Example -//! -//! ```rust,ignore -//! use pulsing_actor::transport::http2::{Http2Client, Http2ClientBuilder, Http2Config}; -//! use std::time::Duration; -//! -//! // Create client with custom configuration -//! let client = Http2ClientBuilder::new() -//! .max_retries(3) -//! .connect_timeout(Duration::from_secs(5)) -//! .request_timeout(Duration::from_secs(30)) -//! .build(); -//! -//! // Send request -//! let response = client.ask(addr, "/actors/my_actor", "Ping", payload).await?; -//! -//! // Streaming request -//! let stream = client.ask_stream(addr, "/actors/my_actor", "StreamingRequest", payload).await?; -//! while let Some(frame) = stream.next().await { -//! // Process streaming frames -//! } -//! ``` +//! HTTP/2 transport layer. mod client; mod config; @@ -83,10 +26,7 @@ use std::net::SocketAddr; use std::sync::Arc; use tokio_util::sync::CancellationToken; -/// High-level HTTP/2 Transport -/// -/// Combines Http2Server and Http2Client into a single component -/// used by ActorSystem and GossipCluster. 
+/// High-level HTTP/2 transport. pub struct Http2Transport { local_addr: SocketAddr, client: Arc, @@ -95,18 +35,15 @@ pub struct Http2Transport { } impl Http2Transport { - /// Create a new HTTP/2 transport and start the server pub async fn new( bind_addr: SocketAddr, handler: Arc, config: Http2Config, cancel: CancellationToken, ) -> anyhow::Result<(Arc, SocketAddr)> { - // Build HTTP/2 client let client = Arc::new(Http2Client::new(config.clone())); client.start_background_tasks(); - // Start HTTP/2 server let server = Http2Server::new(bind_addr, handler, config.clone(), cancel.clone()).await?; let local_addr = server.local_addr(); @@ -120,12 +57,10 @@ impl Http2Transport { Ok((transport, local_addr)) } - /// Create a client-only transport (no server) pub fn new_client(config: Http2Config) -> Arc { let client = Arc::new(Http2Client::new(config.clone())); client.start_background_tasks(); - // Default address for client-only mode (no server binding) const CLIENT_ONLY_ADDR: SocketAddr = SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::new(0, 0, 0, 0)), 0); @@ -137,16 +72,11 @@ impl Http2Transport { }) } - /// Get local address pub fn local_addr(&self) -> SocketAddr { self.local_addr } - /// Send a request to an actor and wait for response - /// - /// Supports both single and streaming requests: - /// - `Message::Single`: Sent as regular request body - /// - `Message::Stream`: Sent as binary frames + /// Send a request to an actor and wait for response. pub async fn ask( &self, addr: SocketAddr, @@ -157,11 +87,7 @@ impl Http2Transport { self.client.send_message_full(addr, &path, msg).await } - /// Send a request to a named actor and wait for response - /// - /// Supports both single and streaming requests: - /// - `Message::Single`: Sent as regular request body - /// - `Message::Stream`: Sent as binary frames + /// Send a request to a named actor and wait for response. 
pub async fn ask_named( &self, addr: SocketAddr, @@ -172,7 +98,6 @@ impl Http2Transport { self.client.send_message_full(addr, &url_path, msg).await } - /// Send a fire-and-forget message pub async fn tell( &self, addr: SocketAddr, @@ -187,7 +112,6 @@ impl Http2Transport { self.client.tell(addr, &path, &msg_type, data).await } - /// Send a fire-and-forget message to a named actor pub async fn tell_named( &self, addr: SocketAddr, diff --git a/crates/pulsing-actor/src/transport/http2/pool.rs b/crates/pulsing-actor/src/transport/http2/pool.rs index be53cab58..50aa509f4 100644 --- a/crates/pulsing-actor/src/transport/http2/pool.rs +++ b/crates/pulsing-actor/src/transport/http2/pool.rs @@ -1,11 +1,4 @@ -//! Advanced connection pool management for HTTP/2 transport -//! -//! Features: -//! - Connection health checking -//! - Connection expiration/eviction -//! - Connection reuse optimization -//! - Pool statistics -//! - TLS support (when `tls` feature is enabled) +//! Connection pool management for HTTP/2 transport. use super::config::Http2Config; use bytes::Bytes; @@ -20,25 +13,18 @@ use std::time::{Duration, Instant}; use tokio::net::TcpStream; use tokio::sync::{Mutex, RwLock, Semaphore}; -/// Connection pool statistics +/// Connection pool statistics. #[derive(Debug, Default)] pub struct PoolStats { - /// Total connections created pub connections_created: AtomicU64, - /// Total connections closed pub connections_closed: AtomicU64, - /// Total connections reused pub connections_reused: AtomicU64, - /// Total connection errors pub connection_errors: AtomicU64, - /// Current active connections pub active_connections: AtomicUsize, - /// Current idle connections pub idle_connections: AtomicUsize, } impl PoolStats { - /// Get stats as JSON pub fn to_json(&self) -> serde_json::Value { serde_json::json!({ "connections_created": self.connections_created.load(Ordering::Relaxed), @@ -51,30 +37,21 @@ impl PoolStats { } } -/// Connection state +/// Connection state. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ConnectionState { - /// Connection is available Idle, - /// Connection is in use Active, - /// Connection is unhealthy Unhealthy, - /// Connection has expired Expired, } -/// A pooled HTTP/2 connection +/// A pooled HTTP/2 connection. pub struct PooledConnection { - /// The HTTP/2 sender pub sender: http2::SendRequest>, - /// When the connection was created pub created_at: Instant, - /// When the connection was last used pub last_used: Instant, - /// Number of requests made on this connection pub request_count: u64, - /// Current state pub state: ConnectionState, } @@ -90,28 +67,23 @@ impl PooledConnection { } } - /// Check if the connection is healthy pub fn is_healthy(&self, config: &PoolConfig) -> bool { - // Check if connection is ready if !self.sender.is_ready() { return false; } - // Check max age if let Some(max_age) = config.max_connection_age { if self.created_at.elapsed() > max_age { return false; } } - // Check max idle time if let Some(max_idle) = config.max_idle_time { if self.last_used.elapsed() > max_idle { return false; } } - // Check max requests per connection if let Some(max_requests) = config.max_requests_per_connection { if self.request_count >= max_requests { return false; @@ -121,47 +93,36 @@ impl PooledConnection { true } - /// Mark the connection as used pub fn mark_used(&mut self) { self.last_used = Instant::now(); self.request_count += 1; self.state = ConnectionState::Active; } - /// Mark the connection as idle pub fn mark_idle(&mut self) { self.state = ConnectionState::Idle; } } -/// Pool configuration +/// Pool configuration. 
#[derive(Debug, Clone)] pub struct PoolConfig { - /// Maximum connections per host pub max_connections_per_host: usize, - /// Minimum idle connections per host pub min_idle_per_host: usize, - /// Maximum total connections pub max_total_connections: usize, - /// Connection timeout pub connect_timeout: Duration, - /// Maximum connection age (None = no limit) pub max_connection_age: Option, - /// Maximum idle time (None = no limit) pub max_idle_time: Option, - /// Maximum requests per connection (None = no limit) pub max_requests_per_connection: Option, - /// How often to run cleanup pub cleanup_interval: Duration, - /// Whether to enable connection warming pub enable_warming: bool, } @@ -172,8 +133,8 @@ impl Default for PoolConfig { min_idle_per_host: 1, max_total_connections: 100, connect_timeout: Duration::from_secs(5), - max_connection_age: Some(Duration::from_secs(300)), // 5 minutes - max_idle_time: Some(Duration::from_secs(60)), // 1 minute + max_connection_age: Some(Duration::from_secs(300)), + max_idle_time: Some(Duration::from_secs(60)), max_requests_per_connection: Some(1000), cleanup_interval: Duration::from_secs(30), enable_warming: false, diff --git a/crates/pulsing-actor/src/transport/http2/retry.rs b/crates/pulsing-actor/src/transport/http2/retry.rs index 435a933b3..19b8bf20b 100644 --- a/crates/pulsing-actor/src/transport/http2/retry.rs +++ b/crates/pulsing-actor/src/transport/http2/retry.rs @@ -1,26 +1,15 @@ -//! Retry and timeout strategies for HTTP/2 transport +//! Retry and timeout strategies for HTTP/2 transport. use std::time::Duration; -/// Retry configuration +/// Retry configuration. 
#[derive(Debug, Clone)] pub struct RetryConfig { - /// Maximum number of retry attempts (0 = no retry) pub max_retries: u32, - - /// Initial retry delay pub initial_delay: Duration, - - /// Maximum retry delay (for exponential backoff) pub max_delay: Duration, - - /// Exponential backoff multiplier (e.g., 2.0 doubles each time) pub backoff_multiplier: f64, - - /// Whether to add random jitter to delays pub use_jitter: bool, - - /// Only retry idempotent operations pub idempotent_only: bool, } @@ -38,7 +27,6 @@ impl Default for RetryConfig { } impl RetryConfig { - /// Create a new retry config with no retries pub fn no_retry() -> Self { Self { max_retries: 0, @@ -46,7 +34,6 @@ impl RetryConfig { } } - /// Create a retry config with specified max retries pub fn with_max_retries(max_retries: u32) -> Self { Self { max_retries, @@ -54,43 +41,36 @@ impl RetryConfig { } } - /// Set the maximum number of retries pub fn max_retries(mut self, n: u32) -> Self { self.max_retries = n; self } - /// Set the initial retry delay pub fn initial_delay(mut self, delay: Duration) -> Self { self.initial_delay = delay; self } - /// Set the maximum retry delay pub fn max_delay(mut self, delay: Duration) -> Self { self.max_delay = delay; self } - /// Set the backoff multiplier pub fn backoff_multiplier(mut self, multiplier: f64) -> Self { self.backoff_multiplier = multiplier; self } - /// Enable or disable jitter pub fn use_jitter(mut self, enable: bool) -> Self { self.use_jitter = enable; self } - /// Allow retrying non-idempotent operations pub fn allow_non_idempotent(mut self) -> Self { self.idempotent_only = false; self } - /// Calculate delay for the given attempt number (0-indexed) pub fn delay_for_attempt(&self, attempt: u32) -> Duration { if attempt == 0 { return Duration::ZERO; @@ -104,7 +84,6 @@ impl RetryConfig { let capped_delay = base_delay.min(self.max_delay.as_millis() as f64); let final_delay = if self.use_jitter { - // Add jitter: 50% to 150% of the delay let jitter = 
rand_jitter(); capped_delay * (0.5 + jitter) } else { @@ -115,10 +94,8 @@ impl RetryConfig { } } -/// Generate random jitter between 0.0 and 1.0 fn rand_jitter() -> f64 { use std::time::SystemTime; - // Simple pseudo-random based on time (no external crate needed) let nanos = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap_or_default() @@ -126,32 +103,22 @@ fn rand_jitter() -> f64 { (nanos % 1000) as f64 / 1000.0 } -/// Error classification for retry decisions +/// Error classification for retry decisions. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RetryableError { - /// Connection error - always retryable Connection, - /// Timeout - retryable for idempotent operations Timeout, - /// Server overloaded (503) - retryable ServerOverloaded, - /// Server error (5xx except 503) - might be retryable ServerError, - /// Client error (4xx) - not retryable ClientError, - /// Unknown error - not retryable by default Unknown, } impl RetryableError { - /// Classify an error from its message pub fn classify(error: &anyhow::Error) -> Self { let msg = error.to_string().to_lowercase(); if msg.contains("backing off") { - // If the pool is backing off, immediate retry is futile. - // We should either not retry, or wait longer. - // For now, let's treat it as non-retryable to stop the log spam. return Self::Unknown; } @@ -196,7 +163,6 @@ impl RetryableError { Self::Unknown } - /// Check if this error type is retryable pub fn is_retryable(&self, idempotent_only: bool, is_idempotent: bool) -> bool { match self { // Connection errors are always retryable diff --git a/crates/pulsing-actor/src/transport/http2/server.rs b/crates/pulsing-actor/src/transport/http2/server.rs index e4ef18970..7f853645c 100644 --- a/crates/pulsing-actor/src/transport/http2/server.rs +++ b/crates/pulsing-actor/src/transport/http2/server.rs @@ -1,7 +1,4 @@ -//! HTTP/2 Server implementation -//! -//! Supports h2c (HTTP/2 over cleartext) with optional HTTP/1.1 fallback. -//! 
When `tls` feature is enabled, supports TLS with passphrase-derived certificates. +//! HTTP/2 server implementation. use super::config::Http2Config; use super::stream::{BinaryFrameParser, StreamFrame}; @@ -24,18 +21,11 @@ use tokio::net::TcpListener; use tokio::sync::mpsc; use tokio_util::sync::CancellationToken; -/// Handler trait for HTTP/2 server +/// Handler trait for HTTP/2 server. #[async_trait::async_trait] pub trait Http2ServerHandler: Send + Sync + 'static { - /// Unified message handler - accepts and returns Message (Single or Stream) - /// - /// This is the primary method that should be implemented. It handles both: - /// - Single requests: `Message::Single` with payload - /// - Streaming requests: `Message::Stream` with async stream of chunks - /// - /// The default implementation delegates to `handle_message_simple` for backward compatibility. + /// Unified message handler. async fn handle_message_full(&self, path: &str, msg: Message) -> anyhow::Result { - // Default: extract single message and delegate to simple handler match msg { Message::Single { msg_type, data } => { self.handle_message_simple(path, &msg_type, data).await @@ -46,9 +36,7 @@ pub trait Http2ServerHandler: Send + Sync + 'static { } } - /// Simple message handler for backward compatibility - /// - /// Implement this if you only need to handle single (non-streaming) requests. + /// Simple message handler. async fn handle_message_simple( &self, path: &str, @@ -59,49 +47,42 @@ pub trait Http2ServerHandler: Send + Sync + 'static { Err(anyhow::anyhow!("Not implemented")) } - /// Handle tell (fire-and-forget) message + /// Handle tell (fire-and-forget) message. async fn handle_tell(&self, path: &str, msg_type: &str, payload: Vec) -> anyhow::Result<()>; - /// Handle gossip message + /// Handle gossip message. async fn handle_gossip( &self, payload: Vec, peer_addr: SocketAddr, ) -> anyhow::Result>>; - /// Get health status + /// Get health status. 
async fn health_check(&self) -> serde_json::Value { serde_json::json!({"status": "ok"}) } - /// Get Prometheus metrics (text format) - /// - /// Default implementation returns empty metrics. - /// Override this to provide system metrics. + /// Get Prometheus metrics (text format). async fn prometheus_metrics(&self) -> String { String::new() } - /// Get cluster members list (for CLI tools) - /// - /// Returns JSON array of member information. + /// Get cluster members list. async fn cluster_members(&self) -> serde_json::Value { serde_json::json!([]) } - /// Get actors list on this node (for CLI tools) - /// - /// Returns JSON array of actor information. + /// Get actors list on this node. async fn actors_list(&self, include_internal: bool) -> serde_json::Value { let _ = include_internal; serde_json::json!([]) } - /// Get as Any for downcasting + /// Get as Any for downcasting. fn as_any(&self) -> &dyn std::any::Any; - /// Handle head node API requests (optional, returns None if not supported) + /// Handle head node API requests. async fn handle_head_api( &self, _path: &str, @@ -112,14 +93,13 @@ pub trait Http2ServerHandler: Send + Sync + 'static { } } -/// HTTP/2 Server +/// HTTP/2 Server. 
pub struct Http2Server { local_addr: SocketAddr, cancel: CancellationToken, } impl Http2Server { - /// Create and start a new HTTP/2 server pub async fn new( bind_addr: SocketAddr, handler: Arc, @@ -131,7 +111,6 @@ impl Http2Server { tracing::info!(addr = %local_addr, "Starting HTTP/2 server"); - // Spawn the server task let server_cancel = cancel.clone(); tokio::spawn(async move { Self::run_server(listener, handler, config, server_cancel).await; @@ -140,17 +119,14 @@ impl Http2Server { Ok(Self { local_addr, cancel }) } - /// Get the local address the server is bound to pub fn local_addr(&self) -> SocketAddr { self.local_addr } - /// Shutdown the server pub fn shutdown(&self) { self.cancel.cancel(); } - /// Run the server loop async fn run_server( listener: TcpListener, handler: Arc, diff --git a/crates/pulsing-actor/src/transport/http2/stream.rs b/crates/pulsing-actor/src/transport/http2/stream.rs index 41944585c..ebd6ed328 100644 --- a/crates/pulsing-actor/src/transport/http2/stream.rs +++ b/crates/pulsing-actor/src/transport/http2/stream.rs @@ -1,27 +1,4 @@ -//! Streaming support for HTTP/2 transport -//! -//! Uses a high-performance binary frame format for streaming requests and responses. -//! -//! ## Binary Frame Format -//! -//! ```text -//! +--------+--------+----------+----------+---------+-----------+ -//! | length | flags | msg_type | data_len | data | [error] | -//! | 4B BE | 1byte | 2B+UTF8 | 4B BE | N bytes | [2B+UTF8] | -//! +--------+--------+----------+----------+---------+-----------+ -//! ``` -//! -//! - **length**: Total frame length (excluding this field), big-endian u32 -//! - **flags**: bit 0 = END, bit 1 = ERROR -//! - **msg_type**: 2-byte length prefix + UTF-8 string -//! - **data_len**: 4-byte length prefix + raw binary payload -//! - **error**: Optional error message (only present if ERROR flag is set) -//! -//! ## Features -//! -//! - **Compact**: Raw binary, no encoding overhead -//! 
- **Fast parsing**: O(1) frame boundary detection via length prefix -//! - **Zero-copy friendly**: Direct binary data passthrough +//! Streaming support for HTTP/2 transport. use crate::actor::Message; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -30,30 +7,23 @@ use std::pin::Pin; use std::task::{Context, Poll}; use tokio_util::sync::CancellationToken; -/// Frame flags +/// Frame flags. pub const FLAG_END: u8 = 0x01; pub const FLAG_ERROR: u8 = 0x02; -/// A frame in a streaming message -/// -/// Serialized using a length-prefixed binary format for efficient transmission. +/// A frame in a streaming message. #[derive(Debug, Clone)] pub struct StreamFrame { - /// Message type identifier pub msg_type: String, - /// Raw binary payload pub data: Vec, - /// Whether this is the final frame pub end: bool, - /// Error message (if any) pub error: Option, } impl StreamFrame { - /// Create a new data frame pub fn data(msg_type: impl Into, payload: &[u8]) -> Self { Self { msg_type: msg_type.into(), @@ -63,7 +33,6 @@ impl StreamFrame { } } - /// Create an end frame (no data) pub fn end() -> Self { Self { msg_type: String::new(), @@ -73,7 +42,6 @@ impl StreamFrame { } } - /// Create an error frame pub fn error(error: impl Into) -> Self { Self { msg_type: String::new(), @@ -83,19 +51,16 @@ impl StreamFrame { } } - /// Get the data payload #[inline] pub fn get_data(&self) -> &[u8] { &self.data } - /// Check if this frame contains an error #[inline] pub fn is_error(&self) -> bool { self.error.is_some() } - /// Create a StreamFrame from a Message::Single pub fn from_message(msg: &Message, default_msg_type: &str) -> Self { match msg { Message::Single { msg_type, data } => { @@ -110,7 +75,6 @@ impl StreamFrame { } } - /// Convert this frame to a Message::Single pub fn to_message(&self) -> anyhow::Result> { if let Some(ref error) = self.error { return Err(anyhow::anyhow!("{}", error)); @@ -121,14 +85,11 @@ impl StreamFrame { Ok(Some(Message::single(&self.msg_type, self.data.clone()))) } 
- /// Serialize to binary format - /// - /// Format: `[4B len][1B flags][2B msg_type_len][msg_type][4B data_len][data][opt: 2B err_len][err]` + /// Serialize to binary format. pub fn to_binary(&self) -> Bytes { let msg_type_bytes = self.msg_type.as_bytes(); let error_bytes = self.error.as_ref().map(|e| e.as_bytes()); - // Calculate total length (excluding the length field itself) let mut content_len = 1 + 2 + msg_type_bytes.len() + 4 + self.data.len(); if let Some(err) = &error_bytes { content_len += 2 + err.len(); @@ -136,10 +97,8 @@ impl StreamFrame { let mut buf = BytesMut::with_capacity(4 + content_len); - // Length prefix buf.put_u32(content_len as u32); - // Flags let mut flags = 0u8; if self.end { flags |= FLAG_END; @@ -149,15 +108,12 @@ impl StreamFrame { } buf.put_u8(flags); - // Message type buf.put_u16(msg_type_bytes.len() as u16); buf.put_slice(msg_type_bytes); - // Data buf.put_u32(self.data.len() as u32); buf.put_slice(&self.data); - // Error message (if any) if let Some(err) = error_bytes { buf.put_u16(err.len() as u16); buf.put_slice(err); @@ -166,7 +122,6 @@ impl StreamFrame { buf.freeze() } - /// Parse from binary format pub fn from_binary(mut buf: &[u8]) -> anyhow::Result { if buf.remaining() < 4 { return Err(anyhow::anyhow!("Buffer too short for length")); @@ -180,12 +135,10 @@ impl StreamFrame { )); } - // Flags let flags = buf.get_u8(); let end = (flags & FLAG_END) != 0; let has_error = (flags & FLAG_ERROR) != 0; - // Message type let msg_type_len = buf.get_u16() as usize; if buf.remaining() < msg_type_len { return Err(anyhow::anyhow!("Invalid msg_type length")); @@ -194,7 +147,6 @@ impl StreamFrame { .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in msg_type: {}", e))?; buf.advance(msg_type_len); - // Data if buf.remaining() < 4 { return Err(anyhow::anyhow!("Missing data length")); } diff --git a/crates/pulsing-actor/src/transport/http2/tls.rs b/crates/pulsing-actor/src/transport/http2/tls.rs index ae75c9c30..2fdc14d63 100644 --- 
a/crates/pulsing-actor/src/transport/http2/tls.rs +++ b/crates/pulsing-actor/src/transport/http2/tls.rs @@ -1,22 +1,4 @@ -//! TLS support for HTTP/2 transport with passphrase-derived certificates -//! -//! This module provides TLS encryption using certificates derived from a shared passphrase. -//! All nodes using the same passphrase will generate identical CA certificates, enabling -//! automatic mutual TLS authentication. -//! -//! ## Security Model -//! -//! 1. A passphrase is used to deterministically derive a CA certificate -//! 2. Each node generates its own certificate signed by the shared CA -//! 3. Nodes only trust certificates signed by the same CA (same passphrase) -//! -//! ## Usage -//! -//! ```rust,ignore -//! use pulsing_actor::transport::http2::tls::TlsConfig; -//! -//! let tls = TlsConfig::from_passphrase("my-cluster-secret")?; -//! ``` +//! TLS support for HTTP/2 transport (passphrase-derived certificates). use rcgen::{ BasicConstraints, Certificate, CertificateParams, DnType, ExtendedKeyUsagePurpose, IsCa, @@ -31,10 +13,8 @@ use rustls::server::WebPkiClientVerifier; use rustls::{ClientConfig, RootCertStore, ServerConfig}; use std::sync::OnceLock; -/// Global flag to ensure crypto provider is installed only once static CRYPTO_PROVIDER_INSTALLED: OnceLock<()> = OnceLock::new(); -/// Install the ring crypto provider for rustls fn ensure_crypto_provider() { CRYPTO_PROVIDER_INSTALLED.get_or_init(|| { let _ = default_provider().install_default(); @@ -43,56 +23,39 @@ fn ensure_crypto_provider() { use std::sync::Arc; use tokio_rustls::{TlsAcceptor, TlsConnector}; -/// Salt used for HKDF key derivation const HKDF_SALT: &[u8] = b"pulsing-ca-v1"; -/// CA certificate common name const CA_COMMON_NAME: &str = "Pulsing Cluster CA"; -/// Node certificate common name prefix const NODE_CN_PREFIX: &str = "Pulsing Node"; -/// Certificate validity period (10 years in seconds) const CERT_VALIDITY_SECS: i64 = 10 * 365 * 24 * 60 * 60; -/// TLS configuration for HTTP/2 
transport +/// TLS configuration for HTTP/2 transport. #[derive(Clone)] pub struct TlsConfig { - /// TLS acceptor for server-side connections pub acceptor: TlsAcceptor, - /// TLS connector for client-side connections pub connector: TlsConnector, - /// The passphrase hash for debugging passphrase_hash: String, } impl TlsConfig { - /// Create TLS configuration from a passphrase - /// - /// The passphrase is used to deterministically derive a CA certificate. - /// All nodes using the same passphrase will generate identical CA certificates, - /// enabling automatic mutual TLS authentication. + /// Create TLS configuration from a passphrase. pub fn from_passphrase(passphrase: &str) -> anyhow::Result { - // Ensure the ring crypto provider is installed ensure_crypto_provider(); - // Derive CA certificate and key from passphrase let (ca_cert, ca_key_pair) = derive_ca_from_passphrase(passphrase)?; - // Generate node certificate signed by CA let (node_cert, node_key_pair) = generate_node_cert(&ca_cert, &ca_key_pair)?; - // Convert to DER format let ca_cert_der = CertificateDer::from(ca_cert.der().to_vec()); let node_cert_der = CertificateDer::from(node_cert.der().to_vec()); let node_key_der = PrivateKeyDer::Pkcs8(PrivatePkcs8KeyDer::from(node_key_pair.serialize_der())); - // Build root cert store with our CA let mut root_store = RootCertStore::empty(); root_store.add(ca_cert_der.clone())?; - // Build server config with client certificate verification let client_verifier = WebPkiClientVerifier::builder(Arc::new(root_store.clone())) .build() .map_err(|e| anyhow::anyhow!("Failed to build client verifier: {}", e))?; @@ -102,13 +65,11 @@ impl TlsConfig { .with_single_cert(vec![node_cert_der.clone()], node_key_der.clone_key()) .map_err(|e| anyhow::anyhow!("Failed to build server config: {}", e))?; - // Build client config let client_config = ClientConfig::builder() .with_root_certificates(root_store) .with_client_auth_cert(vec![node_cert_der], node_key_der) .map_err(|e| 
anyhow::anyhow!("Failed to build client config: {}", e))?; - // Calculate passphrase hash for debugging let hash = digest(&SHA256, passphrase.as_bytes()); let passphrase_hash = hex_encode(&hash.as_ref()[..8]); @@ -119,22 +80,16 @@ impl TlsConfig { }) } - /// Get the passphrase hash (for debugging/logging purposes only) pub fn passphrase_hash(&self) -> &str { &self.passphrase_hash } - /// Connect to a remote server with TLS - /// - /// Note: server_name is ignored for mTLS connections within the cluster. - /// We use a fixed server name that matches the node certificate's CN pattern. + /// Connect to a remote server with TLS. pub async fn connect( &self, stream: tokio::net::TcpStream, _server_name: &str, ) -> anyhow::Result> { - // Use a fixed server name for internal cluster communication - // The actual authentication is done via mutual TLS (client cert verification) let server_name = ServerName::try_from("pulsing.internal".to_string()) .map_err(|e| anyhow::anyhow!("Invalid server name: {}", e))?; @@ -144,7 +99,6 @@ impl TlsConfig { .map_err(|e| anyhow::anyhow!("TLS connect failed: {}", e)) } - /// Accept a TLS connection pub async fn accept( &self, stream: tokio::net::TcpStream, @@ -164,18 +118,12 @@ impl std::fmt::Debug for TlsConfig { } } -/// Derive CA certificate and key pair from passphrase -/// -/// This function is deterministic - the same passphrase will always produce -/// the same CA certificate and key. +/// Derive CA certificate and key pair from passphrase. 
fn derive_ca_from_passphrase(passphrase: &str) -> anyhow::Result<(Certificate, KeyPair)> { - // Derive seed using HKDF let seed = derive_seed(passphrase, b"ca-key")?; - // Generate deterministic Ed25519 key pair from seed let key_pair = generate_deterministic_key_pair(&seed)?; - // Create CA certificate with fixed parameters let mut params = CertificateParams::new(vec![CA_COMMON_NAME.to_string()]) .map_err(|e| anyhow::anyhow!("Failed to create cert params: {}", e))?; @@ -192,8 +140,6 @@ fn derive_ca_from_passphrase(passphrase: &str) -> anyhow::Result<(Certificate, K KeyUsagePurpose::DigitalSignature, ]; - // Fixed validity period (use a fixed start time for determinism) - // We use Unix epoch + 1 year as the start time let not_before = time::OffsetDateTime::UNIX_EPOCH + time::Duration::days(365); let not_after = not_before + time::Duration::seconds(CERT_VALIDITY_SECS); params.not_before = not_before; diff --git a/docs/Makefile b/docs/Makefile index 9f44b4de2..ca6bc6bba 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -12,9 +12,13 @@ sync: uv sync --all-extras serve: + @echo "Installing Pulsing in development mode..." + cd .. && uv run maturin develop uv run mkdocs serve build: + @echo "Installing Pulsing in development mode..." + cd .. 
&& uv run maturin develop uv run mkdocs build clean: diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index f2bf670ff..5a6facef5 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -114,15 +114,24 @@ plugins: Load Sync: 莟蜜同步 AS Actor Decorator: AS Actor 装饰噚 API Reference: API 参考 + Overview: 抂述 + Python API: Python API + Overview: API 抂述 + Python API: Python API - mkdocstrings: handlers: python: + paths: ["../python"] options: - docstring_style: numpy + docstring_style: google docstring_options: ignore_init_summary: yes merge_init_into_class: yes show_submodules: no + show_category_heading: yes + show_labels: yes + show_symbol_type_heading: yes + show_symbol_type_toc: yes nav: - Home: index.md @@ -159,7 +168,10 @@ nav: - HTTP2 Transport: design/http2-transport.md - Load Sync: design/load_sync.md - AS Actor Decorator: design/as-actor-decorator.md - - API Reference: api_reference.md + - API Reference: + - Overview: api/overview.md + - Python: api/python.md + - Rust: api/rust.md extra: generator: false diff --git a/docs/src/api/overview.md b/docs/src/api/overview.md new file mode 100644 index 000000000..d202c863c --- /dev/null +++ b/docs/src/api/overview.md @@ -0,0 +1,302 @@ +# API Overview + +Pulsing is a distributed actor framework that provides a communication backbone for building distributed systems and applications. + +## Core Concepts + +### Actor System + +Pulsing is built around the [Actor Model](https://en.wikipedia.org/wiki/Actor_model), where actors are the fundamental units of computation. 
Actors communicate via asynchronous message passing, providing: + +- **Location Transparency**: Same API for local and remote actors +- **Fault Tolerance**: Actors can fail independently without affecting others +- **Concurrency**: Actors process messages one at a time, simplifying concurrent programming + +### Key Features + +- **Zero External Dependencies**: Pure Rust + Tokio implementation +- **Built-in Service Discovery**: SWIM/Gossip protocol for cluster management +- **Streaming Support**: Native support for streaming requests/responses +- **Multi-Language**: Python-first with Rust core, extensible to other languages + +## API Styles + +### Python APIs + +Pulsing provides multiple API styles to fit different use cases: + +#### 1. Actor System Style (Explicit Management) + +```python +import pulsing as pul + +# Create and manage actor system explicitly +system = await pul.actor_system(addr="0.0.0.0:8000") + +# Spawn actors +actor = await system.spawn(MyActor(), name="my_actor") + +# Communicate +response = await actor.ask({"message": "hello"}) + +# Shutdown +await system.shutdown() +``` + +#### 2. Ray-Style Global API (Convenience) + +```python +import pulsing as pul + +# Initialize global system +await pul.init(addr="0.0.0.0:8000") + +# Spawn actors using global system +actor = await pul.spawn(MyActor(), name="my_actor") + +# Communicate +response = await actor.ask({"message": "hello"}) + +# Shutdown +await pul.shutdown() +``` + +#### 3. 
Ray-Compatible API (Migration) + +```python +from pulsing.compat import ray + +# Ray-compatible API for easy migration +ray.init(address="0.0.0.0:8000") + +@ray.remote +class MyActor: + def process(self, data): + return f"Processed: {data}" + +actor = MyActor.remote() +result = ray.get(actor.process.remote("hello")) + +ray.shutdown() +``` + +### Actor Patterns + +#### Remote Decorator (Recommended) + +```python +import pulsing as pul + +@pul.remote +class Counter: + def __init__(self, init=0): + self.value = init + + # Synchronous method - serial execution + def incr(self): + self.value += 1 + return self.value + + # Asynchronous method - concurrent execution + async def fetch_and_add(self, url): + data = await http_get(url) + self.value += data + return self.value + +# Usage +counter = await Counter.spawn(name="counter") +result = await counter.incr() +``` + +#### Base Actor Class + +```python +from pulsing.actor import Actor + +class MyActor(Actor): + async def receive(self, msg): + if msg.get("action") == "greet": + return f"Hello, {msg.get('name', 'World')}!" 
+ return "Unknown action" + +# Usage +system = await pul.actor_system() +actor = await system.spawn(MyActor(), name="greeter") +response = await actor.ask({"action": "greet", "name": "Alice"}) +``` + +### Message Passing + +#### Ask vs Tell + +- **`ask(msg)`**: Request/response pattern, waits for and returns a response +- **`tell(msg)`**: Fire-and-forget pattern, sends message without waiting + +```python +# Ask - get response +response = await actor.ask({"action": "compute", "data": [1, 2, 3]}) + +# Tell - no response expected +await actor.tell({"action": "log", "level": "info", "message": "Event occurred"}) +``` + +### Streaming + +Pulsing supports streaming responses for large data or continuous generation: + +```python +@pul.remote +class StreamingService: + async def generate_tokens(self, prompt): + for token in generate_tokens(prompt): + yield token + +# Usage +service = await StreamingService.spawn() +async for token in service.generate_tokens("Hello world"): + print(token, end="") +``` + +### Supervision & Fault Tolerance + +Actors can be configured with restart policies for fault tolerance: + +```python +@pul.remote( + restart_policy="on_failure", # "never", "on_failure", "always" + max_restarts=3, + min_backoff=0.1, + max_backoff=30.0 +) +class ResilientWorker: + def process(self, data): + # If this raises an exception, the actor will be restarted + return risky_computation(data) +``` + +### Distributed Queues + +Pulsing includes a distributed queue system for data pipelines: + +```python +# Writer +writer = await system.queue.write("my_topic", bucket_column="user_id") +await writer.put({"user_id": "u1", "data": "hello"}) +await writer.flush() + +# Reader +reader = await system.queue.read("my_topic") +records = await reader.get(limit=100) +``` + +## Rust APIs + +### Core Traits + +Rust API is organized into trait layers: + +#### ActorSystemCoreExt (Primary API) + +```rust +use pulsing_actor::prelude::*; + +// Spawn actors +let actor = 
system.spawn_named("services/echo", EchoActor).await?; + +// Communicate +let response = actor.ask(Ping(42)).await?; +``` + +#### Actor Implementation + +```rust +use pulsing_actor::prelude::*; +use async_trait::async_trait; + +struct MyActor; + +#[async_trait] +impl Actor for MyActor { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + // Process message and return response + Message::pack(&Pong(42)) + } +} +``` + +### Behavior (Type-Safe Actors) + +```rust +use pulsing_actor::prelude::*; + +fn counter(init: i32) -> Behavior { + stateful(init, |count, n, _ctx| { + *count += n; + BehaviorAction::Same + }) +} + +// Usage +let counter = system.spawn(counter(0)).await?; +``` + +## Error Handling + +### Python + +```python +try: + response = await actor.ask({"action": "process", "data": data}) +except RuntimeError as e: + # Actor-side exceptions are wrapped as RuntimeError + print(f"Actor error: {e}") +except ConnectionError as e: + # Network errors + print(f"Connection error: {e}") +except asyncio.TimeoutError as e: + # Timeout errors + print(f"Timeout: {e}") +``` + +### Rust + +```rust +use anyhow::Result; + +match actor.ask(Ping(42)).await { + Ok(response) => println!("Got: {:?}", response), + Err(e) => println!("Error: {:?}", e), +} +``` + +## Security Considerations + +### Trust Boundaries + +- **Pickle payloads** in Python-Python communication can lead to RCE if untrusted +- Use TLS in production deployments +- Treat the cluster as an authenticated trust boundary + +### Network Security + +```python +# Enable TLS +system = await pul.actor_system( + addr="0.0.0.0:8000", + passphrase="your-secret-passphrase" +) +``` + +## Performance Characteristics + +- **Low Latency**: HTTP/2 transport with binary serialization +- **High Throughput**: Async runtime with efficient task scheduling +- **Memory Efficient**: Actor-based concurrency without threads +- **Scalable**: Gossip-based cluster discovery for large deployments + +## Next 
Steps + +- **[Python API Reference](python.md)**: Complete Python API documentation +- **[Rust API Reference](rust.md)**: Complete Rust API documentation +- **[Examples](../examples/)**: Working code examples +- **[Guide](../guide/)**: In-depth guides and tutorials \ No newline at end of file diff --git a/docs/src/api/overview.zh.md b/docs/src/api/overview.zh.md new file mode 100644 index 000000000..9aa863fac --- /dev/null +++ b/docs/src/api/overview.zh.md @@ -0,0 +1,299 @@ +# API 抂述 + +Pulsing 是䞀䞪分垃匏 actor 框架可以䜜䞺任意分垃匏系统的通信骚干以方䟿快速搭建分垃匏系统和应甚。 + +## 栞心抂念 + +### Actor 系统 + +Pulsing 基于[Actor 暡型](https://en.wikipedia.org/wiki/Actor_model)构建其䞭 actor 是计算的基本单䜍。Actor 通过匂步消息䌠递进行通信提䟛 + +- **䜍眮透明性**本地和远皋 actor 䜿甚盞同 API +- **容错性**Actor 可以独立倱莥䞍䌚圱响其他 actor +- **并发性**Actor 䞀次倄理䞀条消息简化并发猖皋 + +### 䞻芁特性 + +- **零倖郚䟝赖**纯 Rust + Tokio 实现 +- **内眮服务发现**SWIM/Gossip 协议管理集矀 +- **流匏支持**原生支持流匏请求/响应 +- **倚语蚀**Python 䌘先Rust 栞心可扩展到其他语蚀 + +## API 风栌 + +### Python API + +Pulsing 提䟛倚种 API 风栌来适应䞍同甚䟋 + +#### 1. Actor System 风栌星匏管理 + +```python +import pulsing as pul + +# 星匏创建和管理 actor 系统 +system = await pul.actor_system(addr="0.0.0.0:8000") + +# 生成 actor +actor = await system.spawn(MyActor(), name="my_actor") + +# 通信 +response = await actor.ask({"message": "hello"}) + +# 关闭 +await system.shutdown() +``` + +#### 2. Ray 风栌党局 API䟿捷 + +```python +import pulsing as pul + +# 初始化党局系统 +await pul.init(addr="0.0.0.0:8000") + +# 䜿甚党局系统生成 actor +actor = await pul.spawn(MyActor(), name="my_actor") + +# 通信 +response = await actor.ask({"message": "hello"}) + +# 关闭 +await pul.shutdown() +``` + +#### 3. 
Ray 兌容 API迁移 + +```python +from pulsing.compat import ray + +# Ray 兌容 API方䟿迁移 +ray.init(address="0.0.0.0:8000") + +@ray.remote +class MyActor: + def process(self, data): + return f"Processed: {data}" + +actor = MyActor.remote() +result = ray.get(actor.process.remote("hello")) + +ray.shutdown() +``` + +### Actor 暡匏 + +#### Remote 装饰噚掚荐 + +```python +import pulsing as pul + +@pul.remote +class Counter: + def __init__(self, init=0): + self.value = init + + # 同步方法 - 䞲行执行 + def incr(self): + self.value += 1 + return self.value + + # 匂步方法 - 并发执行 + async def fetch_and_add(self, url): + data = await http_get(url) + self.value += data + return self.value + +# 䜿甚 +counter = await Counter.spawn(name="counter") +result = await counter.incr() +``` + +#### 基础 Actor ç±» + +```python +from pulsing.actor import Actor + +class MyActor(Actor): + async def receive(self, msg): + if msg.get("action") == "greet": + return f"Hello, {msg.get('name', 'World')}!" + return "Unknown action" + +# 䜿甚 +system = await pul.actor_system() +actor = await system.spawn(MyActor(), name="greeter") +response = await actor.ask({"action": "greet", "name": "Alice"}) +``` + +### 消息䌠递 + +#### Ask vs Tell + +- **`ask(msg)`**请求/响应暡匏等埅并返回响应 +- **`tell(msg)`**发射后䞍管暡匏发送消息䞍等埅 + +```python +# Ask - 获取响应 +response = await actor.ask({"action": "compute", "data": [1, 2, 3]}) + +# Tell - 无需响应 +await actor.tell({"action": "log", "level": "info", "message": "Event occurred"}) +``` + +### 流匏响应 + +Pulsing 支持流匏响应甚于倧数据或持续生成 + +```python +@pul.remote +class StreamingService: + async def generate_tokens(self, prompt): + for token in generate_tokens(prompt): + yield token + +# 䜿甚 +service = await StreamingService.spawn() +async for token in service.generate_tokens("Hello world"): + print(token, end="") +``` + +### 监督䞎容错 + +Actor 可以配眮重启策略以实现容错 + +```python +@pul.remote( + restart_policy="on_failure", # "never", "on_failure", "always" + max_restarts=3, + min_backoff=0.1, + max_backoff=30.0 +) +class ResilientWorker: + 
def process(self, data): + # 劂果抛出匂垞Actor 䌚自劚重启 + return risky_computation(data) +``` + +### 分垃匏队列 + +Pulsing 包含分垃匏队列系统甚于数据管道 + +```python +# 写入 +writer = await system.queue.write("my_topic", bucket_column="user_id") +await writer.put({"user_id": "u1", "data": "hello"}) +await writer.flush() + +# 读取 +reader = await system.queue.read("my_topic") +records = await reader.get(limit=100) +``` + +## Rust API + +### 栞心 Trait + +Rust API 通过 trait 定义契纊分䞺䞉层 + +#### ActorSystemCoreExt䞻路埄prelude 自劚富入 + +```rust +use pulsing_actor::prelude::*; + +// 生成 actor +let actor = system.spawn_named("services/echo", EchoActor).await?; + +// 通信 +let response = actor.ask(Ping(42)).await?; +``` + +#### ActorSystemAdvancedExt高级可重启监督 + +Factory 暡匏生成支持监督重启仅呜名 actor + +```rust +let options = SpawnOptions::new() + .supervision(SupervisionSpec::on_failure().max_restarts(3)); + +// 仅呜名 actor 支持 supervision +system.spawn_named_factory("services/worker", || Ok(Worker::new()), options).await?; +``` + +#### ActorSystemOpsExt运绎/诊断/生呜呚期 + +系统信息、集矀成员、停止/关闭等 + +```rust +system.node_id(); +system.addr(); +system.members().await; +system.all_named_actors().await; +system.stop("name").await?; +system.shutdown().await?; +``` + +### Behavior类型安党Akka Typed 风栌 + +- **栞心**`Behavior` + `TypedRef` + `BehaviorAction (Same/Become/Stop)` +- **纊定**`TypedRef` 芁求 `M: Serialize + DeserializeOwned + Send + 'static` + +## 错误倄理 + +### Python + +```python +try: + response = await actor.ask({"action": "process", "data": data}) +except RuntimeError as e: + # Actor 端匂垞䜜䞺 RuntimeError 䌠蟓 + print(f"Actor error: {e}") +except ConnectionError as e: + # 眑络错误 + print(f"Connection error: {e}") +except asyncio.TimeoutError as e: + # 超时错误 + print(f"Timeout: {e}") +``` + +### Rust + +```rust +use anyhow::Result; + +match actor.ask(Ping(42)).await { + Ok(response) => println!("Got: {:?}", response), + Err(e) => println!("Error: {:?}", e), +} +``` + +## 安党考虑 + +### 信任蟹界 + +- **Pickle 蜜荷**圚 Python-Python 通信䞭可胜富臎 RCE +- 
生产环境䜿甚 TLS +- 将集矀视䞺经过讀证的信任蟹界 + +### 眑络安党 + +```python +# 启甚 TLS +system = await pul.actor_system( + addr="0.0.0.0:8000", + passphrase="your-secret-passphrase" +) +``` + +## 性胜特性 + +- **䜎延迟**HTTP/2 䌠蟓䞎二进制序列化 +- **高吞吐量**匂步运行时䞎高效任务调床 +- **内存高效**基于 actor 的并发无需线皋 +- **可扩展**Gossip 基础集矀发现适甚于倧型郚眲 + +## 后续步骀 + +- **[Python API](python.md)**: Python 接口完敎文档 +- **[Rust API](rust.md)**: Rust 接口完敎文档 +- **[瀺䟋](../../examples/)**: 工䜜代码瀺䟋 +- **[指南](../../guide/)**: 深入指南和教皋 \ No newline at end of file diff --git a/docs/src/api/python.md b/docs/src/api/python.md new file mode 100644 index 000000000..b8d534bea --- /dev/null +++ b/docs/src/api/python.md @@ -0,0 +1,39 @@ +# Python API Reference + +This page contains the complete auto-generated API documentation for Pulsing's Python interface. + +## Installation + +Pulsing requires Python 3.10+ and can be installed via pip: + +```bash +pip install pulsing +``` + +For development, clone the repository and install in development mode: + +```bash +git clone https://github.com/DeepLink-org/pulsing +cd pulsing +pip install -e . +``` + +## Core Module + +::: pulsing + +## Actor Module + +::: pulsing.actor + +## Agent Module + +::: pulsing.agent + +## Compatibility Module + +::: pulsing.compat + +## Queue Module + +::: pulsing.queue \ No newline at end of file diff --git a/docs/src/api/python.zh.md b/docs/src/api/python.zh.md new file mode 100644 index 000000000..dbc81bb4c --- /dev/null +++ b/docs/src/api/python.zh.md @@ -0,0 +1,39 @@ +# Python API 参考 + +歀页面包含 Pulsing Python 接口的完敎自劚生成 API 文档。 + +## 安装 + +Pulsing 需芁 Python 3.10+可以通过 pip 安装 + +```bash +pip install pulsing +``` + +匀发时克隆仓库并以匀发暡匏安装 + +```bash +git clone https://github.com/DeepLink-org/pulsing +cd pulsing +pip install -e . 
+``` + +## 栞心暡块 + +::: pulsing + +## Actor 暡块 + +::: pulsing.actor + +## Agent 暡块 + +::: pulsing.agent + +## 兌容性暡块 + +::: pulsing.compat + +## 队列暡块 + +::: pulsing.queue \ No newline at end of file diff --git a/docs/src/api/rust.md b/docs/src/api/rust.md new file mode 100644 index 000000000..ca14ce2da --- /dev/null +++ b/docs/src/api/rust.md @@ -0,0 +1,394 @@ +# Rust API Reference + +This page provides an overview of Pulsing's Rust API with examples and usage patterns. + +## Installation + +Add Pulsing to your `Cargo.toml`: + +```toml +[dependencies] +pulsing-actor = "0.1" +tokio = { version = "1.0", features = ["full"] } +serde = { version = "1.0", features = ["derive"] } +``` + +## Core Concepts + +The Rust API is organized into trait layers that provide different levels of functionality: + +- **ActorSystemCoreExt**: Primary API for spawning and resolving actors +- **ActorSystemAdvancedExt**: Advanced features like supervision and factory-based spawning +- **ActorSystemOpsExt**: Operations, diagnostics, and lifecycle management + +## Quick Start + +```rust +use pulsing_actor::prelude::*; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize)] +struct Ping(i32); + +#[derive(Serialize, Deserialize)] +struct Pong(i32); + +struct Echo; + +#[async_trait::async_trait] +impl Actor for Echo { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + let Ping(x) = msg.unpack()?; + Message::pack(&Pong(x)) + } +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let system = ActorSystem::builder().build().await?; + + // Spawn a named actor + let actor = system.spawn_named("services/echo", Echo).await?; + + // Send a message and wait for response + let Pong(x): Pong = actor.ask(Ping(42)).await?; + + println!("Received: {}", x); + + system.shutdown().await?; + Ok(()) +} +``` + +## Core API + +### ActorSystem + +The main entry point for the actor system. 
+ +```rust +pub struct ActorSystem { /* fields omitted */ } + +impl ActorSystem { + pub fn builder() -> ActorSystemBuilder { + // Create a new actor system builder + } +} +``` + +#### ActorSystemBuilder + +Builder pattern for configuring the actor system. + +```rust +pub struct ActorSystemBuilder { /* fields omitted */ } + +impl ActorSystemBuilder { + pub fn addr>(self, addr: A) -> Self { + // Set the bind address + } + + pub fn seeds>(self, seeds: I) -> Self { + // Set seed nodes for cluster discovery + } + + pub fn build(self) -> impl Future> { + // Build the actor system + } +} +``` + +### ActorSystemCoreExt + +Core spawning and resolving functionality. + +```rust +#[async_trait::async_trait] +pub trait ActorSystemCoreExt { + async fn spawn(&self, actor: A) -> anyhow::Result> + where + A: Actor + 'static; + + async fn spawn_named( + &self, + name: &str, + actor: A + ) -> anyhow::Result> + where + A: Actor + 'static; + + async fn actor_ref(&self, id: &ActorId) -> anyhow::Result; + + async fn resolve(&self, name: &str) -> anyhow::Result; +} +``` + +### Actor Trait + +The core trait that all actors must implement. + +```rust +#[async_trait::async_trait] +pub trait Actor: Send + 'static { + type Message: Serialize + for<'de> Deserialize<'de> + Send + 'static; + + async fn receive( + &mut self, + msg: Message, + ctx: &mut ActorContext + ) -> anyhow::Result; + + fn on_start(&mut self, _id: ActorId, _ctx: &mut ActorContext) {} + + fn on_stop(&mut self, _ctx: &mut ActorContext) {} +} +``` + +### TypedRef + +Type-safe reference to an actor.
+ +```rust +pub struct TypedRef { /* fields omitted */ } + +impl TypedRef +where + M: Serialize + for<'de> Deserialize<'de> + Send + 'static, +{ + pub async fn ask(&self, msg: M) -> anyhow::Result { + // Send message and wait for typed response + } + + pub async fn tell(&self, msg: M) -> anyhow::Result<()> { + // Send message without waiting for response + } +} +``` + +## Advanced Features + +### Supervision + +Actors can be configured with restart policies for fault tolerance. + +```rust +use pulsing_actor::system::SupervisionSpec; + +let options = SpawnOptions::new() + .supervision(SupervisionSpec::on_failure().max_restarts(3)); + +// Factory-based spawning with supervision +system.spawn_named_factory("services/worker", || Ok(Worker::new()), options).await?; +``` + +### Behavior (Type-Safe Actors) + +A higher-level API for type-safe actors using the behavior pattern. + +```rust +use pulsing_actor::prelude::*; + +fn counter(init: i32) -> Behavior { + stateful(init, |count, n, _ctx| { + *count += n; + BehaviorAction::Same + }) +} + +// Behavior implements IntoActor trait +let counter = system.spawn(counter(0)).await?; +let result: i32 = counter.ask(5).await?; // Result is 5 +``` + +### Message Types + +#### Regular Messages + +```rust +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize)] +struct MyMessage { + action: String, + data: Vec, +} + +// Pack/unpack messages +let msg = Message::pack(&MyMessage { + action: "process".to_string(), + data: vec![1, 2, 3], +})?; + +let MyMessage { action, data } = msg.unpack()?; +``` + +#### Streaming Messages + +```rust +// For streaming responses +let stream_msg = Message::Stream(Stream::from_iter(items)); + +// Handle streaming in actor +async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + match msg { + Message::Stream(stream) => { + // Process stream + let result = process_stream(stream).await?; + Message::pack(&result) + } + _ => Message::pack(&"Unsupported message 
type") + } +} +``` + +## Cluster Management + +### Node Discovery + +Pulsing uses the SWIM protocol for automatic cluster discovery. + +```rust +// Single node +let system = ActorSystem::builder() + .addr("0.0.0.0:8000") + .build() + .await?; + +// Join existing cluster +let system = ActorSystem::builder() + .addr("0.0.0.0:8001") + .seeds(vec!["127.0.0.1:8000".to_string()]) + .build() + .await?; +``` + +### ActorSystemOpsExt + +Operations and diagnostics. + +```rust +#[async_trait::async_trait] +pub trait ActorSystemOpsExt { + fn node_id(&self) -> NodeId; + + fn addr(&self) -> &str; + + async fn members(&self) -> Vec; + + async fn all_named_actors(&self) -> HashMap; + + async fn stop(&self, name: &str) -> anyhow::Result<()>; + + async fn shutdown(self) -> anyhow::Result<()>; +} +``` + +## Error Handling + +Pulsing uses `anyhow::Result` for error handling throughout the API. + +```rust +use anyhow::{Result, Context}; + +async fn my_actor_logic(system: &ActorSystem) -> Result<()> { + let actor = system.spawn_named("my_actor", MyActor) + .await + .context("Failed to spawn actor")?; + + let response = actor.ask(MyMessage::default()) + .await + .context("Failed to send message")?; + + Ok(()) +} +``` + +## Examples + +### HTTP Server Actor + +```rust +use pulsing_actor::prelude::*; +use warp::Filter; + +struct HttpServer { + system: ActorSystem, +} + +#[async_trait::async_trait] +impl Actor for HttpServer { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + // Handle HTTP requests by forwarding to other actors + let request: HttpRequest = msg.unpack()?; + let processor = self.system.resolve("request_processor").await?; + processor.ask(request).await + } +} +``` + +### Worker Pool + +```rust +use pulsing_actor::prelude::*; + +struct WorkerPool { + workers: Vec, +} + +#[async_trait::async_trait] +impl Actor for WorkerPool { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + // Round-robin task 
distribution + let worker = &self.workers[self.next_worker()]; + worker.ask(msg).await + } +} +``` + +## Performance Considerations + +- **Zero-copy messaging**: Messages are passed by reference when possible +- **Async runtime**: Built on Tokio for high concurrency +- **Binary serialization**: Efficient bincode serialization +- **Connection pooling**: HTTP/2 connection reuse + +## Integration + +### With Axum/Warp + +```rust +use axum::{routing::post, Router}; +use pulsing_actor::prelude::*; + +async fn handle_request( + Extension(system): Extension, + Json(payload): Json, +) -> Json { + let actor = system.resolve("request_handler").await?; + let response: MyResponse = actor.ask(payload).await?; + Ok(Json(response)) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let system = ActorSystem::builder().build().await?; + + let app = Router::new() + .route("/api", post(handle_request)) + .layer(Extension(system.clone())); + + // Start both HTTP server and actor system + tokio::select! 
{ + _ = serve(app, ([127, 0, 0, 1], 3000)) => {}, + _ = system.run() => {}, + } + + Ok(()) +} +``` + +## Next Steps + +- **[Python API](python.md)**: Python interface documentation +- **[Examples](../../examples/)**: Working Rust examples +- **[Design Documents](../../design/)**: Architecture and design decisions \ No newline at end of file diff --git a/docs/src/api/rust.zh.md b/docs/src/api/rust.zh.md new file mode 100644 index 000000000..d1ca1da38 --- /dev/null +++ b/docs/src/api/rust.zh.md @@ -0,0 +1,394 @@ +# Rust API 参考 + +歀页面提䟛 Pulsing Rust API 的抂述包括瀺䟋和䜿甚暡匏。 + +## 安装 + +圚 `Cargo.toml` 䞭添加 Pulsing + +```toml +[dependencies] +pulsing-actor = "0.1" +tokio = { version = "1.0", features = ["full"] } +serde = { version = "1.0", features = ["derive"] } +``` + +## 栞心抂念 + +Rust API 通过 trait 层组织提䟛䞍同级别的功胜 + +- **ActorSystemCoreExt**䞻芁 API甚于生成和解析 actor +- **ActorSystemAdvancedExt**高级功胜劂监督和基于工厂的生成 +- **ActorSystemOpsExt**运绎、诊断和生呜呚期管理 + +## 快速匀始 + +```rust +use pulsing_actor::prelude::*; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize)] +struct Ping(i32); + +#[derive(Serialize, Deserialize)] +struct Pong(i32); + +struct Echo; + +#[async_trait::async_trait] +impl Actor for Echo { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + let Ping(x) = msg.unpack()?; + Message::pack(&Pong(x)) + } +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let system = ActorSystem::builder().build().await?; + + // 生成呜名 actor + let actor = system.spawn_named("services/echo", Echo).await?; + + // 发送消息并等埅响应 + let Pong(x): Pong = actor.ask(Ping(42)).await?; + + println!("Received: {}", x); + + system.shutdown().await?; + Ok(()) +} +``` + +## 栞心 API + +### ActorSystem + +Actor 系统的䞻芁入口点。 + +```rust +pub struct ActorSystem { /* fields omitted */ } + +impl ActorSystem { + pub fn builder() -> ActorSystemBuilder { + // 创建新的 actor 系统构建噚 + } +} +``` + +#### ActorSystemBuilder + +构建噚暡匏甚于配眮 actor 系统。 + +```rust
+pub struct ActorSystemBuilder { /* fields omitted */ } + +impl ActorSystemBuilder { + pub fn addr>(self, addr: A) -> Self { + // 讟眮绑定地址 + } + + pub fn seeds>(self, seeds: I) -> Self { + // 讟眮集矀发现的种子节点 + } + + pub fn build(self) -> impl Future> { + // 构建 actor 系统 + } +} +``` + +### ActorSystemCoreExt + +栞心生成和解析功胜。 + +```rust +#[async_trait::async_trait] +pub trait ActorSystemCoreExt { + async fn spawn(&self, actor: A) -> anyhow::Result> + where + A: Actor + 'static; + + async fn spawn_named( + &self, + name: &str, + actor: A + ) -> anyhow::Result> + where + A: Actor + 'static; + + async fn actor_ref(&self, id: &ActorId) -> anyhow::Result; + + async fn resolve(&self, name: &str) -> anyhow::Result; +} +``` + +### Actor Trait + +所有 actor 必须实现的 core trait。 + +```rust +#[async_trait::async_trait] +pub trait Actor: Send + 'static { + type Message: Serialize + for<'de> Deserialize<'de> + Send + 'static; + + async fn receive( + &mut self, + msg: Message, + ctx: &mut ActorContext + ) -> anyhow::Result; + + fn on_start(&mut self, _id: ActorId, _ctx: &mut ActorContext) {} + + fn on_stop(&mut self, _ctx: &mut ActorContext) {} +} +``` + +### TypedRef + +Actor 的类型安党匕甚。 + +```rust +pub struct TypedRef { /* fields omitted */ } + +impl TypedRef +where + M: Serialize + for<'de> Deserialize<'de> + Send + 'static, +{ + pub async fn ask(&self, msg: M) -> anyhow::Result { + // 发送消息并等埅类型化响应 + } + + pub async fn tell(&self, msg: M) -> anyhow::Result<()> { + // 发送消息而䞍等埅响应 + } +} +``` + +## 高级功胜 + +### 监督 + +Actor 可以配眮重启策略以实现容错。 + +```rust +use pulsing_actor::system::SupervisionSpec; + +let options = SpawnOptions::new() + .supervision(SupervisionSpec::on_failure().max_restarts(3)); + +// 基于工厂的生成支持监督 +system.spawn_named_factory("services/worker", || Ok(Worker::new()), options).await?; +``` + +### Behavior类型安党 Actor + +䜿甚行䞺暡匏的曎高级别 API。 + +```rust +use pulsing_actor::prelude::*; + +fn counter(init: i32) -> Behavior { + stateful(init, |count, n, _ctx| { + *count += n; + BehaviorAction::Same 
+ }) +} + +// Behavior 实现 IntoActor trait +let counter = system.spawn(counter(0)).await?; +let result: i32 = counter.ask(5).await?; // Result is 5 +``` + +### 消息类型 + +#### 垞规消息 + +```rust +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize)] +struct MyMessage { + action: String, + data: Vec, +} + +// 打包/解包消息 +let msg = Message::pack(&MyMessage { + action: "process".to_string(), + data: vec![1, 2, 3], +})?; + +let MyMessage { action, data } = msg.unpack()?; +``` + +#### 流匏消息 + +```rust +// 流匏响应 +let stream_msg = Message::Stream(Stream::from_iter(items)); + +// 圚 actor 䞭倄理流匏 +async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + match msg { + Message::Stream(stream) => { + // 倄理流 + let result = process_stream(stream).await?; + Message::pack(&result) + } + _ => Message::pack(&"Unsupported message type") + } +} +``` + +## 集矀管理 + +### 节点发现 + +Pulsing 䜿甚 SWIM 协议进行自劚集矀发现。 + +```rust +// 单节点 +let system = ActorSystem::builder() + .addr("0.0.0.0:8000") + .build() + .await?; + +// 加入现有集矀 +let system = ActorSystem::builder() + .addr("0.0.0.0:8001") + .seeds(vec!["127.0.0.1:8000".to_string()]) + .build() + .await?; +``` + +### ActorSystemOpsExt + +运绎和诊断。 + +```rust +#[async_trait::async_trait] +pub trait ActorSystemOpsExt { + fn node_id(&self) -> NodeId; + + fn addr(&self) -> &str; + + async fn members(&self) -> Vec; + + async fn all_named_actors(&self) -> HashMap; + + async fn stop(&self, name: &str) -> anyhow::Result<()>; + + async fn shutdown(self) -> anyhow::Result<()>; +} +``` + +## 错误倄理 + +Pulsing 圚敎䞪 API 䞭䜿甚 `anyhow::Result` 进行错误倄理。 + +```rust +use anyhow::{Result, Context}; + +async fn my_actor_logic(system: &ActorSystem) -> Result<()> { + let actor = system.spawn_named("my_actor", MyActor) + .await + .context("Failed to spawn actor")?; + + let response = actor.ask(MyMessage::default()) + .await + .context("Failed to send message")?; + + Ok(()) +} +``` + +## 瀺䟋 + +### HTTP 服务噚 Actor + +```rust +use 
pulsing_actor::prelude::*; +use warp::Filter; + +struct HttpServer { + system: ActorSystem, +} + +#[async_trait::async_trait] +impl Actor for HttpServer { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + // 通过蜬发到其他 actor 来倄理 HTTP 请求 + let request: HttpRequest = msg.unpack()?; + let processor = self.system.resolve("request_processor").await?; + processor.ask(request).await + } +} +``` + +### Worker æ±  + +```rust +use pulsing_actor::prelude::*; + +struct WorkerPool { + workers: Vec, +} + +#[async_trait::async_trait] +impl Actor for WorkerPool { + async fn receive(&mut self, msg: Message, _ctx: &mut ActorContext) -> anyhow::Result { + // 蜮询任务分发 + let worker = &self.workers[self.next_worker()]; + worker.ask(msg).await + } +} +``` + +## 性胜考虑 + +- **零拷莝消息䌠递**消息尜可胜通过匕甚䌠递 +- **匂步运行时**基于 Tokio 实现高并发 +- **二进制序列化**高效的 bincode 序列化 +- **连接池化**HTTP/2 连接重甚 + +## 集成 + +### 侎 Axum/Warp 集成 + +```rust +use axum::{routing::post, Router}; +use pulsing_actor::prelude::*; + +async fn handle_request( + Extension(system): Extension, + Json(payload): Json, +) -> Json { + let actor = system.resolve("request_handler").await?; + let response: MyResponse = actor.ask(payload).await?; + Ok(Json(response)) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let system = ActorSystem::builder().build().await?; + + let app = Router::new() + .route("/api", post(handle_request)) + .layer(Extension(system.clone())); + + // 同时启劚 HTTP 服务噚和 actor 系统 + tokio::select! 
{ + _ = serve(app, ([127, 0, 0, 1], 3000)) => {}, + _ = system.run() => {}, + } + + Ok(()) +} +``` + +## 后续步骀 + +- **[Python API](python.md)**: Python 接口文档 +- **[瀺䟋](../../examples/)**: Rust 瀺䟋代码 +- **[讟计文档](../../design/)**: 架构和讟计决策 \ No newline at end of file diff --git a/docs/src/api_reference.md b/docs/src/api_reference.md index 9804a38e2..fafb074b8 100644 --- a/docs/src/api_reference.md +++ b/docs/src/api_reference.md @@ -69,30 +69,51 @@ Note: error *type information and remote stack traces* are not guaranteed to be Create a new Actor System instance. ```python +import asyncio import pulsing as pul -system = await pul.actor_system( - addr: str | None = None, # Bind address, None for standalone - *, - seeds: list[str] | None = None, # Seed nodes for cluster - passphrase: str | None = None, # TLS passphrase -) -> ActorSystem +async def example(): + system = await pul.actor_system( + addr=None, # Bind address, None for standalone + # Keyword-only arguments follow + seeds=None, # Seed nodes for cluster (list[str] or None) + passphrase=None, # TLS passphrase (str or None) + ) + return system + +# Usage example +if __name__ == "__main__": + # To run the example + # system = asyncio.run(example()) + pass ``` +**Returns:** `ActorSystem` instance + **Example:** ```python -# Standalone mode -system = await pul.actor_system() +import asyncio +import pulsing as pul -# Cluster mode -system = await pul.actor_system(addr="0.0.0.0:8000") +async def main(): + # Standalone mode + system = await pul.actor_system() + await system.shutdown() -# Join existing cluster -system = await pul.actor_system(addr="0.0.0.0:8001", seeds=["127.0.0.1:8000"]) + # Cluster mode + system = await pul.actor_system(addr="0.0.0.0:8000") + await system.shutdown() -# Shutdown -await system.shutdown() + # Join existing cluster + system = await pul.actor_system( + addr="0.0.0.0:8001", + seeds=["127.0.0.1:8000"] + ) + await system.shutdown() + +if __name__ == "__main__": + asyncio.run(main()) ``` ### 
pul.init / pul.shutdown @@ -100,19 +121,33 @@ await system.shutdown() Global system initialization (Ray-style async API). ```python +import asyncio import pulsing as pul -# Initialize global system -await pul.init(addr=None, seeds=None, passphrase=None) +class MyActor: + async def receive(self, msg): + return f"echo: {msg}" + +async def main(): + # Initialize global system + await pul.init(addr=None, seeds=None, passphrase=None) + + # Use global system + actor = await pul.spawn(MyActor()) + ref = await pul.resolve("actor_name") -# Use global system -actor = await pul.spawn(MyActor()) -ref = await pul.resolve("actor_name") + # Shutdown + await pul.shutdown() -# Shutdown -await pul.shutdown() +if __name__ == "__main__": + asyncio.run(main()) ``` +**Parameters:** +- `addr`: Bind address (str or None for standalone) +- `seeds`: Seed nodes to join cluster (list[str] or None) +- `passphrase`: TLS passphrase (str or None) + ## Core Classes ### ActorSystem @@ -123,15 +158,15 @@ Main entry point for the actor system. class ActorSystem: async def spawn( self, - actor: Actor, - *, - name: str | None = None, + actor, # Actor instance + *, # Keyword-only arguments follow + name=None, # Actor name (str or None) # public parameter is deprecated: all named actors are resolvable - restart_policy: str = "never", - max_restarts: int = 3, - min_backoff: float = 0.1, - max_backoff: float = 30.0 - ) -> ActorRef: + restart_policy="never", # Restart policy ("never", "on_failure", "always") + max_restarts=3, # Maximum restart attempts + min_backoff=0.1, # Minimum backoff seconds + max_backoff=30.0 # Maximum backoff seconds + ): """ Spawn a new actor. 
@@ -140,15 +175,15 @@ class ActorSystem: """ pass - async def refer(self, actorid: ActorId | str) -> ActorRef: + async def refer(self, actorid): """Get ActorRef by ActorId.""" pass - async def resolve(self, name: str, *, node_id: int | None = None) -> ActorRef: + async def resolve(self, name, *, node_id=None): """Resolve actor by name.""" pass - async def shutdown(self) -> None: + async def shutdown(self): """Shutdown the actor system.""" pass ``` @@ -160,15 +195,15 @@ Low-level reference to an actor. Use `ask()` and `tell()` to communicate. ```python class ActorRef: @property - def actor_id(self) -> ActorId: + def actor_id(self): """Get the actor's ID.""" pass - async def ask(self, msg: Any) -> Any: + async def ask(self, msg): """Send a message and wait for response.""" pass - async def tell(self, msg: Any) -> None: + async def tell(self, msg): """Send a message without waiting for response (fire-and-forget).""" pass ``` @@ -180,7 +215,7 @@ High-level proxy for `@remote` classes. Call methods directly. 
```python class ActorProxy: @property - def ref(self) -> ActorRef: + def ref(self): """Get underlying ActorRef.""" pass @@ -203,12 +238,12 @@ class Counter: self.value = init_value # Sync method - sequential execution - def incr(self) -> int: + def incr(self): self.value += 1 return self.value # Async method - concurrent execution during await - async def fetch_and_add(self, url: str) -> int: + async def fetch_and_add(self, url): data = await http_get(url) self.value += data return self.value diff --git a/docs/src/api_reference.zh.md b/docs/src/api_reference.zh.md index f3a71ae93..6195f8b91 100644 --- a/docs/src/api_reference.zh.md +++ b/docs/src/api_reference.zh.md @@ -69,30 +69,52 @@ Pulsing Actor 框架的完敎 API 文档。 创建新的 Actor System 实䟋。 ```python +import asyncio import pulsing as pul -system = await pul.actor_system( - addr: str | None = None, # 绑定地址None 䞺单机暡匏 - *, - seeds: list[str] | None = None, # 集矀种子节点 - passphrase: str | None = None, # TLS 密码短语 -) -> ActorSystem +async def example(): + # 凜数筟名: actor_system(addr=None, *, seeds=None, passphrase=None) -> ActorSystem + system = await pul.actor_system( + addr=None, # 绑定地址str 或 None 衚瀺单机暡匏 + # 关键字参数匀始 + seeds=None, # 集矀种子节点list[str] 或 None + passphrase=None, # TLS 密码短语str 或 None + ) + return system + +# 䜿甚瀺䟋 +if __name__ == "__main__": + # 运行瀺䟋 + # system = asyncio.run(example()) + pass ``` +**返回:** `ActorSystem` 实䟋 + **瀺䟋** ```python -# 单机暡匏 -system = await pul.actor_system() +import asyncio +import pulsing as pul -# 集矀暡匏 -system = await pul.actor_system(addr="0.0.0.0:8000") +async def main(): + # 单机暡匏 + system = await pul.actor_system() + await system.shutdown() -# 加入现有集矀 -system = await pul.actor_system(addr="0.0.0.0:8001", seeds=["127.0.0.1:8000"]) + # 集矀暡匏 + system = await pul.actor_system(addr="0.0.0.0:8000") + await system.shutdown() -# 关闭 -await system.shutdown() + # 加入现有集矀 + system = await pul.actor_system( + addr="0.0.0.0:8001", + seeds=["127.0.0.1:8000"] + ) + await system.shutdown() + 
+if __name__ == "__main__": + asyncio.run(main()) ``` ### pul.init / pul.shutdown @@ -100,19 +122,33 @@ await system.shutdown() 党局系统初始化Ray 风栌匂步 API。 ```python +import asyncio import pulsing as pul -# 初始化党局系统 -await pul.init(addr=None, seeds=None, passphrase=None) +class MyActor: + async def receive(self, msg): + return f"echo: {msg}" -# 䜿甚党局系统 -actor = await pul.spawn(MyActor()) -ref = await pul.resolve("actor_name") +async def main(): + # 初始化党局系统 + await pul.init(addr=None, seeds=None, passphrase=None) -# 关闭 -await pul.shutdown() + # 䜿甚党局系统 + actor = await pul.spawn(MyActor()) + ref = await pul.resolve("actor_name") + + # 关闭 + await pul.shutdown() + +if __name__ == "__main__": + asyncio.run(main()) ``` +**参数:** +- `addr`: 绑定地址str 或 None 衚瀺单机暡匏 +- `seeds`: 加入集矀的种子节点list[str] 或 None +- `passphrase`: TLS 密码短语str 或 None + ## 栞心类 ### ActorSystem @@ -123,15 +159,15 @@ Actor 系统的䞻入口点。 class ActorSystem: async def spawn( self, - actor: Actor, - *, - name: str | None = None, + actor, # Actor 实䟋 + *, # 关键字参数匀始 + name=None, # Actor 名称str 或 None # public 参数已废匃所有呜名 actor 自劚可被 resolve - restart_policy: str = "never", - max_restarts: int = 3, - min_backoff: float = 0.1, - max_backoff: float = 30.0 - ) -> ActorRef: + restart_policy="never", # 重启策略"never" | "on_failure" | "always" + max_restarts=3, # 最倧重启次数 + min_backoff=0.1, # 最小退避时闎秒 + max_backoff=30.0 # 最倧退避时闎秒 + ): """ 生成新的 actor。 @@ -140,15 +176,30 @@ class ActorSystem: """ pass - async def refer(self, actorid: ActorId | str) -> ActorRef: - """通过 ActorId 获取 ActorRef。""" + async def refer(self, actorid): + """ + 通过 ActorId 获取 ActorRef。 + + **参数:** + - `actorid`: Actor IDActorId 实䟋或字笊䞲栌匏 "node_id:local_id" + + **返回:** 对应 actor 的 ActorRef + """ pass - async def resolve(self, name: str, *, node_id: int | None = None) -> ActorRef: - """通过名称解析 actor。""" + async def resolve(self, name, *, node_id=None): + """ + 通过名称解析 actor。 + + **参数:** + - `name`: Actor 名称str + - `node_id`: 目标节点 IDint 或 None + + 
**返回:** 对应 actor 的 ActorRef + """ pass - async def shutdown(self) -> None: + async def shutdown(self): """关闭 actor 系统。""" pass ``` @@ -160,15 +211,22 @@ Actor 的底层匕甚。䜿甚 `ask()` 和 `tell()` 进行通信。 ```python class ActorRef: @property - def actor_id(self) -> ActorId: + def actor_id(self): """获取 actor 的 ID。""" pass - async def ask(self, msg: Any) -> Any: - """发送消息并等埅响应。""" + async def ask(self, msg): + """ + 发送消息并等埅响应。 + + **参数:** + - `msg`: 任意消息对象Any + + **返回:** 响应消息Any + """ pass - async def tell(self, msg: Any) -> None: + async def tell(self, msg): """发送消息䜆䞍等埅响应fire-and-forget。""" pass ``` @@ -180,7 +238,7 @@ class ActorRef: ```python class ActorProxy: @property - def ref(self) -> ActorRef: + def ref(self): """获取底层 ActorRef。""" pass @@ -203,12 +261,12 @@ class Counter: self.value = init_value # 同步方法 - 顺序执行 - def incr(self) -> int: + def incr(self): self.value += 1 return self.value # 匂步方法 - await 期闎可并发执行 - async def fetch_and_add(self, url: str) -> int: + async def fetch_and_add(self, url): data = await http_get(url) self.value += data return self.value diff --git a/docs/src/design/as-actor-decorator.md b/docs/src/design/as-actor-decorator.md index 0e87b820b..d355610d2 100644 --- a/docs/src/design/as-actor-decorator.md +++ b/docs/src/design/as-actor-decorator.md @@ -147,7 +147,7 @@ print(await counter.get()) # 100 ### 匂步方法支持 ```python -@remote +@pul.remote class AsyncWorker: async def fetch_data(self, url): async with aiohttp.ClientSession() as session: @@ -199,7 +199,7 @@ counter.increment(5) # 同步调甚返回 15 ### 1. 
方法讟计 ```python -@remote +@pul.remote class GoodDesign: # ✓ 返回完敎状态 def get_state(self): diff --git a/docs/src/design/as-actor-decorator.zh.md b/docs/src/design/as-actor-decorator.zh.md index 4f7ffddeb..1e1a67ef5 100644 --- a/docs/src/design/as-actor-decorator.zh.md +++ b/docs/src/design/as-actor-decorator.zh.md @@ -147,7 +147,7 @@ print(await counter.get()) # 100 ### 匂步方法支持 ```python -@remote +@pul.remote class AsyncWorker: async def fetch_data(self, url): async with aiohttp.ClientSession() as session: @@ -199,7 +199,7 @@ counter.increment(5) # 同步调甚返回 15 ### 1. 方法讟计 ```python -@remote +@pul.remote class GoodDesign: # ✓ 返回完敎状态 def get_state(self): diff --git a/docs/src/examples/index.md b/docs/src/examples/index.md index 9223242a8..33639fb7e 100644 --- a/docs/src/examples/index.md +++ b/docs/src/examples/index.md @@ -99,7 +99,7 @@ async def main(): Distributing work across multiple workers: ```python -@remote +@pul.remote class Worker: def __init__(self, worker_id: int): self.worker_id = worker_id @@ -122,7 +122,7 @@ class Worker: } -@remote +@pul.remote class WorkerPool: def __init__(self): self.workers = [] @@ -156,7 +156,7 @@ class WorkerPool: ### Simple LLM Service ```python -@remote +@pul.remote class LLMService: def __init__(self, model_name: str): self.model_name = model_name @@ -199,7 +199,7 @@ class LLMService: ### Load-Balanced LLM Cluster ```python -@remote +@pul.remote class LLMRouter: """Routes requests to LLM workers with load balancing.""" diff --git a/docs/src/examples/index.zh.md b/docs/src/examples/index.zh.md index 06c7d44cc..db877e949 100644 --- a/docs/src/examples/index.zh.md +++ b/docs/src/examples/index.zh.md @@ -99,7 +99,7 @@ async def main(): 将工䜜分配给倚䞪工䜜噚 ```python -@remote +@pul.remote class Worker: def __init__(self, worker_id: int): self.worker_id = worker_id @@ -122,7 +122,7 @@ class Worker: } -@remote +@pul.remote class WorkerPool: def __init__(self): self.workers = [] @@ -150,7 +150,7 @@ class WorkerPool: ### 简单 LLM 服务 ```python 
-@remote +@pul.remote class LLMService: def __init__(self, model_name: str): self.model_name = model_name diff --git a/docs/src/guide/actors.md b/docs/src/guide/actors.md index 7fd82e7c0..37e7019b1 100644 --- a/docs/src/guide/actors.md +++ b/docs/src/guide/actors.md @@ -120,7 +120,7 @@ ray.shutdown() result = await calc.add(10) ``` -### Tell (Fire-and-Forget) +### Tell (Fire-and-forget) ```python await actor_ref.tell({"event": "notify", "data": "event_data"}) @@ -131,7 +131,7 @@ await actor_ref.tell({"event": "notify", "data": "event_data"}) For continuous data flow (e.g., LLM token generation), just return a generator: ```python -@remote +@pul.remote class TokenGenerator: async def generate(self, prompt: str): # Just return an async generator - Pulsing handles streaming automatically @@ -150,7 +150,7 @@ async for chunk in generator.generate("Hello"): Pulsing supports automatic actor restart on failure: ```python -@remote( +@pul.remote( restart_policy="on_failure", # "never" | "on_failure" | "always" max_restarts=3, min_backoff=1.0, @@ -172,7 +172,7 @@ class ReliableWorker: ### 1. Stateful Actor ```python -@remote +@pul.remote class SessionManager: def __init__(self): self.sessions = {} @@ -189,7 +189,7 @@ class SessionManager: ### 2. Worker Pool (Round-Robin) ```python -@remote +@pul.remote class WorkerPool: def __init__(self, workers: list): self.workers = workers @@ -204,7 +204,7 @@ class WorkerPool: ### 3. Pipeline ```python -@remote +@pul.remote class PipelineStage: def __init__(self, next_stage=None): self.next_stage = next_stage @@ -219,7 +219,7 @@ class PipelineStage: ### 4. 
LLM Inference Service ```python -@remote +@pul.remote class LLMService: def __init__(self, model_name: str): self.model_name = model_name @@ -250,7 +250,7 @@ class LLMService: ### Error Handling ```python -@remote +@pul.remote class ResilientActor: async def risky_operation(self, data: dict) -> dict: try: @@ -281,7 +281,7 @@ actor = await system.spawn(MyActor(), name="my_actor") # Call method result = await actor.ask({"action": "do_something"}) -# Using @remote decorator (recommended) +# Using @pul.remote decorator (recommended) @pul.remote class MyService: def process(self, data): return data diff --git a/docs/src/guide/actors.zh.md b/docs/src/guide/actors.zh.md index e4d9b817d..b91b5baf4 100644 --- a/docs/src/guide/actors.zh.md +++ b/docs/src/guide/actors.zh.md @@ -131,7 +131,7 @@ await actor_ref.tell({"event": "notify", "data": "event_data"}) 甚于持续数据流劂 LLM token 生成只需返回 generator ```python -@remote +@pul.remote class TokenGenerator: async def generate(self, prompt: str): # 盎接返回 async generator - Pulsing 自劚倄理流匏䌠蟓 @@ -150,7 +150,7 @@ async for chunk in generator.generate("Hello"): Pulsing 支持 Actor 倱莥后自劚重启 ```python -@remote( +@pul.remote( restart_policy="on_failure", # "never" | "on_failure" | "always" max_restarts=3, min_backoff=1.0, @@ -172,7 +172,7 @@ class ReliableWorker: ### 1. 有状态 Actor ```python -@remote +@pul.remote class SessionManager: def __init__(self): self.sessions = {} @@ -189,7 +189,7 @@ class SessionManager: ### 2. Worker 池蜮询 ```python -@remote +@pul.remote class WorkerPool: def __init__(self, workers: list): self.workers = workers @@ -204,7 +204,7 @@ class WorkerPool: ### 3. 流氎线 ```python -@remote +@pul.remote class PipelineStage: def __init__(self, next_stage=None): self.next_stage = next_stage @@ -219,7 +219,7 @@ class PipelineStage: ### 4. 
LLM 掚理服务 ```python -@remote +@pul.remote class LLMService: def __init__(self, model_name: str): self.model_name = model_name @@ -250,7 +250,7 @@ class LLMService: ### 错误倄理 ```python -@remote +@pul.remote class ResilientActor: async def risky_operation(self, data: dict) -> dict: try: diff --git a/docs/src/index.md b/docs/src/index.md index 8a879c8b5..90214abd4 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -122,6 +122,6 @@ asyncio.run(main()) ## Community -- [GitHub Repository](https://github.com/reiase/pulsing) -- [Issue Tracker](https://github.com/reiase/pulsing/issues) -- [Discussions](https://github.com/reiase/pulsing/discussions) +- [GitHub Repository](https://github.com/DeepLink-org/pulsing) +- [Issue Tracker](https://github.com/DeepLink-org/pulsing/issues) +- [Discussions](https://github.com/DeepLink-org/pulsing/discussions) diff --git a/docs/src/index.zh.md b/docs/src/index.zh.md index f22f1145a..564840869 100644 --- a/docs/src/index.zh.md +++ b/docs/src/index.zh.md @@ -122,6 +122,6 @@ asyncio.run(main()) ## 瀟区 -- [GitHub 仓库](https://github.com/reiase/pulsing) -- [Issue 远螪](https://github.com/reiase/pulsing/issues) -- [讚论区](https://github.com/reiase/pulsing/discussions) +- [GitHub 仓库](https://github.com/DeepLink-org/pulsing) +- [Issue 远螪](https://github.com/DeepLink-org/pulsing/issues) +- [讚论区](https://github.com/DeepLink-org/pulsing/discussions) diff --git a/python/pulsing/actor/remote.py b/python/pulsing/actor/remote.py index 6b221b824..36ce248c0 100644 --- a/python/pulsing/actor/remote.py +++ b/python/pulsing/actor/remote.py @@ -1,29 +1,4 @@ -""" -@remote decorator - Ray-like distributed object wrapper - -Usage: - from pulsing.actor import init, shutdown, remote - - @remote - class Counter: - def __init__(self, init_value=0): - self.value = init_value - - def increment(self, n=1): - self.value += n - return self.value - - async def main(): - await init() - - # Create actor - counter = await Counter.spawn(init_value=10) - - # Call methods 
(automatically converted to actor messages) - result = await counter.increment(5) # Returns 15 - - await shutdown() -""" +"""Ray-like distributed object wrapper.""" import asyncio import inspect @@ -39,7 +14,7 @@ async def main(): class _ActorBase(ABC): - """Actor base class (avoids circular imports)""" + """Actor base class.""" def on_start(self, actor_id) -> None: pass @@ -52,21 +27,19 @@ def metadata(self) -> dict[str, str]: @abstractmethod async def receive(self, msg) -> Any: - """Handle incoming message. Can receive and return any Python object.""" + """Handle incoming message.""" pass T = TypeVar("T") -# Global class registry _actor_class_registry: dict[str, type] = {} -# Actor instance metadata registry (actor_name -> metadata) _actor_metadata_registry: dict[str, dict[str, str]] = {} def _register_actor_metadata(name: str, cls: type): - """Register actor metadata for later retrieval""" + """Register actor metadata for later retrieval.""" import inspect metadata = { @@ -74,7 +47,6 @@ def _register_actor_metadata(name: str, cls: type): "python_module": cls.__module__, } - # Try to get source file try: source_file = inspect.getfile(cls) metadata["python_file"] = source_file @@ -85,21 +57,15 @@ def _register_actor_metadata(name: str, cls: type): def get_actor_metadata(name: str) -> dict[str, str] | None: - """Get metadata for an actor by name""" + """Get metadata for an actor by name.""" return _actor_metadata_registry.get(name) -# Python actor service name (different from Rust SystemActor "system/core") PYTHON_ACTOR_SERVICE_NAME = "system/python_actor_service" class ActorProxy: - """Actor proxy: automatically converts method calls to ask messages - - Supports two method types: - - Regular methods: synchronous ask/response - - Async methods: streaming response, non-blocking actor - """ + """Actor proxy.""" def __init__( self, @@ -114,7 +80,6 @@ def __init__( def __getattr__(self, name: str): if name.startswith("_"): raise AttributeError(f"Cannot access private 
attribute: {name}") - # If no method list, allow arbitrary method calls (dynamic mode) if self._method_names is not None and name not in self._method_names: raise AttributeError(f"No method '{name}'") is_async = name in self._async_methods @@ -122,7 +87,7 @@ def __getattr__(self, name: str): @property def ref(self) -> ActorRef: - """Get underlying ActorRef""" + """Get underlying ActorRef.""" return self._ref @classmethod @@ -132,34 +97,12 @@ def from_ref( methods: list[str] | None = None, async_methods: set[str] | None = None, ) -> "ActorProxy": - """Create ActorProxy from ActorRef - - Args: - actor_ref: Underlying actor reference - methods: Optional list of method names. If not provided, allows calling any method (dynamic mode) - async_methods: Set of async method names, these methods use streaming response - - Example: - # Dynamic mode - allows calling any method - ref = await system.resolve_named("my_counter") - proxy = ActorProxy.from_ref(ref) - await proxy.increment(5) # Can call any method - - # Static mode - only allows specified methods - proxy = ActorProxy.from_ref(ref, methods=["increment", "get_value"]) - - # With async method marking - proxy = ActorProxy.from_ref( - ref, - methods=["get", "generate"], - async_methods={"generate"} - ) - """ + """Create ActorProxy from ActorRef.""" return cls(actor_ref, methods, async_methods) class _MethodCaller: - """Method caller: executes remote method calls""" + """Method caller.""" def __init__(self, actor_ref: ActorRef, method_name: str, is_async: bool = False): self._ref = actor_ref @@ -168,14 +111,12 @@ def __init__(self, actor_ref: ActorRef, method_name: str, is_async: bool = False def __call__(self, *args, **kwargs): if self._is_async: - # Return an object that can be awaited or async iterated return _AsyncMethodCall(self._ref, self._method, args, kwargs) else: - # Return a coroutine for synchronous methods return self._sync_call(*args, **kwargs) async def _sync_call(self, *args, **kwargs) -> Any: - 
"""Synchronous method call""" + """Synchronous method call.""" call_msg = { "__call__": self._method, "args": args, @@ -184,16 +125,13 @@ async def _sync_call(self, *args, **kwargs) -> Any: } resp = await self._ref.ask(call_msg) - # Handle normal response if isinstance(resp, dict): if "__error__" in resp: raise RuntimeError(resp["__error__"]) return resp.get("__result__") elif isinstance(resp, Message): - # Check if it's a stream message (generator returned) if resp.is_stream: return _SyncGeneratorStreamReader(resp) - # Fallback for Rust actor communication data = resp.to_json() if resp.msg_type == "Error": raise RuntimeError(data.get("error", "Remote call failed")) diff --git a/python/pulsing/actors/router.py b/python/pulsing/actors/router.py index 551e326ff..991192750 100644 --- a/python/pulsing/actors/router.py +++ b/python/pulsing/actors/router.py @@ -1,4 +1,4 @@ -"""Router - OpenAI-compatible HTTP API router""" +"""OpenAI-compatible HTTP API router.""" import asyncio import json @@ -52,7 +52,7 @@ def from_dict(cls, data: dict) -> "CompletionRequest": class _OpenAIHandler: - """OpenAI-compatible HTTP request handler""" + """OpenAI-compatible HTTP request handler.""" def __init__(self, actor_system: ActorSystem, model_name: str, scheduler): self._actor_system = actor_system @@ -69,10 +69,8 @@ async def index(self, request: web.Request) -> web.Response: ) async def health_check(self, request: web.Request) -> web.Response: - # Compatible with different types of schedulers if hasattr(self._scheduler, "get_worker_count"): count = self._scheduler.get_worker_count() - # If coroutine then await if hasattr(count, "__await__"): total_workers = await count else: @@ -83,7 +81,6 @@ async def health_check(self, request: web.Request) -> web.Response: if hasattr(self._scheduler, "get_healthy_worker_count"): healthy_workers = await self._scheduler.get_healthy_worker_count() elif hasattr(self._scheduler, "get_all_loads"): - # StreamLoadScheduler: use get_all_loads to calculate 
healthy count healthy_workers = len(self._scheduler.get_all_loads()) else: healthy_workers = total_workers diff --git a/python/pulsing/agent/runtime.py b/python/pulsing/agent/runtime.py index 36be82dd6..9a84123b6 100644 --- a/python/pulsing/agent/runtime.py +++ b/python/pulsing/agent/runtime.py @@ -1,4 +1,4 @@ -"""Actor system lifecycle management""" +"""Actor system lifecycle management.""" from __future__ import annotations @@ -14,20 +14,7 @@ async def runtime( seeds: list[str] | None = None, passphrase: str | None = None, ): - """ - Actor system runtime context manager. - - Example: - from pulsing.agent import runtime - - async with runtime(): - agent = await MyAgent.spawn(name="agent") - result = await agent.run() - - # Distributed mode - async with runtime(addr="0.0.0.0:8001", seeds=["node1:8001"]): - ... - """ + """Actor system runtime context manager.""" await init(addr=addr, seeds=seeds, passphrase=passphrase) try: yield get_system() diff --git a/python/pulsing/topic/broker.py b/python/pulsing/topic/broker.py index b280f8a22..31ac886ac 100644 --- a/python/pulsing/topic/broker.py +++ b/python/pulsing/topic/broker.py @@ -1,8 +1,4 @@ -"""TopicBroker - Lightweight Pub/Sub Broker Actor (internal implementation) - -TopicBroker lifecycle is managed by StorageManager in queue/manager.py, -uses consistent hashing to ensure only one broker per topic in the cluster. 
-""" +"""Topic broker (internal).""" from __future__ import annotations @@ -19,37 +15,28 @@ logger = logging.getLogger(__name__) -# Subscriber lifecycle management configuration -MAX_CONSECUTIVE_FAILURES = 3 # Consecutive failure threshold, evict after exceeding -REF_TTL_SECONDS = 60.0 # ActorRef cache TTL, re-resolve after expiration - -# Timeout configuration (approach 2+3: timeout + idempotency) -DEFAULT_FANOUT_TIMEOUT = 30.0 # Default timeout for wait_any_ack / wait_all_acks +MAX_CONSECUTIVE_FAILURES = 3 +REF_TTL_SECONDS = 60.0 +DEFAULT_FANOUT_TIMEOUT = 30.0 @dataclass class _Subscriber: - """Subscriber information (internal use)""" + """Subscriber information.""" subscriber_id: str actor_name: str node_id: int | None = None subscribed_at: float = field(default_factory=time.time) _ref: "ActorRef | None" = field(default=None, repr=False) - _ref_resolved_at: float = 0 # ActorRef resolution time, for TTL judgment + _ref_resolved_at: float = 0 messages_delivered: int = 0 messages_failed: int = 0 - consecutive_failures: int = 0 # Consecutive failures, for eviction judgment + consecutive_failures: int = 0 class TopicBroker(Actor): - """Topic Broker Actor (internal implementation) - - Each topic corresponds to one broker, responsible for: - 1. Managing subscriber list - 2. Receiving publish requests and distributing to subscribers - 3. 
Deciding whether to wait for ack based on publish mode - """ + """Topic broker actor.""" def __init__(self, topic: str, system: "ActorSystem"): self.topic = topic @@ -133,20 +120,11 @@ async def _unsubscribe(self, data: dict) -> Message: return Message.from_json("UnsubscribeResult", {"success": False}) async def _resolve(self, sub: _Subscriber) -> "ActorRef | None": - """Resolve subscriber ActorRef (with TTL cache) - - Cache strategy: - - Cache ActorRef after first resolution - - Re-resolve after TTL expiration to detect node failures - - Clear cache on resolution failure, retry next time - """ now = time.time() - # Check if cache is valid (exists and not expired) if sub._ref is not None and (now - sub._ref_resolved_at) < REF_TTL_SECONDS: return sub._ref - # TTL expired or no cache, re-resolve try: sub._ref = await self.system.resolve_named( sub.actor_name, node_id=sub.node_id @@ -155,7 +133,7 @@ async def _resolve(self, sub: _Subscriber) -> "ActorRef | None": return sub._ref except Exception as e: logger.warning(f"Failed to resolve {sub.subscriber_id}: {e}") - sub._ref = None # Clear invalid cache + sub._ref = None sub._ref_resolved_at = 0 return None @@ -191,22 +169,15 @@ async def _publish(self, data: dict) -> Message: return Message.from_json("Error", {"error": f"Unknown mode: {mode}"}) def _record_success(self, sub: _Subscriber) -> None: - """Record delivery success, reset consecutive failure count""" sub.messages_delivered += 1 sub.consecutive_failures = 0 def _record_failure(self, sub: _Subscriber) -> bool: - """Record delivery failure, return whether should evict - - Returns: - True if consecutive failures exceed threshold, should evict - """ sub.messages_failed += 1 sub.consecutive_failures += 1 return sub.consecutive_failures >= MAX_CONSECUTIVE_FAILURES async def _evict_zombies(self, zombie_ids: list[str]) -> None: - """Evict zombie subscribers""" if not zombie_ids: return async with self._lock: @@ -218,7 +189,6 @@ async def _evict_zombies(self, 
zombie_ids: list[str]) -> None: ) async def _fanout_tell(self, envelope: dict, sender_id: str | None) -> Message: - """Fire-and-forget: tell without waiting for response""" sent = 0 failed = 0 zombies: list[str] = [] @@ -241,7 +211,6 @@ async def _fanout_tell(self, envelope: dict, sender_id: str | None) -> Message: if self._record_failure(sub): zombies.append(sub_id) - # Evict zombie subscribers await self._evict_zombies(zombies) self._total_delivered += sent @@ -264,23 +233,10 @@ async def _fanout_ask( wait_all: bool, timeout: float = DEFAULT_FANOUT_TIMEOUT, ) -> Message: - """Wait for ack mode - - Args: - envelope: Message envelope - sender_id: Sender ID (to exclude self) - wait_all: True=wait for all responses, False=wait for any response (wait_any_ack) - timeout: Timeout in seconds. Local task will be cancelled after timeout, - remote handler may still be executing (relies on HTTP/2 RST_STREAM propagation). - - Note (cancellation semantics - approach 2+3): - - Local cancellation: via asyncio.wait timeout or task.cancel() - - Remote cancellation: relies on HTTP/2 RST_STREAM auto-propagation (triggered when body read is interrupted) - - Idempotency: Handler should implement idempotent operations to ensure repeated requests don't produce side effects - """ + """Wait for ack mode.""" tasks = [] sub_ids = [] - resolve_failed: list[str] = [] # Subscribers that failed to resolve + resolve_failed: list[str] = [] for sub_id, sub in list(self._subscribers.items()): if sender_id and sub_id == sender_id: @@ -290,7 +246,6 @@ async def _fanout_ask( tasks.append(ref.ask(envelope)) sub_ids.append(sub_id) else: - # Resolve failures also count as failures if self._record_failure(sub): resolve_failed.append(sub_id) From 18f76e29821742eba35a464bf662c72f3b94e72b Mon Sep 17 00:00:00 2001 From: Reiase Date: Sat, 24 Jan 2026 23:16:33 +0800 Subject: [PATCH 4/4] Update documentation to enhance clarity and consistency across Pulsing framework - Revised descriptions in `README.md`, 
`README.zh.md`, and `llms.binding.md` to emphasize Pulsing as a distributed actor framework with specialized support for AI applications. - Improved site description in `mkdocs.yml` for better alignment with the framework's capabilities. - Added a comprehensive FAQ section in `faq.md` and its Chinese counterpart `faq.zh.md`, addressing common user questions and installation issues. - Enhanced navigation structure in `mkdocs.yml` for better accessibility to documentation resources. - Updated API overview documentation to reflect recent changes and improve user understanding of the framework's functionalities. --- README.md | 2 +- README.zh.md | 2 +- docs/mkdocs.yml | 68 +++--- docs/src/api/overview.md | 2 +- docs/src/api/overview.zh.md | 4 +- docs/src/api/python.md | 2 +- docs/src/api/python.zh.md | 2 +- docs/src/api/rust.md | 2 +- docs/src/api/rust.zh.md | 2 +- docs/src/design/actor-addressing.md | 86 +++----- docs/src/design/architecture.md | 98 +++++---- docs/src/design/architecture.zh.md | 98 +++++---- docs/src/faq.md | 329 ++++++++++++++++++++++++++++ docs/src/faq.zh.md | 329 ++++++++++++++++++++++++++++ docs/src/index.md | 8 +- docs/src/index.zh.md | 4 +- llms.binding.md | 2 +- 17 files changed, 852 insertions(+), 188 deletions(-) create mode 100644 docs/src/faq.md create mode 100644 docs/src/faq.zh.md diff --git a/README.md b/README.md index 3dc5d6637..c6675a3ee 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ **[䞭文文档](README.zh.md)** -**Lightweight distributed framework designed for high-performance AI applications.** +**Pulsing is a distributed actor framework that provides a communication backbone for building distributed systems, with specialized support for AI applications.** 🚀 **Zero Dependencies** — Pure Rust + Tokio, no NATS/etcd/Redis diff --git a/README.zh.md b/README.zh.md index 0155f1f98..6611b76ea 100644 --- a/README.zh.md +++ b/README.zh.md @@ -7,7 +7,7 @@ **[English](README.md)** -**蜻量级分垃匏框架䞓䞺高性胜 AI 应甚讟计。** +**Pulsing 是䞀䞪分垃匏 actor 
框架䞺构建分垃匏系统提䟛通信骚干并䞺 AI 应甚提䟛䞓闚支持。** 🚀 **零倖郚䟝赖** — 纯 Rust + Tokio无需 NATS/etcd/Redis diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 5a6facef5..6ac7803ae 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,5 +1,5 @@ site_name: Pulsing -site_description: A lightweight distributed Actor framework for building scalable systems +site_description: Pulsing is a distributed actor framework that provides a communication backbone for building distributed systems, with specialized support for AI applications. # Docs may be hosted elsewhere, but repo is the canonical entry point. site_url: https://github.com/DeepLink-org/pulsing docs_dir: src @@ -64,7 +64,8 @@ markdown_extensions: permalink: true plugins: - - search + - search: + separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' - i18n: docs_structure: suffix fallback_to_default: true @@ -82,28 +83,31 @@ plugins: site_name: "Pulsing 文档" nav_translations: Home: 銖页 - Getting Started: 快速匀始 + Home: 銖页 + Getting Started: 匀始䜿甚 + Overview: 抂述 LLM Inference: LLM 掚理 Distributed Agents: 分垃匏 Agent Migrate from Ray: 从 Ray 迁移 - Guide: 甚户指南 + User Guide: 甚户指南 + Guide: 指南 Actors: Actor 指南 Remote Actors: 远皋 Actor - Security: 安党 - Reliability: 可靠性 Operations: CLI 运绎 - Distributed Queue: 分垃匏内存队列 - Python API Contract: Python API 契纊 + Reliability: 可靠性 + Security: 安党 + Distributed Queue: 分垃匏队列 Semantics: 语义䞎保证 Style Guide: 术语䞎风栌 - Agent: Agent 框架 - Overview: 抂述 - Pulsing Native: Pulsing 原生 - AutoGen: AutoGen - LangGraph: LangGraph + Agent Framework: Agent 框架 Examples: 瀺䟋 Ping-Pong: Ping-Pong Distributed Counter: 分垃匏计数噚 + API Reference: API 参考 + Overview: API 抂述 + Python: Python API + Rust: Rust API + FAQ: 垞见问题 Design: 讟计文档 Architecture: 架构抂览 Actor System: Actor 系统 @@ -113,11 +117,6 @@ plugins: HTTP2 Transport: HTTP2 䌠蟓 Load Sync: 莟蜜同步 AS Actor Decorator: AS Actor 装饰噚 - API Reference: API 参考 - Overview: 抂述 - Python API: Python API - Overview: API 抂述 - Python API: Python API - mkdocstrings: handlers: python: 
@@ -136,12 +135,12 @@ plugins: nav: - Home: index.md - Getting Started: - - quickstart/index.md + - Overview: quickstart/index.md - LLM Inference: quickstart/llm_inference.md - Distributed Agents: quickstart/agent.md - Migrate from Ray: quickstart/migrate_from_ray.md - - Guide: - - guide/index.md + - User Guide: + - Guide: guide/index.md - Actors: guide/actors.md - Remote Actors: guide/remote_actors.md - Operations: guide/operations.md @@ -150,15 +149,20 @@ nav: - Distributed Queue: guide/queue.md - Semantics: guide/semantics.md - Style Guide: guide/style.md - - Agent: - - agent/index.md - - Pulsing Native: agent/native.md - - AutoGen: agent/autogen.md - - LangGraph: agent/langgraph.md - - Examples: - - examples/index.md - - Ping-Pong: examples/ping_pong.md - - Distributed Counter: examples/distributed_counter.md + - Agent Framework: + - Overview: agent/index.md + - Pulsing Native: agent/native.md + - AutoGen: agent/autogen.md + - LangGraph: agent/langgraph.md + - Examples: + - Overview: examples/index.md + - Ping-Pong: examples/ping_pong.md + - Distributed Counter: examples/distributed_counter.md + - API Reference: + - Overview: api/overview.md + - Python: api/python.md + - Rust: api/rust.md + - FAQ: faq.md - Design: - Architecture: design/architecture.md - Actor System: design/actor-system.md @@ -168,10 +172,6 @@ nav: - HTTP2 Transport: design/http2-transport.md - Load Sync: design/load_sync.md - AS Actor Decorator: design/as-actor-decorator.md - - API Reference: - - Overview: api/overview.md - - Python: api/python.md - - Rust: api/rust.md extra: generator: false diff --git a/docs/src/api/overview.md b/docs/src/api/overview.md index d202c863c..16ce1b1f5 100644 --- a/docs/src/api/overview.md +++ b/docs/src/api/overview.md @@ -299,4 +299,4 @@ system = await pul.actor_system( - **[Python API Reference](python.md)**: Complete Python API documentation - **[Rust API Reference](rust.md)**: Complete Rust API documentation - **[Examples](../examples/)**: Working code 
examples -- **[Guide](../guide/)**: In-depth guides and tutorials \ No newline at end of file +- **[Guide](../guide/)**: In-depth guides and tutorials diff --git a/docs/src/api/overview.zh.md b/docs/src/api/overview.zh.md index 9aa863fac..53f1dc2bd 100644 --- a/docs/src/api/overview.zh.md +++ b/docs/src/api/overview.zh.md @@ -1,6 +1,6 @@ # API 抂述 -Pulsing 是䞀䞪分垃匏 actor 框架可以䜜䞺任意分垃匏系统的通信骚干以方䟿快速搭建分垃匏系统和应甚。 +Pulsing 是䞀䞪分垃匏 actor 框架䞺构建分垃匏系统提䟛通信骚干并䞺 AI 应甚提䟛䞓闚支持。 ## 栞心抂念 @@ -296,4 +296,4 @@ system = await pul.actor_system( - **[Python API](python.md)**: Python 接口完敎文档 - **[Rust API](rust.md)**: Rust 接口完敎文档 - **[瀺䟋](../../examples/)**: 工䜜代码瀺䟋 -- **[指南](../../guide/)**: 深入指南和教皋 \ No newline at end of file +- **[指南](../../guide/)**: 深入指南和教皋 diff --git a/docs/src/api/python.md b/docs/src/api/python.md index b8d534bea..222d2480d 100644 --- a/docs/src/api/python.md +++ b/docs/src/api/python.md @@ -36,4 +36,4 @@ pip install -e . ## Queue Module -::: pulsing.queue \ No newline at end of file +::: pulsing.queue diff --git a/docs/src/api/python.zh.md b/docs/src/api/python.zh.md index dbc81bb4c..92a86aa1b 100644 --- a/docs/src/api/python.zh.md +++ b/docs/src/api/python.zh.md @@ -36,4 +36,4 @@ pip install -e . 
## 队列暡块 -::: pulsing.queue \ No newline at end of file +::: pulsing.queue diff --git a/docs/src/api/rust.md b/docs/src/api/rust.md index ca14ce2da..4e66c70a0 100644 --- a/docs/src/api/rust.md +++ b/docs/src/api/rust.md @@ -391,4 +391,4 @@ async fn main() -> anyhow::Result<()> { - **[Python API](python.md)**: Python interface documentation - **[Examples](../../examples/)**: Working Rust examples -- **[Design Documents](../../design/)**: Architecture and design decisions \ No newline at end of file +- **[Design Documents](../../design/)**: Architecture and design decisions diff --git a/docs/src/api/rust.zh.md b/docs/src/api/rust.zh.md index d1ca1da38..0d0f14ea5 100644 --- a/docs/src/api/rust.zh.md +++ b/docs/src/api/rust.zh.md @@ -391,4 +391,4 @@ async fn main() -> anyhow::Result<()> { - **[Python API](python.md)**: Python 接口文档 - **[瀺䟋](../../examples/)**: Rust 瀺䟋代码 -- **[讟计文档](../../design/)**: 架构和讟计决策 \ No newline at end of file +- **[讟计文档](../../design/)**: 架构和讟计决策 diff --git a/docs/src/design/actor-addressing.md b/docs/src/design/actor-addressing.md index 5b52f1d3a..169b4a208 100644 --- a/docs/src/design/actor-addressing.md +++ b/docs/src/design/actor-addressing.md @@ -156,20 +156,20 @@ actor:///workers/inference/gpu/pool # 3 级 ### 抂念暡型 -``` -┌──────────────────────────────────────────────────────────────────────────┐ -│ │ -│ 具名 Actor: actor:///services/api │ -│ │ -│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ -│ │ Instance │ │ Instance │ │ Instance │ │ -│ │ @node_a │ │ @node_b │ │ @node_c │ │ -│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ -│ │ -│ 访问 actor:///services/api 时自劚莟蜜均衡选择实䟋 │ -│ 访问 actor:///services/api@node_b 时盎接路由到 node_b │ -│ │ -└──────────────────────────────────────────────────────────────────────────┘ +```mermaid +graph TB + subgraph NamedActor["具名 Actor: actor:///services/api"] + subgraph Instances["实䟋"] + I1["Instance
@node_a"] + I2["Instance
@node_b"] + I3["Instance
@node_c"] + end + end + + Note["访问 actor:///services/api 时自劚莟蜜均衡选择实䟋
访问 actor:///services/api@node_b 时盎接路由到 node_b"] + + style NamedActor fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + style Instances fill:#fff3e0,stroke:#f57c00 ``` ### 实䟋泚册 @@ -207,42 +207,28 @@ let resp = system.ask(&addr, request).await?; // 可胜路由到 A、B 或 C ### 解析流皋 -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ Address Resolution │ -├────────────────────────────────────────────────────────────────────────────── -│ │ -│ actor:///services/api │ -│ │ │ -│ ├──→ 查询 Gossip Registry │ -│ │ │ │ -│ │ ↓ │ -│ │ instances: [node_a, node_b, node_c] │ -│ │ │ │ -│ │ ↓ (莟蜜均衡选择) │ -│ │ selected: node_b │ -│ │ │ │ -│ └────────→ http://node_b_ip:port/named/services/api │ -│ │ -│ actor:///services/api@node_a │ -│ │ │ -│ ├──→ 查询 node_a 地址 │ -│ │ │ │ -│ └────────→ http://node_a_ip:port/named/services/api │ -│ │ -│ actor://node_a/worker_123 │ -│ │ │ -│ ├──→ 查询 node_a 地址 │ -│ │ │ │ -│ └────────→ http://node_a_ip:port/actors/worker_123 │ -│ │ -│ actor://localhost/worker_123 │ -│ │ │ -│ ├──→ 替换 localhost → current_node_id │ -│ │ │ │ -│ └────────→ 本地盎接调甚䞍走眑络 │ -│ │ -└─────────────────────────────────────────────────────────────────────────────┘ +```mermaid +flowchart TD + A["actor:///services/api"] --> B["查询 Gossip Registry"] + B --> C["instances: [node_a, node_b, node_c]"] + C --> D["莟蜜均衡选择"] + D --> E["selected: node_b"] + E --> F["http://node_b_ip:port/named/services/api"] + + G["actor:///services/api@node_a"] --> H["查询 node_a 地址"] + H --> I["http://node_a_ip:port/named/services/api"] + + J["actor://node_a/worker_123"] --> K["查询 node_a 地址"] + K --> L["http://node_a_ip:port/actors/worker_123"] + + M["actor://localhost/worker_123"] --> N["替换 localhost → current_node_id"] + N --> O["本地盎接调甚䞍走眑络"] + + style A fill:#e3f2fd,stroke:#1976d2 + style F fill:#c8e6c9,stroke:#388e3c + style I fill:#c8e6c9,stroke:#388e3c + style L fill:#c8e6c9,stroke:#388e3c + style O fill:#fff3e0,stroke:#f57c00 ``` ### HTTP 映射 diff --git a/docs/src/design/architecture.md 
b/docs/src/design/architecture.md index 0175850af..7cf14b90d 100644 --- a/docs/src/design/architecture.md +++ b/docs/src/design/architecture.md @@ -4,30 +4,34 @@ Overview of Pulsing Actor System architecture. ## System Components -``` -┌─────────────────────────────────────────────────────────────────┐ -│ ActorSystem │ -├────────────────────────────────────────────────────────────────── -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ Actor A │ │ Actor B │ │ Actor C │ Local │ -│ │ (Mailbox) │ │ (Mailbox) │ │ (Mailbox) │ Actors │ -│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ -│ │ │ │ │ -│ └────────────────┌────────────────┘ │ -│ │ │ -│ ┌───────────────────────┮───────────────────────┐ │ -│ │ HTTP Transport │ │ -│ │ POST /actor/{name} - Actor Messages │ │ -│ │ POST /cluster/gossip - Cluster Protocol │ │ -│ └───────────────────────┬───────────────────────┘ │ -│ │ │ -│ ┌───────────────────────┮───────────────────────┐ │ -│ │ GossipCluster │ │ -│ │ - 成员发现 (Membership) │ │ -│ │ - Actor 䜍眮 (Actor Registry) │ │ -│ │ - 故障检测 (SWIM) │ │ -│ └───────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ +```mermaid +graph TB + subgraph ActorSystem["ActorSystem"] + subgraph LocalActors["Local Actors"] + A["Actor A
(Mailbox)"] + B["Actor B
(Mailbox)"] + C["Actor C
(Mailbox)"] + end + + subgraph Transport["HTTP Transport"] + T1["POST /actor/{name}
Actor Messages"] + T2["POST /cluster/gossip
Cluster Protocol"] + end + + subgraph Cluster["GossipCluster"] + M["成员发现
(Membership)"] + R["Actor 䜍眮
(Actor Registry)"] + S["故障检测
(SWIM)"] + end + end + + A & B & C --> Transport + Transport --> Cluster + + style ActorSystem fill:#f5f5f5,stroke:#333,stroke-width:2px + style LocalActors fill:#e3f2fd,stroke:#1976d2 + style Transport fill:#fff3e0,stroke:#f57c00 + style Cluster fill:#e8f5e9,stroke:#388e3c ``` ## Key Concepts @@ -64,30 +68,36 @@ The cluster provides: ### Local Message -``` -Sender Mailbox Actor - │ │ │ - │── ask(Ping) ────────────→│ │ - │ │── recv() ─────────────→│ - │ │ │── handle() - │ │←─ respond(Pong) ───────│ - │←─ Pong ──────────────────│ │ +```mermaid +sequenceDiagram + participant S as Sender + participant M as Mailbox + participant A as Actor + + S->>M: ask(Ping) + M->>A: recv() + A->>A: handle() + A-->>M: respond(Pong) + M-->>S: Pong ``` ### Remote Message -``` -Node A Network Node B - │ │ │ - │ │ │ -Sender ─→ ActorRef(Remote) │ Actor - │ │ │ │ - │ │── HTTP POST ──────→│── /actor/{name} ─────────────→│ - │ │ {msg_type, │ Envelope │ - │ │ payload} │ │── handle() - │ │ │ │ - │ │←─ HTTP Response ───│←─ {result} ──────────────────│ - │←─ Pong ────│ │ │ +```mermaid +sequenceDiagram + participant S as Sender + participant R as ActorRef(Remote) + participant N as Network + participant A as Actor (Node B) + + S->>R: ask(Ping) + R->>N: HTTP POST /actor/{name} + Note over R,N: {msg_type, payload} + N->>A: Envelope + A->>A: handle() + A-->>N: {result} + N-->>R: HTTP Response + R-->>S: Pong ``` ## Design Principles diff --git a/docs/src/design/architecture.zh.md b/docs/src/design/architecture.zh.md index ff4e48187..025d46c73 100644 --- a/docs/src/design/architecture.zh.md +++ b/docs/src/design/architecture.zh.md @@ -4,30 +4,34 @@ Pulsing Actor 系统架构抂览。 ## 系统组件 -``` -┌─────────────────────────────────────────────────────────────────┐ -│ ActorSystem │ -├────────────────────────────────────────────────────────────────── -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ Actor A │ │ Actor B │ │ Actor C │ Local │ -│ │ (Mailbox) │ │ (Mailbox) │ │ (Mailbox) │ Actors │ -│ 
└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ -│ │ │ │ │ -│ └────────────────┌────────────────┘ │ -│ │ │ -│ ┌───────────────────────┮───────────────────────┐ │ -│ │ HTTP Transport │ │ -│ │ POST /actor/{name} - Actor Messages │ │ -│ │ POST /cluster/gossip - Cluster Protocol │ │ -│ └───────────────────────┬───────────────────────┘ │ -│ │ │ -│ ┌───────────────────────┮───────────────────────┐ │ -│ │ GossipCluster │ │ -│ │ - 成员发现 (Membership) │ │ -│ │ - Actor 䜍眮 (Actor Registry) │ │ -│ │ - 故障检测 (SWIM) │ │ -│ └───────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ +```mermaid +graph TB + subgraph ActorSystem["ActorSystem"] + subgraph LocalActors["Local Actors"] + A["Actor A
(Mailbox)"] + B["Actor B
(Mailbox)"] + C["Actor C
(Mailbox)"] + end + + subgraph Transport["HTTP Transport"] + T1["POST /actor/{name}
Actor Messages"] + T2["POST /cluster/gossip
Cluster Protocol"] + end + + subgraph Cluster["GossipCluster"] + M["成员发现
(Membership)"] + R["Actor 䜍眮
(Actor Registry)"] + S["故障检测
(SWIM)"] + end + end + + A & B & C --> Transport + Transport --> Cluster + + style ActorSystem fill:#f5f5f5,stroke:#333,stroke-width:2px + style LocalActors fill:#e3f2fd,stroke:#1976d2 + style Transport fill:#fff3e0,stroke:#f57c00 + style Cluster fill:#e8f5e9,stroke:#388e3c ``` ## 栞心抂念 @@ -64,30 +68,36 @@ ActorRef 提䟛䜍眮透明性 ### 本地消息 -``` -Sender Mailbox Actor - │ │ │ - │── ask(Ping) ────────────→│ │ - │ │── recv() ─────────────→│ - │ │ │── handle() - │ │←─ respond(Pong) ───────│ - │←─ Pong ──────────────────│ │ +```mermaid +sequenceDiagram + participant S as Sender + participant M as Mailbox + participant A as Actor + + S->>M: ask(Ping) + M->>A: recv() + A->>A: handle() + A-->>M: respond(Pong) + M-->>S: Pong ``` ### 远皋消息 -``` -Node A Network Node B - │ │ │ - │ │ │ -Sender ─→ ActorRef(Remote) │ Actor - │ │ │ │ - │ │── HTTP POST ──────→│── /actor/{name} ─────────────→│ - │ │ {msg_type, │ Envelope │ - │ │ payload} │ │── handle() - │ │ │ │ - │ │←─ HTTP Response ───│←─ {result} ──────────────────│ - │←─ Pong ────│ │ │ +```mermaid +sequenceDiagram + participant S as Sender + participant R as ActorRef(Remote) + participant N as Network + participant A as Actor (Node B) + + S->>R: ask(Ping) + R->>N: HTTP POST /actor/{name} + Note over R,N: {msg_type, payload} + N->>A: Envelope + A->>A: handle() + A-->>N: {result} + N-->>R: HTTP Response + R-->>S: Pong ``` ## 讟计原则 diff --git a/docs/src/faq.md b/docs/src/faq.md new file mode 100644 index 000000000..a043e10c4 --- /dev/null +++ b/docs/src/faq.md @@ -0,0 +1,329 @@ +# Frequently Asked Questions + +This page addresses common questions and issues users encounter when working with Pulsing. + +## General Questions + +### What is Pulsing? + +Pulsing is a distributed actor framework that provides a communication backbone for building distributed systems, with specialized support for AI applications. + +### How does Pulsing differ from Ray? 
+ +While Ray focuses on general distributed computing with task-based parallelism, Pulsing specializes in the Actor model with: + +- **Location transparency**: Same API for local and remote actors +- **True actor semantics**: Actors process messages one at a time +- **Zero external dependencies**: Pure Rust + Tokio implementation +- **Streaming support**: Native support for streaming responses + +### When should I use Pulsing vs Ray? + +Choose Pulsing if you need: + +- Actor-based programming with location transparency +- Streaming responses (LLM applications) +- Minimal operational complexity (no external services) +- High-performance actor communication + +Choose Ray if you need: + +- General distributed computing tasks +- Complex dependency management +- Integration with existing Ray ecosystem + +## Installation Issues + +### ImportError: No module named 'pulsing' + +**Problem**: Pulsing package is not installed or not in Python path. + +**Solutions**: + +1. **Install Pulsing**: + ```bash + pip install pulsing + ``` + +2. **For development**: + ```bash + git clone https://github.com/DeepLink-org/pulsing + cd pulsing + pip install -e . + ``` + +3. **Check Python path**: + ```python + import sys + print(sys.path) + ``` + +### Build failures on macOS/Linux + +**Problem**: Rust compilation issues. + +**Solutions**: + +1. **Install Rust**: + ```bash + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh + source ~/.cargo/env + ``` + +2. **Install system dependencies** (Ubuntu/Debian): + ```bash + sudo apt-get install build-essential pkg-config libssl-dev + ``` + +3. **Install system dependencies** (macOS): + ```bash + brew install openssl pkg-config + ``` + +## Runtime Issues + +### Actor not responding to messages + +**Problem**: Actor appears to be stuck or not processing messages. + +**Possible causes**: + +1. **Blocking operations**: Actor is blocked on synchronous I/O +2. **Infinite loop**: Actor code contains an infinite loop +3. 
**Deadlock**: Actor is waiting for a message that will never arrive + +**Solutions**: + +```python +# ❌ Bad: Blocking I/O in actor +@pul.remote +class BadActor: + def process(self, url): + response = requests.get(url) # Blocks the actor! + return response.text + +# ✅ Good: Use async I/O +@pul.remote +class GoodActor: + async def process(self, url): + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + return await response.text() +``` + +### Connection refused errors + +**Problem**: Cannot connect to remote actors. + +**Possible causes**: + +1. **Wrong address**: Actor system listening on different address +2. **Firewall**: Network traffic blocked +3. **TLS issues**: Certificate validation failures + +**Solutions**: + +1. **Check actor system address**: + ```python + # Make sure addresses match + system1 = await pul.actor_system(addr="0.0.0.0:8000") + system2 = await pul.actor_system(addr="0.0.0.0:8001", seeds=["127.0.0.1:8000"]) + ``` + +2. **Disable TLS for testing**: + ```python + # For development only + system = await pul.actor_system(addr="0.0.0.0:8000", passphrase=None) + ``` + +### Memory leaks + +**Problem**: Memory usage grows over time. + +**Possible causes**: + +1. **Message accumulation**: Messages not being processed fast enough +2. **Large message payloads**: Messages containing large data structures +3. **Actor leaks**: Actors not being properly cleaned up + +**Solutions**: + +1. **Monitor mailbox size**: + ```python + # Check actor mailbox size + mailbox_size = await system.get_mailbox_size("actor_name") + ``` + +2. **Use streaming for large data**: + ```python + @pul.remote + class StreamingActor: + async def process_large_data(self, data_stream): + async for chunk in data_stream: + # Process chunk by chunk + yield self.process_chunk(chunk) + ``` + +## Performance Issues + +### High latency + +**Problem**: Message round-trip takes too long. + +**Optimizations**: + +1. 
**Use local actors when possible**: + ```python + # Local actor (fast) + local_actor = await MyActor.spawn() + + # Remote actor (slower) + remote_actor = await MyActor.resolve("remote_actor") + ``` + +2. **Batch messages**: + ```python + # Instead of multiple calls + results = [] + for item in items: + result = await actor.process(item) + results.append(result) + + # Batch processing + results = await actor.process_batch(items) + ``` + +3. **Use tell() for fire-and-forget**: + ```python + # Don't wait for response if not needed + await actor.log_event(event_data) # Uses ask() internally + await actor.tell({"action": "log", "data": event_data}) # Fire-and-forget + ``` + +### Serialization overhead + +**Problem**: Message serialization is slow. + +**Solutions**: + +1. **Use efficient data formats**: + ```python + # ✅ Good: Use simple types + await actor.process({"numbers": [1, 2, 3], "text": "hello"}) + + # ❌ Bad: Complex nested objects + await actor.process({"data": very_complex_nested_object}) + ``` + +2. **Avoid sending large payloads**: + ```python + # Send references instead of data + await actor.process_data(data_id) # Send ID, not the data itself + ``` + +## Deployment Issues + +### Clustering not working + +**Problem**: Multiple nodes cannot discover each other. + +**Solutions**: + +1. **Check seed node configuration**: + ```python + # Node 1 (seed) + system1 = await pul.actor_system(addr="192.168.1.100:8000") + + # Node 2 (join cluster) + system2 = await pul.actor_system( + addr="192.168.1.101:8000", + seeds=["192.168.1.100:8000"] + ) + ``` + +2. **Verify network connectivity**: + ```bash + # Test if ports are open + telnet 192.168.1.100 8000 + ``` + +3. **Check firewall settings**: + ```bash + # Linux + sudo ufw status + sudo ufw allow 8000 + + # macOS + sudo pfctl -s rules + ``` + +### Load balancing issues + +**Problem**: Requests not distributed evenly across cluster. + +**Solutions**: + +1. 
**Use round-robin resolution**: + ```python + # Default behavior distributes across instances + actor = await MyActor.resolve("service_name") + ``` + +2. **Check actor distribution**: + ```python + # Monitor cluster membership + members = await system.members() + print(f"Cluster has {len(members)} nodes") + ``` + +## Migration Issues + +### Migrating from Ray + +**Common issues**: + +1. **API differences**: + ```python + # Ray + @ray.remote + class MyActor: + def __init__(self, value): + self.value = value + + actor = MyActor.remote(42) + result = ray.get(actor.method.remote()) + + # Pulsing + @pul.remote + class MyActor: + def __init__(self, value): + self.value = value + + actor = await MyActor.spawn(value=42) + result = await actor.method() + ``` + +2. **Async/await everywhere**: + ```python + # Pulsing requires async/await + async def main(): + await pul.init() + actor = await MyActor.spawn() + result = await actor.method() + await pul.shutdown() + + asyncio.run(main()) + ``` + +## Getting Help + +If you can't find the answer here: + +1. **Check the documentation**: [User Guide](guide/index.md) and [API Reference](api/overview.md) +2. **Search existing issues**: [GitHub Issues](https://github.com/DeepLink-org/pulsing/issues) +3. **Ask the community**: [GitHub Discussions](https://github.com/DeepLink-org/pulsing/discussions) +4. **File a bug report**: If you found a bug, please [open an issue](https://github.com/DeepLink-org/pulsing/issues/new) + +## Contributing + +Found an issue with this FAQ?
[Help improve it!](https://github.com/DeepLink-org/pulsing/blob/main/docs/src/faq.md) \ No newline at end of file diff --git a/docs/src/faq.zh.md b/docs/src/faq.zh.md new file mode 100644 index 000000000..1710a35fb --- /dev/null +++ b/docs/src/faq.zh.md @@ -0,0 +1,329 @@ +# 垞见问题解答 + +歀页面解答甚户圚䜿甚 Pulsing 时遇到的垞见疑问和问题。 + +## 䞀般问题 + +### 什么是 Pulsing + +Pulsing 是䞀䞪分垃匏 actor 框架䞺构建分垃匏系统提䟛通信骚干并䞺 AI 应甚提䟛䞓闚支持。 + +### Pulsing 侎 Ray 有䜕区别 + +Ray 䞓泚于通甚分垃匏计算和基于任务的并行性而 Pulsing 䞓闚针对 Actor 暡型 + +- **䜍眮透明性**本地和远皋 actor 䜿甚盞同 API +- **真正的 actor 语义**Actor 䞀次倄理䞀条消息 +- **零倖郚䟝赖**纯 Rust + Tokio 实现 +- **流匏支持**原生支持流匏响应 + +### 䜕时应该䜿甚 Pulsing 而䞍是 Ray + +圓䜠需芁以䞋特性时选择 Pulsing + +- 基于 actor 的猖皋和䜍眮透明性 +- 流匏响应LLM 应甚 +- 最小的运绎倍杂床无需倖郚服务 +- 高性胜 actor 通信 + +圓䜠需芁以䞋特性时选择 Ray + +- 通甚分垃匏计算任务 +- 倍杂的䟝赖管理 +- 䞎现有 Ray 生态系统集成 + +## 安装问题 + +### ImportError: No module named 'pulsing' + +**问题**Pulsing 包未安装或䞍圚 Python 路埄䞭。 + +**解决方案** + +1. **安装 Pulsing** + ```bash + pip install pulsing + ``` + +2. **匀发环境** + ```bash + git clone https://github.com/DeepLink-org/pulsing + cd pulsing + pip install -e . + ``` + +3. **检查 Python 路埄** + ```python + import sys + print(sys.path) + ``` + +### macOS/Linux 䞊的构建倱莥 + +**问题**Rust 猖译问题。 + +**解决方案** + +1. **安装 Rust** + ```bash + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh + source ~/.cargo/env + ``` + +2. **安装系统䟝赖**Ubuntu/Debian + ```bash + sudo apt-get install build-essential pkg-config libssl-dev + ``` + +3. **安装系统䟝赖**macOS + ```bash + brew install openssl pkg-config + ``` + +## 运行时问题 + +### Actor 䞍响应消息 + +**问题**Actor 䌌乎卡䜏或䞍倄理消息。 + +**可胜原因** + +1. **阻塞操䜜**Actor 圚同步 I/O 䞊阻塞 +2. **无限埪环**Actor 代码包含无限埪环 +3.
**死锁**Actor 正圚等埅氞远䞍䌚到蟟的消息 + +**解决方案** + +```python +# ❌ 错误圚 actor 䞭䜿甚阻塞 I/O +@pul.remote +class BadActor: + def process(self, url): + response = requests.get(url) # 阻塞 actor + return response.text + +# ✅ 正确䜿甚匂步 I/O +@pul.remote +class GoodActor: + async def process(self, url): + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + return await response.text() +``` + +### 连接拒绝错误 + +**问题**无法连接到远皋 actor。 + +**可胜原因** + +1. **地址错误**Actor 系统监听䞍同的地址 +2. **防火墙**眑络流量被阻塞 +3. **TLS 问题**证乊验证倱莥 + +**解决方案** + +1. **检查 actor 系统地址** + ```python + # 确保地址匹配 + system1 = await pul.actor_system(addr="0.0.0.0:8000") + system2 = await pul.actor_system(addr="0.0.0.0:8001", seeds=["127.0.0.1:8000"]) + ``` + +2. **测试时犁甚 TLS** + ```python + # 仅甚于匀发环境 + system = await pul.actor_system(addr="0.0.0.0:8000", passphrase=None) + ``` + +### 内存泄挏 + +**问题**内存䜿甚量随时闎增长。 + +**可胜原因** + +1. **消息积环**消息倄理䞍借快 +2. **倧型消息莟蜜**消息包含倧型数据结构 +3. **Actor 泄挏**Actor 未正确枅理 + +**解决方案** + +1. **监控邮箱倧小** + ```python + # 检查 actor 邮箱倧小 + mailbox_size = await system.get_mailbox_size("actor_name") + ``` + +2. **对倧型数据䜿甚流匏倄理** + ```python + @pul.remote + class StreamingActor: + async def process_large_data(self, data_stream): + async for chunk in data_stream: + # 分块倄理 + yield self.process_chunk(chunk) + ``` + +## 性胜问题 + +### 高延迟 + +**问题**消息埀返耗时倪长。 + +**䌘化方案** + +1. **尜可胜䜿甚本地 actor** + ```python + # 本地 actor快速 + local_actor = await MyActor.spawn() + + # 远皋 actor蟃慢 + remote_actor = await MyActor.resolve("remote_actor") + ``` + +2. **批倄理消息** + ```python + # 䞍芁进行倚次调甚 + results = [] + for item in items: + result = await actor.process(item) + results.append(result) + + # 批倄理 + results = await actor.process_batch(items) + ``` + +3. **对无需响应的操䜜䜿甚 tell()** + ```python + # 劂果䞍需芁响应䞍芁等埅 + await actor.log_event(event_data) # 圚内郚䜿甚 ask() + await actor.tell({"action": "log", "data": event_data}) # 发射后䞍管 + ``` + +### 序列化匀销 + +**问题**消息序列化埈慢。 + +**解决方案** + +1. 
**䜿甚高效的数据栌匏** + ```python + # ✅ 良奜䜿甚简单类型 + await actor.process({"numbers": [1, 2, 3], "text": "hello"}) + + # ❌ 错误倍杂的嵌套对象 + await actor.process({"data": very_complex_nested_object}) + ``` + +2. **避免发送倧型莟蜜** + ```python + # 发送匕甚而䞍是数据 + await actor.process_data(data_id) # 发送 ID而䞍是数据本身 + ``` + +## 郚眲问题 + +### 集矀无法正垞工䜜 + +**问题**倚䞪节点无法盞互发现。 + +**解决方案** + +1. **检查种子节点配眮** + ```python + # 节点 1种子 + system1 = await pul.actor_system(addr="192.168.1.100:8000") + + # 节点 2加入集矀 + system2 = await pul.actor_system( + addr="192.168.1.101:8000", + seeds=["192.168.1.100:8000"] + ) + ``` + +2. **验证眑络连接** + ```bash + # 测试端口是吊匀攟 + telnet 192.168.1.100 8000 + ``` + +3. **检查防火墙讟眮** + ```bash + # Linux + sudo ufw status + sudo ufw allow 8000 + + # macOS + sudo pfctl -s rules + ``` + +### 莟蜜均衡问题 + +**问题**请求未圚集矀䞭均匀分垃。 + +**解决方案** + +1. **䜿甚蜮询解析** + ```python + # 默讀行䞺圚实䟋闎分垃 + actor = await MyActor.resolve("service_name") + ``` + +2. **检查 actor 分垃** + ```python + # 监控集矀成员 + members = await system.members() + print(f"Cluster has {len(members)} nodes") + ``` + +## 迁移问题 + +### 从 Ray 迁移 + +**垞见问题** + +1. **API 差匂** + ```python + # Ray + @ray.remote + class MyActor: + def __init__(self, value): + self.value = value + + actor = MyActor.remote(42) + result = ray.get(actor.method.remote()) + + # Pulsing + @pul.remote + class MyActor: + def __init__(self, value): + self.value = value + + actor = await MyActor.spawn(value=42) + result = await actor.method() + ``` + +2. **倄倄需芁 async/await** + ```python + # Pulsing 需芁 async/await + async def main(): + await pul.init() + actor = await MyActor.spawn() + result = await actor.method() + await pul.shutdown() + + asyncio.run(main()) + ``` + +## 获取垮助 + +劂果圚歀倄扟䞍到答案 + +1. **查看文档**[甚户指南](guide/index.md) 和 [API 参考](api/overview.md) +2. **搜玢现有问题**[GitHub Issues](https://github.com/DeepLink-org/pulsing/issues) +3. **咚询瀟区**[GitHub Discussions](https://github.com/DeepLink-org/pulsing/discussions) +4.
**提亀错误报告**劂果发现 bug请[创建 issue](https://github.com/DeepLink-org/pulsing/issues/new) + +## 莡献 + +发现歀 FAQ 有问题[垮助改进它](https://github.com/DeepLink-org/pulsing/blob/main/docs/src/faq.md) \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 90214abd4..499196d24 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,7 +1,7 @@ --- template: home.html -title: Pulsing - Lightweight Distributed Actor Framework -description: A lightweight distributed Actor framework for building scalable AI systems. Zero external dependencies, SWIM protocol discovery, Python-first design. +title: Pulsing - Lightweight Distributed Actor Framework for AI +description: Pulsing is a distributed actor framework that provides a communication backbone for building distributed systems, with specialized support for AI applications. hide: toc --- @@ -9,7 +9,7 @@ hide: toc # Pulsing -A **lightweight distributed Actor framework** built with Rust and Tokio. +A **distributed actor framework** that provides a communication backbone for building distributed systems, with specialized support for AI applications. ## Why Pulsing? 
@@ -116,7 +116,7 @@ asyncio.run(main()) | Build a cluster | [Guide: Remote Actors](guide/remote_actors.md) | | Operate your system | [Guide: CLI Operations](guide/operations.md) | | Deep dive into design | [Design Documents](design/architecture.md) | -| API details | [API Reference](api_reference.md) | +| API details | [API Reference](api/overview.md) | --- diff --git a/docs/src/index.zh.md b/docs/src/index.zh.md index 564840869..a23275b83 100644 --- a/docs/src/index.zh.md +++ b/docs/src/index.zh.md @@ -1,7 +1,7 @@ --- template: home.html title: Pulsing - 蜻量级分垃匏 Actor 框架 -description: 基于 Rust 和 Tokio 构建的蜻量级分垃匏 Actor 框架䞓䞺 AI 系统讟计。零倖郚䟝赖内眮服务发现Python 䌘先。 +description: Pulsing 是䞀䞪分垃匏 actor 框架䞺构建分垃匏系统提䟛通信骚干并䞺 AI 应甚提䟛䞓闚支持。 hide: toc --- @@ -9,7 +9,7 @@ hide: toc # Pulsing -基于 Rust 和 Tokio 构建的**蜻量级分垃匏 Actor 框架**。 +䞀䞪**分垃匏 actor 框架**䞺构建分垃匏系统提䟛通信骚干并䞺 AI 应甚提䟛䞓闚支持。 ## 䞺什么选择 Pulsing diff --git a/llms.binding.md b/llms.binding.md index f007e30b1..87b4720b2 100644 --- a/llms.binding.md +++ b/llms.binding.md @@ -2,7 +2,7 @@ ## Overview -`Pulsing`是䞀欟分垃匏系统通信框架可以䜜䞺任意分垃匏系统的通信骚架以方䟿快速搭建分垃匏系统和应甚。 +`Pulsing` is a distributed actor framework that provides a communication backbone for building distributed systems, with specialized support for AI applications. ## Python 接口