From 9b13a062e358931596a54f01fd6961e60ae4ce23 Mon Sep 17 00:00:00 2001 From: "SYM.BOT" Date: Wed, 29 Apr 2026 21:05:25 +0100 Subject: [PATCH] 0.5.5: lower stale-prior threshold from 10s to 1s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 10s window from v0.5.3+v0.5.4 was too lenient. When a peer process was killed and quickly relaunched, lastSeen was still within the window (the old run had sent a CMB seconds before death), so the dedup-reject path killed legitimate redials. A 1s threshold tolerates sub-second TCP-retry races during initial handshake; peer restarts (≥1s gap) now recover at the application layer. 150/150 unit tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 22 ++++++++++++++++++++++ lib/node.js | 10 +++++++++- package.json | 2 +- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dce360..aa5723a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,28 @@ > **Note:** Versions 0.3.26 – 0.3.55 were released as git tags without changelog entries. Changelog resumes at 0.3.56 below. +## 0.5.5 + +### Fixed + +- **Stale-prior threshold lowered from 10s to 1s.** v0.5.3+v0.5.4 + introduced lastSeen-aware stale detection in the inbound-connection + and `_createPeer` dedup paths, with the threshold tied to + `_heartbeatInterval` (default 10s). Field testing showed this was too + lenient: when a peer process was killed and quickly relaunched, the + old run had typically sent a CMB seconds before death, so `lastSeen` + was still within the 10s window. The dedup logic then rejected the + legitimate redial as a same-direction-duplicate, producing + `connection ready → immediate disconnect` with no handshake-complete + on the dialing side. + + Lowered to a hardcoded 1s threshold in both dedup paths. 
+ Sub-second TCP-retry races during initial handshake still keep the prior + connection (the case the same-direction-duplicate rule was designed for); peer + restarts with ≥1s between kill and re-dial now recover within the + application layer instead of being blocked until OS keepalive reaps + the underlying socket (~100s). + ## 0.5.4 ### Fixed diff --git a/lib/node.js b/lib/node.js index 261d1a3..997b22f 100644 --- a/lib/node.js +++ b/lib/node.js @@ -549,8 +549,16 @@ class SymNode extends EventEmitter { // restarting peer reconnects within seconds, long enough that // a momentary lull during initial handshake doesn't trip it. const staleByFlag = !prev || prev._closed; + // 1-second threshold (NOT _heartbeatInterval=10s). When a peer + // process is killed and quickly relaunches, its old run sent a + // CMB seconds before death, so lastSeen is still recent. A 10s + // threshold misses this and the dedup-reject path kills the + // legitimate redial. 1s tolerates sub-second TCP-retry races + // during initial handshake while letting normal peer-restart + // (≥1s gap between kill and re-dial) recover within the + // application layer. const staleByLastSeen = existingPeer.lastSeen - && (Date.now() - existingPeer.lastSeen) > this._heartbeatInterval; + && (Date.now() - existingPeer.lastSeen) > 1000; const stalePrior = staleByFlag || staleByLastSeen; if (!stalePrior) { // Determine prior direction. The peer's `isOutbound` field reflects diff --git a/package.json b/package.json index 8b09975..038b09f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@sym-bot/sym", - "version": "0.5.4", + "version": "0.5.5", "description": "Infrastructure and protocol for multi-agent collective intelligence", "main": "lib/node.js", "bin": {