From 13a6c4edacf76aad8d4662498059ea9c730ff987 Mon Sep 17 00:00:00 2001 From: UtkarshBhardwaj007 Date: Sat, 18 Apr 2026 18:05:52 +0100 Subject: [PATCH] Harden deploy memory watchdog, add verbose logs, fix signing counter Two-part fix addressing (1) a 20 GB RSS growth + freeze during chunk upload in phone-signer mode that macOS jetsam terminated with SIGKILL, and (2) a phone-signer approval counter that printed "approve step 5 of 4" whenever the domain required a Proof-of-Personhood upgrade. Memory hardening ---------------- * `src/utils/process-guard.ts` moves the RSS watchdog into a `worker_threads` Worker. The previous `setInterval`-on-main version was starved by polkadot-api subscription microtask floods; RSS climbed past 10 GB between samples and macOS jetsam delivered SIGKILL before we could print any abort message. The worker has its own event loop and samples at 1 s (down from 5 s). On cap crossed, it SIGKILLs the whole process with a clear reason; on main-thread shutdown it tears down via `postMessage("stop")` with a `terminate()` fallback. * `src/utils/deploy/storage.ts` gains a `DOT_DEPLOY_VERBOSE=1` env var that passes every bulletin-deploy log line through to stderr with a `[+s]` prefix. Previously the interceptor dropped everything that wasn't a phase banner or `[N/M]` chunk line, which made freeze reports diagnostically opaque. Pair with `DOT_MEMORY_TRACE=1` to correlate log events with RSS growth. * `src/commands/deploy/index.ts` destroys the Asset Hub client immediately after preflight. Nothing in the deploy flow between preflight and playground publish uses `getConnection()`, so holding an idle polkadot-api client + live best-block subscription for the full deploy window was unnecessary background pressure. Publish re-establishes via `getConnection()`. 
Signing counter --------------- For a PoP-gated label signed by a lower-tier signer, bulletin-deploy submits `setUserPopStatus` before `register()`, so `dot deploy --signer phone --playground` actually fires 5 sigs (setPop + commit + finalize + setCH + playground publish), not 4. The approvals list was hardcoded to 3 DotNS taps, so the summary card advertised "4 approvals" and the phone prompt later said "approve step 5 of 4". Fix threads a structured `plan: { action: "register" | "update", needsPopUpgrade }` from the availability check through to the signing proxy: * `availability.ts` calls `getUserPopStatus(userH160)` + `isTestnet()` and predicts `needsPopUpgrade` via a mirrored `simulateUserStatus` rule (reproduced locally because the helper isn't exported from bulletin-deploy's root; the rule has been stable since 0.6.9-rc.5). * `signerMode.ts` generates a variable-length approvals list per plan: `update` -> only setContenthash (1 tap); `register` -> commit + finalize + contenthash (3), plus a prefix "Grant Proof of Personhood" entry when `needsPopUpgrade`. Summary card + runtime counter consume the same list so they always agree. * `run.ts` passes the built approvals into `maybeWrapAuthForSigning` instead of its old hardcoded 3 labels; the Nth `signTx` call now labels itself with the Nth dotns entry of the approvals list. * `signingProxy.ts` `createSigningCounter` clamps `total` upward when `step > total`. Belt-and-braces -- if a future bulletin-deploy version adds a sig our prediction missed, the TUI will show "N of N" instead of regressing to "N of N-1". Tests ----- 170/170 pass (161 existing + 9 new): * 4 in availability.test.ts covering needsPopUpgrade prediction, update path, and RPC-flake fallback to safe default * 2 in run.test.ts covering 5-approval (PoP upgrade) and 2-approval (re-deploy) phone paths * 3 in signingProxy.test.ts covering the counter clamp `tsc --noEmit` clean, `biome format` clean. 
Related CLAUDE.md invariants touched: process-guard watchdog now worker- based (the "Process-guard safety net" bullet), and deploy log stream now has a documented verbose opt-in. --- .../watchdog-worker-and-verbose-logs.md | 11 ++ src/commands/deploy/DeployScreen.tsx | 16 +- src/commands/deploy/index.ts | 12 ++ src/utils/deploy/availability.test.ts | 89 +++++++++- src/utils/deploy/availability.ts | 88 +++++++++- src/utils/deploy/index.ts | 1 + src/utils/deploy/run.test.ts | 66 +++++++ src/utils/deploy/run.ts | 40 +++-- src/utils/deploy/signerMode.ts | 50 +++++- src/utils/deploy/signingProxy.test.ts | 34 ++++ src/utils/deploy/signingProxy.ts | 10 +- src/utils/deploy/storage.ts | 18 +- src/utils/process-guard.ts | 161 +++++++++++++----- 13 files changed, 524 insertions(+), 72 deletions(-) create mode 100644 .changeset/watchdog-worker-and-verbose-logs.md create mode 100644 src/utils/deploy/signingProxy.test.ts diff --git a/.changeset/watchdog-worker-and-verbose-logs.md b/.changeset/watchdog-worker-and-verbose-logs.md new file mode 100644 index 0000000..86d578d --- /dev/null +++ b/.changeset/watchdog-worker-and-verbose-logs.md @@ -0,0 +1,11 @@ +--- +"playground-cli": patch +--- + +Harden the deploy memory watchdog, add diagnostic logging for freezes / runaway RSS, and fix the phone-signer approval counter when a PoP upgrade is required. + +- **Watchdog now runs in a `worker_threads` Worker**, not a `setInterval` on the main thread. Under heavy microtask load (polkadot-api block subscriptions, bulletin-deploy retry loops) the main thread's macrotask queue can be starved for long enough that RSS climbs to 10+ GB between samples — at which point macOS jetsam delivers SIGKILL and the user sees a mystery `zsh: killed` with no guidance. The worker has its own event loop that can't be starved by the main thread, so the 4 GB cap now actually fires with a clear abort message. Sampling rate is also tightened from 5 s → 1 s now that it's off the hot path. 
+- **New `DOT_DEPLOY_VERBOSE=1` env var** writes every bulletin-deploy log line (chunk progress, broadcast / included / finalized transitions, nonce traces, RPC reconnects) to stderr with a `[+s]` timestamp. Previously the interceptor swallowed everything that wasn't a phase banner or `[N/M]` chunk line to keep the TUI clean; that made "deploy froze at chunk 2/6" reports diagnostically opaque. Pair with `DOT_MEMORY_TRACE=1` to correlate log events with RSS growth. +- **Asset Hub client is now destroyed immediately after preflight** instead of lingering until deploy cleanup. Nothing in the deploy flow (build, bulletin-deploy's storage + DotNS, our playground publish) uses it between preflight and the publish step — and holding an idle polkadot-api client with a live best-block subscription for the full deploy window was measurable background pressure. Playground publish calls `getConnection()` which auto-re-establishes a fresh client at that point. +- **Phone-signer approval count now matches reality.** For a PoP-gated name registered with a signer below the required tier, bulletin-deploy submits an extra `setUserPopStatus` tx before `register()` — so `dot deploy --signer phone --playground` actually fires 5 sigs, not 4. The summary card used to advertise "4 approvals" and the phone prompt later said "approve step 5 of 4". Fixed by predicting `needsPopUpgrade` during the availability check (via `getUserPopStatus` + mirrored `simulateUserStatus` logic) and threading that prediction into `resolveSignerSetup`, so the approvals list (and the derived summary, and the signing-proxy labels) are variable-length. Added: a belt-and-braces clamp in `createSigningCounter` that grows `total` when `step > total`, so even if our prediction mis-estimates for any reason the TUI never shows "step 5 of 4" again. 
+- **Re-deploy path now shows a minimal phone tap count.** When the availability check reports the domain is already owned by the signer, bulletin-deploy skips `register()` entirely and only fires `setContenthash`. The summary card and counter now reflect that (1 DotNS tap instead of 3). diff --git a/src/commands/deploy/DeployScreen.tsx b/src/commands/deploy/DeployScreen.tsx index 1d5732a..182d4b7 100644 --- a/src/commands/deploy/DeployScreen.tsx +++ b/src/commands/deploy/DeployScreen.tsx @@ -21,6 +21,7 @@ import { type DeployEvent, type DeployOutcome, type DeployPhase, + type DeployPlan, type SignerMode, type DeployApproval, type SigningEvent, @@ -72,6 +73,10 @@ export function DeployScreen({ const [domain, setDomain] = useState(initialDomain); const [publishToPlayground, setPublishToPlayground] = useState(initialPublish); const [domainError, setDomainError] = useState(null); + // Captured from the availability check; feeds `resolveSignerSetup` so + // the summary card shows the correct phone-approval count (register + + // PoP upgrade = 4 DotNS taps, vs register alone = 3, vs update = 1). + const [plan, setPlan] = useState(null); const [stage, setStage] = useState(() => pickInitialStage(initialMode, initialBuildDir, initialDomain, initialPublish), ); @@ -168,6 +173,7 @@ export function DeployScreen({ ownerSs58Address={userSigner?.address} onAvailable={(result) => { setDomain(result.fullDomain); + setPlan(result.plan); advance(mode, buildDir, result.fullDomain); }} onUnavailable={(reason) => { @@ -196,6 +202,7 @@ export function DeployScreen({ setStage({ kind: "running" })} onCancel={() => { onDone(null); @@ -208,6 +215,7 @@ export function DeployScreen({ projectDir={projectDir} inputs={resolved} userSigner={userSigner} + plan={plan} onFinish={(outcome, chunkTimings) => { setStage({ kind: "done", outcome }); // Surface completion on the terminal tab so users can glance over. 
@@ -331,11 +339,13 @@ function ValidateDomainStage({ function ConfirmStage({ inputs, userSigner, + plan, onProceed, onCancel, }: { inputs: Resolved; userSigner: ResolvedSigner | null; + plan: DeployPlan | null; onProceed: () => void; onCancel: () => void; }) { @@ -345,6 +355,7 @@ function ConfirmStage({ mode: inputs.mode, userSigner, publishToPlayground: inputs.publishToPlayground, + plan: plan ?? undefined, }); } catch (err) { return { @@ -352,7 +363,7 @@ function ConfirmStage({ error: err instanceof Error ? err.message : String(err), }; } - }, [inputs, userSigner]); + }, [inputs, userSigner, plan]); const view = buildSummaryView({ mode: inputs.mode, @@ -442,12 +453,14 @@ function RunningStage({ projectDir, inputs, userSigner, + plan, onFinish, onError, }: { projectDir: string; inputs: Resolved; userSigner: ResolvedSigner | null; + plan: DeployPlan | null; onFinish: (outcome: DeployOutcome, chunkTimings: number[]) => void; onError: (message: string) => void; }) { @@ -509,6 +522,7 @@ function RunningStage({ mode: inputs.mode, publishToPlayground: inputs.publishToPlayground, userSigner, + plan: plan ?? undefined, onEvent: (event) => handleEvent(event), }); if (!cancelled) onFinish(outcome, chunkTimingsRef.current); diff --git a/src/commands/deploy/index.ts b/src/commands/deploy/index.ts index 1877ea5..56155c2 100644 --- a/src/commands/deploy/index.ts +++ b/src/commands/deploy/index.ts @@ -93,6 +93,16 @@ export const deployCommand = new Command("deploy") return; } + // Release the Asset Hub client we opened for preflight mapping + + // allowance checks. Nothing else in the deploy path (build, chunk + // upload, bulletin-deploy's own DotNS preflight + registration) + // touches `getConnection()` — and holding an idle polkadot-api client + // with a live best-block subscription for the entire deploy window + // was a measurable contributor to background memory pressure. 
The + // playground publish step calls `getConnection()` which auto-creates + // a fresh client at that point. + destroyConnection(); + try { const nonInteractive = isFullySpecified(opts); if (nonInteractive) { @@ -223,6 +233,7 @@ async function runHeadless(ctx: { mode, userSigner: ctx.userSigner, publishToPlayground, + plan: availability.plan, }); const view = buildSummaryView({ mode, @@ -240,6 +251,7 @@ async function runHeadless(ctx: { mode, publishToPlayground, userSigner: ctx.userSigner, + plan: availability.plan, env: ctx.env, onEvent: (event) => logHeadlessEvent(event), }); diff --git a/src/utils/deploy/availability.test.ts b/src/utils/deploy/availability.test.ts index 95a5064..f33d02d 100644 --- a/src/utils/deploy/availability.test.ts +++ b/src/utils/deploy/availability.test.ts @@ -3,9 +3,13 @@ import { describe, it, expect, vi } from "vitest"; // Mock bulletin-deploy's DotNS class. Ownership check is now driven by the // caller's H160 (derived from SS58 via `@polkadot-apps/address::ss58ToH160`), // so the mock needs to reflect the full `{ owned, owner }` shape the caller -// sees when they DO pass a user address. +// sees when they DO pass a user address. `getUserPopStatus` + `isTestnet` +// feed the `needsPopUpgrade` prediction that's threaded into the summary +// card's phone-approval count. 
const classifyName = vi.fn(); const checkOwnership = vi.fn(); +const getUserPopStatus = vi.fn(async () => 2); // default: user already has Full PoP → no upgrade fires +const isTestnet = vi.fn(async () => true); const connect = vi.fn(async () => {}); const disconnect = vi.fn(); @@ -14,6 +18,8 @@ vi.mock("bulletin-deploy", () => ({ connect, classifyName, checkOwnership, + getUserPopStatus, + isTestnet, disconnect, })), })); @@ -27,6 +33,10 @@ import { checkDomainAvailability, formatAvailability } from "./availability.js"; beforeEach(() => { classifyName.mockReset(); checkOwnership.mockReset(); + getUserPopStatus.mockReset(); + getUserPopStatus.mockResolvedValue(2); // default: Full PoP → no upgrade + isTestnet.mockReset(); + isTestnet.mockResolvedValue(true); connect.mockClear(); disconnect.mockClear(); }); @@ -39,10 +49,15 @@ describe("checkDomainAvailability", () => { classifyName.mockResolvedValue({ requiredStatus: 0, message: "" }); const result = await checkDomainAvailability("my-app"); + // No ownerSs58Address passed → we can't check user's current PoP, so + // we default the plan to the common path (register + no PoP upgrade). + // The signing counter's clamp-up behavior fixes the summary at + // runtime if we under-estimated. expect(result).toEqual({ status: "available", label: "my-app", fullDomain: "my-app.dot", + plan: { action: "register", needsPopUpgrade: false }, }); }); @@ -142,6 +157,65 @@ describe("checkDomainAvailability", () => { } }); + it("predicts needsPopUpgrade=true when the user's current PoP is below what the label demands", async () => { + // Regression: the TUI used to hard-code "3 DotNS taps" for phone mode + // and print "step 5 of 4" on the phone prompt once bulletin-deploy + // fired its `setUserPopStatus` tx. Fix: plumb the predicted PoP + // transition from availability → `resolveSignerSetup` so the summary + // card and runtime counter agree on the real count. 
+ classifyName.mockResolvedValue({ requiredStatus: 2, message: "PoP Full" }); // name wants Full + checkOwnership.mockResolvedValue({ owned: false, owner: null }); + getUserPopStatus.mockResolvedValue(1); // user only has Lite + isTestnet.mockResolvedValue(true); + + const result = await checkDomainAvailability("short", { ownerSs58Address: ALICE_SS58 }); + expect(result.status).toBe("available"); + if (result.status === "available") { + expect(result.plan).toEqual({ action: "register", needsPopUpgrade: true }); + } + }); + + it("predicts needsPopUpgrade=false when the user already has ≥ the required PoP", async () => { + classifyName.mockResolvedValue({ requiredStatus: 2, message: "PoP Full" }); + checkOwnership.mockResolvedValue({ owned: false, owner: null }); + getUserPopStatus.mockResolvedValue(2); // already Full + isTestnet.mockResolvedValue(true); + + const result = await checkDomainAvailability("short", { ownerSs58Address: ALICE_SS58 }); + if (result.status === "available") { + expect(result.plan.needsPopUpgrade).toBe(false); + } + }); + + it("re-deploy: plan is { action: 'update', needsPopUpgrade: false } — only setContenthash fires", async () => { + classifyName.mockResolvedValue({ requiredStatus: 0, message: "" }); + checkOwnership.mockImplementation(async (_label: string, checkAddress: string) => ({ + owned: true, + owner: checkAddress, + })); + + const result = await checkDomainAvailability("my-existing-site", { + ownerSs58Address: ALICE_SS58, + }); + if (result.status === "available") { + expect(result.plan).toEqual({ action: "update", needsPopUpgrade: false }); + } + }); + + it("falls back to a safe default when getUserPopStatus throws", async () => { + // RPC flake on the PoP query shouldn't block the whole availability + // check — under-counting is recoverable via the counter's clamp. 
+ classifyName.mockResolvedValue({ requiredStatus: 2, message: "PoP Full" }); + checkOwnership.mockResolvedValue({ owned: false, owner: null }); + getUserPopStatus.mockRejectedValue(new Error("RPC hiccup")); + + const result = await checkDomainAvailability("short", { ownerSs58Address: ALICE_SS58 }); + expect(result.status).toBe("available"); + if (result.status === "available") { + expect(result.plan).toEqual({ action: "register", needsPopUpgrade: false }); + } + }); + it("returns 'unknown' and disconnects when the RPC call throws", async () => { classifyName.mockRejectedValue(new Error("RPC down")); @@ -159,9 +233,15 @@ describe("checkDomainAvailability", () => { describe("formatAvailability", () => { it("renders a friendly sentence for each result kind", () => { - expect(formatAvailability({ status: "available", label: "x", fullDomain: "x.dot" })).toBe( - "x.dot is available", - ); + const freshRegisterPlan = { action: "register" as const, needsPopUpgrade: false }; + expect( + formatAvailability({ + status: "available", + label: "x", + fullDomain: "x.dot", + plan: freshRegisterPlan, + }), + ).toBe("x.dot is available"); expect( formatAvailability({ status: "reserved", @@ -176,6 +256,7 @@ describe("formatAvailability", () => { label: "x", fullDomain: "x.dot", note: "Requires Proof of Personhood (Lite). Will be set up automatically.", + plan: freshRegisterPlan, }), ).toMatch(/Proof of Personhood \(Lite\)/); expect( diff --git a/src/utils/deploy/availability.ts b/src/utils/deploy/availability.ts index 845333d..c1de4c6 100644 --- a/src/utils/deploy/availability.ts +++ b/src/utils/deploy/availability.ts @@ -20,14 +20,67 @@ import { normalizeDomain } from "./playground.js"; import { getChainConfig, type Env } from "../../config.js"; /** Mirror of bulletin-deploy's `ProofOfPersonhoodStatus` enum. Kept local so we don't couple to internals. 
*/ -const POP_STATUS_RESERVED = 3; +const POP_STATUS_NO_STATUS = 0; const POP_STATUS_LITE = 1; const POP_STATUS_FULL = 2; +const POP_STATUS_RESERVED = 3; const ZERO_ADDRESS = "0x0000000000000000000000000000000000000000"; +/** + * Mirror of `simulateUserStatus` from bulletin-deploy. Reproduced (not + * imported) because the helper isn't exported from the package root and we + * don't want to reach into `bulletin-deploy/dist/dotns.js`. If the upstream + * rule set changes, this needs to track it — but the rule is small and has + * been stable since the preflight feature shipped in 0.6.9-rc.5. + * + * Predicts the user's PoP status AFTER `registerDomain`'s internal + * `setUserPopStatus` call completes. Used here to decide whether + * bulletin-deploy will actually submit that extra tx — which determines + * whether we count 3 or 4 DotNS approvals in the summary card. + */ +function predictPostRegisterPopStatus( + currentStatus: number, + requiredStatus: number, + isTestnet: boolean, +): number { + const max = (a: number, b: number) => (a > b ? a : b); + if (requiredStatus === POP_STATUS_NO_STATUS && currentStatus === POP_STATUS_LITE && isTestnet) { + // Paseo auto-escape: Lite signer on a NoStatus label gets bumped to + // Full so `PopRules.priceWithCheck` accepts the signer. Mainnet path + // never triggers this branch. + return POP_STATUS_FULL; + } + if (requiredStatus !== POP_STATUS_NO_STATUS) { + return max(currentStatus, requiredStatus); + } + return currentStatus; +} + +/** + * What bulletin-deploy will actually submit on-chain, in order. The TUI uses + * this to render a correct "N phone taps" count BEFORE the user confirms — + * the previous hard-coded "3 DotNS taps" assumption missed the extra + * `setUserPopStatus` tap that fires whenever the classifier demands a PoP + * level above the user's current one (e.g. short NoStatus name + Lite signer, + * or any PoP-gated name + NoStatus signer). 
Seeing "step 5 of 4" after the + * fact is a worse UX than predicting "5" upfront. + */ +export interface DeployPlan { + /** `register` = new domain (commit + reveal + setContenthash, ± setUserPopStatus). `update` = already owned by us; only setContenthash fires. */ + action: "register" | "update"; + /** True iff bulletin-deploy will submit a `setUserPopStatus` tx before `register()`. */ + needsPopUpgrade: boolean; +} + export type AvailabilityResult = - | { status: "available"; label: string; fullDomain: string; note?: string } + | { + status: "available"; + label: string; + fullDomain: string; + note?: string; + plan: DeployPlan; + } | { status: "reserved"; label: string; fullDomain: string; message: string } | { status: "taken"; label: string; fullDomain: string; owner: string } | { status: "unknown"; label: string; fullDomain: string; message: string }; @@ -92,10 +145,38 @@ export async function checkDomainAvailability( label, fullDomain, note: "Already owned by you — will update the existing deployment.", + plan: { action: "update", needsPopUpgrade: false }, }; } } + // Whether bulletin-deploy will fire `setUserPopStatus` before + // `register()` is a pure function of (classifier requirement, user's + // current status, chain = testnet). Reading getUserPopStatus requires + // an EVM address — if the caller didn't supply one, we can't tell, + // so we assume no upgrade (the counter will self-correct at runtime). + let needsPopUpgrade = false; + if (userH160) { + try { + const [userStatus, isTestnet] = await Promise.all([ + dotns.getUserPopStatus(userH160), + dotns.isTestnet(), + ]); + const target = predictPostRegisterPopStatus( + userStatus, + classification.requiredStatus, + isTestnet, + ); + needsPopUpgrade = target !== userStatus && target !== POP_STATUS_NO_STATUS; + } catch { + // RPC flake here shouldn't block the availability check — + // under-counting DotNS approvals is recoverable at runtime + // via the counter's clamp-up safety net. 
+ } + } + + const plan: DeployPlan = { action: "register", needsPopUpgrade }; + // Names that require Proof-of-Personhood are still registrable on // testnet — bulletin-deploy self-attests during `register()` via // `setUserPopStatus`. Surface it as an advisory note, not a blocker. @@ -109,10 +190,11 @@ export async function checkDomainAvailability( label, fullDomain, note: `Requires Proof of Personhood (${requirement}). Will be set up automatically.`, + plan, }; } - return { status: "available", label, fullDomain }; + return { status: "available", label, fullDomain, plan }; } catch (err) { return { status: "unknown", diff --git a/src/utils/deploy/index.ts b/src/utils/deploy/index.ts index 3545681..d34f9c6 100644 --- a/src/utils/deploy/index.ts +++ b/src/utils/deploy/index.ts @@ -34,6 +34,7 @@ export { formatAvailability, type AvailabilityResult, type CheckAvailabilityOptions, + type DeployPlan, } from "./availability.js"; // Re-exported so SDK consumers (RevX) can tear down the shared Paseo client diff --git a/src/utils/deploy/run.test.ts b/src/utils/deploy/run.test.ts index 4cbdfa6..bc749e2 100644 --- a/src/utils/deploy/run.test.ts +++ b/src/utils/deploy/run.test.ts @@ -161,6 +161,72 @@ describe("runDeploy", () => { } }); + it("phone mode + needsPopUpgrade: 5 planned approvals (setPop + commit + finalize + contenthash + playground)", async () => { + // Regression: this used to be 4 approvals across the board, so a + // PoP-gated name with a lower-tier signer (which triggers + // `setUserPopStatus` inside bulletin-deploy.register) printed + // "approve step 5 of 4" at the playground step. 
+ const { events, push } = collectEvents(); + const outcome = await runDeploy({ + projectDir: "/tmp/proj", + buildDir: "/tmp/proj/dist", + domain: "my-app", + mode: "phone", + publishToPlayground: true, + userSigner: fakeUserSigner, + plan: { action: "register", needsPopUpgrade: true }, + onEvent: push, + }); + + expect(outcome.approvalsRequested).toHaveLength(5); + expect(outcome.approvalsRequested.map((a) => a.label)).toEqual([ + "Grant Proof of Personhood", + "Reserve domain (DotNS commitment)", + "Finalize domain (DotNS register)", + "Link content (DotNS setContenthash)", + "Publish to Playground registry", + ]); + + const plan = events.find((e) => e.kind === "plan"); + if (plan?.kind === "plan") { + expect(plan.approvals.map((a) => a.phase)).toEqual([ + "dotns", + "dotns", + "dotns", + "dotns", + "playground", + ]); + } + }); + + it("phone mode + re-deploy (plan.action=update): only setContenthash + playground taps", async () => { + // When the domain is already owned by the signer, bulletin-deploy + // skips `register()` entirely (no commit, no reveal, no PoP grant) + // and jumps straight to `setContenthash`. Summary should reflect that. 
+ const { events, push } = collectEvents(); + const outcome = await runDeploy({ + projectDir: "/tmp/proj", + buildDir: "/tmp/proj/dist", + domain: "my-app", + mode: "phone", + publishToPlayground: true, + userSigner: fakeUserSigner, + plan: { action: "update", needsPopUpgrade: false }, + onEvent: push, + }); + + expect(outcome.approvalsRequested).toHaveLength(2); + expect(outcome.approvalsRequested.map((a) => a.label)).toEqual([ + "Link content (DotNS setContenthash)", + "Publish to Playground registry", + ]); + + const plan = events.find((e) => e.kind === "plan"); + if (plan?.kind === "plan") { + expect(plan.approvals.map((a) => a.phase)).toEqual(["dotns", "playground"]); + } + }); + it("phone mode without a logged-in session throws before touching the network", async () => { const { push } = collectEvents(); await expect( diff --git a/src/utils/deploy/run.ts b/src/utils/deploy/run.ts index 0a0e743..d6ae87d 100644 --- a/src/utils/deploy/run.ts +++ b/src/utils/deploy/run.ts @@ -20,6 +20,7 @@ import { import type { DeployLogEvent } from "./progress.js"; import type { ResolvedSigner } from "../signer.js"; import type { Env } from "../../config.js"; +import type { DeployPlan } from "./availability.js"; // ── Events ─────────────────────────────────────────────────────────────────── @@ -56,6 +57,12 @@ export interface RunDeployOptions { onEvent: (event: DeployEvent) => void; /** Target environment. Defaults to `testnet`. */ env?: Env; + /** + * DotNS plan from the availability check — shapes the approvals list. + * Optional; the signing counter falls back to "register, no PoP upgrade" + * (3 DotNS taps) if absent and auto-corrects at runtime. 
+ */ + plan?: DeployPlan; } export interface DeployOutcome { @@ -82,6 +89,7 @@ export async function runDeploy(options: RunDeployOptions): Promise["bulletinDeployAuthOptions"], options: RunDeployOptions, counter: SigningCounter, + approvals: DeployApproval[], ) { if (!auth.signer || !auth.signerAddress) return auth; - const labels = ["Reserve domain", "Finalize domain", "Link content"]; + const labels = approvals.filter((a) => a.phase === "dotns").map((a) => a.label); + const fallbackLabel = labels[labels.length - 1] ?? "DotNS step"; let seen = 0; const wrapped = { publicKey: auth.signer.publicKey, signTx: (...args: Parameters) => { - // Bulletin-deploy's current DotNS path makes exactly one signTx - // per logical step, so `seen` matches the intended label. If a - // future version retries signTx inside a step, `seen` would - // drift past `labels.length` and we'd show "DotNS step 4" on - // the phone — misleading. Cap at the last label so a retry - // reuses the most recent step name instead of inventing one. - const idx = Math.min(seen, labels.length - 1); - const label = labels[idx]; + const label = labels[seen] ?? fallbackLabel; seen += 1; const proxy = wrapSignerWithEvents(auth.signer!, { label, diff --git a/src/utils/deploy/signerMode.ts b/src/utils/deploy/signerMode.ts index c9af4c2..d34d173 100644 --- a/src/utils/deploy/signerMode.ts +++ b/src/utils/deploy/signerMode.ts @@ -15,6 +15,7 @@ import type { DeployOptions } from "bulletin-deploy"; import type { ResolvedSigner } from "../signer.js"; +import type { DeployPlan } from "./availability.js"; export type SignerMode = "dev" | "phone"; @@ -53,6 +54,49 @@ export interface ResolveOptions { userSigner: ResolvedSigner | null; /** Whether `--playground` / the prompt enabled playground publish. */ publishToPlayground: boolean; + /** + * Known DotNS plan from the availability check. Shapes the approvals list + * to match what bulletin-deploy will actually submit. 
Absent = we haven't + * run the check yet, so assume the most common path (new register, no + * PoP upgrade, 3 DotNS taps). The signing counter clamps up at runtime if + * we under-estimated, so users never see "step 5 of 4" even on this path. + */ + plan?: DeployPlan; +} + +/** + * DotNS approvals in the exact order bulletin-deploy will fire them. Order + * matters because `maybeWrapAuthForSigning` in run.ts labels each incoming + * `signTx` call by its index in this list — the Nth `signTx` is labelled + * with the Nth entry here, so a mismatch ends up showing "Finalize domain" + * on the phone when the app is actually asking for commitment. + */ +function dotnsApprovals(plan: DeployPlan | undefined): DeployApproval[] { + // Default to the most common path when the caller hasn't told us the + // plan yet. Counter will self-correct if we under-estimated. + const effective: DeployPlan = plan ?? { action: "register", needsPopUpgrade: false }; + + if (effective.action === "update") { + // Domain already owned by the signer — bulletin-deploy skips + // `register()` entirely (no commitment, no finalize, no PoP grant) + // and jumps straight to `setContenthash`. So only one tap. + return [{ phase: "dotns", label: "Link content (DotNS setContenthash)" }]; + } + + const approvals: DeployApproval[] = []; + if (effective.needsPopUpgrade) { + // `register()` submits `setUserPopStatus` first whenever the predicted + // post-grant status differs from the user's current one. Without this + // entry the counter previously ran one past total ("step 5 of 4") for + // any PoP-gated name. 
+ approvals.push({ phase: "dotns", label: "Grant Proof of Personhood" }); + } + approvals.push( + { phase: "dotns", label: "Reserve domain (DotNS commitment)" }, + { phase: "dotns", label: "Finalize domain (DotNS register)" }, + { phase: "dotns", label: "Link content (DotNS setContenthash)" }, + ); + return approvals; } export function resolveSignerSetup(opts: ResolveOptions): DeploySignerSetup { @@ -70,11 +114,7 @@ export function resolveSignerSetup(opts: ResolveOptions): DeploySignerSetup { signer: opts.userSigner.signer, signerAddress: opts.userSigner.address, }; - approvals.push( - { phase: "dotns", label: "Reserve domain (DotNS commitment)" }, - { phase: "dotns", label: "Finalize domain (DotNS register)" }, - { phase: "dotns", label: "Link content (DotNS setContenthash)" }, - ); + approvals.push(...dotnsApprovals(opts.plan)); } // Playground publish always uses the user's signer so ownership ties to diff --git a/src/utils/deploy/signingProxy.test.ts b/src/utils/deploy/signingProxy.test.ts new file mode 100644 index 0000000..a2996ce --- /dev/null +++ b/src/utils/deploy/signingProxy.test.ts @@ -0,0 +1,34 @@ +import { describe, it, expect } from "vitest"; +import { createSigningCounter } from "./signingProxy.js"; + +describe("createSigningCounter", () => { + it("returns sequential step numbers against the configured total", () => { + const c = createSigningCounter(3); + expect(c.next()).toEqual({ step: 1, total: 3 }); + expect(c.next()).toEqual({ step: 2, total: 3 }); + expect(c.next()).toEqual({ step: 3, total: 3 }); + expect(c.count()).toBe(3); + }); + + it("clamps total upward when step exceeds the predicted count", () => { + // Regression: TUI used to print "approve step 5 of 4" whenever + // bulletin-deploy fired an extra `setUserPopStatus` tx that the + // predicted plan missed. Clamping the total up to the current step + // keeps the display coherent even when the prediction under-shoots. 
+    const c = createSigningCounter(2);
+    expect(c.next()).toEqual({ step: 1, total: 2 });
+    expect(c.next()).toEqual({ step: 2, total: 2 });
+    // Predicted 2 taps, but a third fires:
+    expect(c.next()).toEqual({ step: 3, total: 3 });
+    // And a fourth:
+    expect(c.next()).toEqual({ step: 4, total: 4 });
+  });
+
+  it("count() reflects every reserved step regardless of clamping", () => {
+    const c = createSigningCounter(1);
+    c.next();
+    c.next();
+    c.next();
+    expect(c.count()).toBe(3);
+  });
+});
diff --git a/src/utils/deploy/signingProxy.ts b/src/utils/deploy/signingProxy.ts
index 4ed27d3..0f3cca1 100644
--- a/src/utils/deploy/signingProxy.ts
+++ b/src/utils/deploy/signingProxy.ts
@@ -21,10 +21,18 @@ export interface SigningCounter {
 
 export function createSigningCounter(total: number): SigningCounter {
   let step = 0;
+  // Treat the caller's `total` as a *minimum* — if the real deploy fires
+  // more sigs than we predicted (e.g. bulletin-deploy adds a new DotNS tx
+  // in a future version, or our PoP upgrade detection missed a case), the
+  // UI shows "step N of N" rather than the obviously-wrong "step N of N-1".
+  // Summary card's pre-deploy count can still be stale; this only fixes
+  // the runtime counter.
+  let runningTotal = total;
   return {
     next() {
       step += 1;
-      return { step, total };
+      if (step > runningTotal) runningTotal = step;
+      return { step, total: runningTotal };
     },
     count() {
       return step;
diff --git a/src/utils/deploy/storage.ts b/src/utils/deploy/storage.ts
index 691ed15..a529eee 100644
--- a/src/utils/deploy/storage.ts
+++ b/src/utils/deploy/storage.ts
@@ -89,6 +89,13 @@ export async function runStorageDeploy(options: StorageDeployOptions): Promise<
 /**
+ * When `DOT_DEPLOY_VERBOSE=1` is set, every intercepted log line is also
+ * mirrored to stderr with a `[+Ns]` timestamp. This is the
+ * diagnostic path for OOM / freeze reports — you get the exact last line
+ * bulletin-deploy managed to print before the process froze, plus timing for
+ * every chunk state transition (`broadcasting` → `included` → `finalized`).
+ * Combine with `DOT_MEMORY_TRACE=1` to correlate log events with RSS growth.
  */
 function interceptConsoleLog(
   onEvent: ((event: DeployLogEvent) => void) | undefined,
@@ -97,9 +104,15 @@
   const originalLog = console.log;
   const originalWarn = console.warn;
   const originalError = console.error;
+  const verbose = process.env.DOT_DEPLOY_VERBOSE === "1";
+  const started = Date.now();
 
   const feed = (parts: unknown[]) => {
     const combined = parts.map((p) => (typeof p === "string" ? p : String(p))).join(" ");
+    if (verbose) {
+      const elapsed = ((Date.now() - started) / 1000).toFixed(1);
+      process.stderr.write(`[+${elapsed}s] ${combined}\n`);
+    }
     for (const line of combined.split("\n")) {
       const event = parser.feed(line);
       if (event && onEvent) onEvent(event);
@@ -110,9 +123,12 @@
   console.log = (...args: unknown[]) => feed(args);
   console.warn = (...args: unknown[]) => feed(args);
   // bulletin-deploy only prints errors on the sad path; keep them visible on
   // stderr so diagnostics don't disappear if something unexpected happens.
+  // In verbose mode `feed()` already wrote to stderr — skip the double-print.
   console.error = (...args: unknown[]) => {
     feed(args);
-    originalError.apply(console, args as Parameters<typeof console.error>);
+    if (!verbose) {
+      originalError.apply(console, args as Parameters<typeof console.error>);
+    }
   };
 
   return () => {
diff --git a/src/utils/process-guard.ts b/src/utils/process-guard.ts
index 8110c61..8997124 100644
--- a/src/utils/process-guard.ts
+++ b/src/utils/process-guard.ts
@@ -20,11 +20,22 @@
  *    main async flow returns, if the process hasn't exited naturally within
  *    a short grace period we exit anyway.
  *
- * 3. `startMemoryWatchdog()` samples `process.memoryUsage().rss` every few
- *    seconds. If it crosses a hard cap the process aborts with a loud
- *    error message rather than taking the user's machine down.
+ * 3. `startMemoryWatchdog()` samples `process.memoryUsage().rss` from a
+ *    dedicated worker thread. 
The worker has its own event loop, so a
+ *    microtask flood on the main thread (polkadot-api subscriptions firing
+ *    `.andThen(...)` chains faster than the queue drains) cannot starve
+ *    the sampler. An earlier `setInterval`-on-main-thread version was seen
+ *    to miss every sample while RSS climbed to 20 GB before macOS jetsam
+ *    delivered SIGKILL — giving the user a mystery "zsh: killed" instead
+ *    of our abort message. If the cap is crossed, the worker sends SIGKILL
+ *    to the containing process (its `process.pid` IS the main PID in
+ *    `worker_threads`). SIGKILL skips cleanup hooks, which is fine: at
+ *    this point the event loop is already starved and signal handlers
+ *    wouldn't fire anyway.
  */
 
+import { Worker } from "node:worker_threads";
+
 /**
  * Maximum RSS we're willing to tolerate before aborting the deploy. A
  * legitimate deploy loads polkadot-api runtime metadata for three chains,
@@ -37,8 +48,12 @@
  */
 const MEMORY_LIMIT_BYTES = 4 * 1024 * 1024 * 1024; // 4 GB
 
-/** How often the watchdog samples memory. */
-const MEMORY_POLL_MS = 5000;
+/**
+ * How often the worker samples memory. 1 s is cheap now that sampling is
+ * off the main thread — earlier 5 s main-thread sampling gave a leak ~15 s
+ * of headroom to grow into GB territory before the first missed sample.
+ */
+const MEMORY_POLL_MS = 1000;
 
 /** Grace period after the main flow returns before we force-exit. */
 const HARD_EXIT_GRACE_MS = 2000;
@@ -99,46 +114,109 @@ export function scheduleHardExit(exitCode: number): void {
 }
 
 /**
- * Poll RSS and abort if we blow past the absolute limit. Returns a `stop()`
- * that cancels the watchdog — call it from a `finally` block.
+ * Worker-thread body. Supplied as source via
+ * `new Worker(WATCHDOG_WORKER_SOURCE, { eval: true })`.
+ *
+ * Runs in its own V8 isolate + event loop, so the main thread's microtask
+ * pressure cannot delay these samples. 
`process.memoryUsage()` returns + * process-wide stats (the worker shares the same OS process), so RSS here + * is the full `dot` process's RSS. + * + * If the cap is crossed we SIGKILL `process.pid` — which is the containing + * process's PID in a worker — and both threads die. We deliberately don't + * try to run cleanup: if the event loop is starved enough to hit 4 GB, + * cleanup hooks won't fire on the main thread anyway, and leaving the + * machine swappy is the worse failure mode. + */ +const WATCHDOG_WORKER_SOURCE = ` +const { parentPort, workerData } = require('node:worker_threads'); +const { limit, pollMs, trace } = workerData; + +const fmt = (n) => { + if (n >= 1024 ** 3) return (n / (1024 ** 3)).toFixed(2) + ' GB'; + if (n >= 1024 ** 2) return (n / (1024 ** 2)).toFixed(2) + ' MB'; + if (n >= 1024) return (n / 1024).toFixed(2) + ' KB'; + return n + ' B'; +}; + +const started = Date.now(); +let peak = 0; + +const interval = setInterval(() => { + const mem = process.memoryUsage(); + if (mem.rss > peak) peak = mem.rss; + + if (trace) { + const elapsed = ((Date.now() - started) / 1000).toFixed(1); + process.stderr.write( + '[mem +' + elapsed + 's] rss=' + fmt(mem.rss) + + ' heap=' + fmt(mem.heapUsed) + '/' + fmt(mem.heapTotal) + + ' external=' + fmt(mem.external) + + ' peak=' + fmt(peak) + '\\n' + ); + } + + if (mem.rss > limit) { + process.stderr.write( + '\\n\\u2716 Memory use exceeded ' + fmt(limit) + + ' (RSS \\u2248 ' + fmt(mem.rss) + '). Watchdog killing process.\\n' + + 'This is almost certainly a leaked subscription or runaway retry loop. ' + + 'Re-run with DOT_MEMORY_TRACE=1 DOT_DEPLOY_VERBOSE=1 to capture the timeline.\\n' + ); + // SIGKILL the whole process. process.pid from a worker is the host + // process PID, so this takes down the main thread too. 
+ process.kill(process.pid, 'SIGKILL'); + } +}, pollMs); + +parentPort.on('message', (msg) => { + if (msg === 'stop') { + clearInterval(interval); + process.exit(0); + } +}); +`; + +/** + * Start the memory watchdog in a dedicated worker thread. Returns a `stop()` + * that tears the worker down — call it from a `finally` block. * - * When `DOT_MEMORY_TRACE=1` is set we also log every sample to stderr so a + * When `DOT_MEMORY_TRACE=1` is set we stream every sample to stderr so a * user hitting a leak can attach the timeline to a bug report without * needing a heap snapshot. */ export function startMemoryWatchdog(): () => void { const trace = process.env.DOT_MEMORY_TRACE === "1"; - const started = Date.now(); - let peak = 0; - - const interval = setInterval(() => { - const mem = process.memoryUsage(); - const rss = mem.rss; - if (rss > peak) peak = rss; - - if (trace) { - const elapsed = ((Date.now() - started) / 1000).toFixed(1); - process.stderr.write( - `[mem +${elapsed}s] rss=${formatBytes(rss)} ` + - `heap=${formatBytes(mem.heapUsed)}/${formatBytes(mem.heapTotal)} ` + - `external=${formatBytes(mem.external)} ` + - `peak=${formatBytes(peak)}\n`, - ); - } - - if (rss > MEMORY_LIMIT_BYTES) { - process.stderr.write( - `\n✖ Memory use exceeded ${formatBytes(MEMORY_LIMIT_BYTES)} ` + - `(RSS ≈ ${formatBytes(rss)}). Aborting to protect your machine.\n` + - `This is almost certainly a leaked subscription or runaway ` + - `retry loop. To diagnose, re-run with DOT_MEMORY_TRACE=1.\n`, - ); - runAllCleanupAndExit(137); + const worker = new Worker(WATCHDOG_WORKER_SOURCE, { + eval: true, + workerData: { + limit: MEMORY_LIMIT_BYTES, + pollMs: MEMORY_POLL_MS, + trace, + }, + }); + // Don't let the worker itself keep the host process alive on a clean + // exit — postMessage('stop') on shutdown handles the happy path. 
+  worker.unref();
+
+  let stopped = false;
+  return () => {
+    if (stopped) return;
+    stopped = true;
+    try {
+      worker.postMessage("stop");
+    } catch {
+      // Worker may already be gone (e.g. it triggered SIGKILL on itself);
+      // fall through to the defensive terminate below.
     }
-  }, MEMORY_POLL_MS);
-  // Don't let the watchdog itself keep the event loop alive.
-  interval.unref();
-  return () => clearInterval(interval);
+    // Defense in depth: if the worker hangs on shutdown for any reason,
+    // force-terminate it so it can't keep the process alive.
+    const killTimer = setTimeout(() => {
+      worker.terminate().catch(() => {
+        /* best-effort */
+      });
+    }, 500);
+    killTimer.unref();
+  };
 }
 
 async function runAllCleanupAndExit(code: number): Promise<never> {
@@ -161,10 +239,3 @@ async function runAllCleanupAndExit(code: number): Promise<never> {
   // `process.exit` never returns; the `never` return type is for TS.
   throw new Error("unreachable");
 }
-
-function formatBytes(n: number): string {
-  if (n >= 1024 * 1024 * 1024) return `${(n / (1024 * 1024 * 1024)).toFixed(2)} GB`;
-  if (n >= 1024 * 1024) return `${(n / (1024 * 1024)).toFixed(2)} MB`;
-  if (n >= 1024) return `${(n / 1024).toFixed(2)} KB`;
-  return `${n} B`;
-}