Skip to content

Commit 13ac79c

Browse files
committed
docs(telemetry): add instrumentation conventions, snake_case attribute keys
Add src/instrumentation/CONVENTIONS.md documenting how telemetry is structured (explicit span threading, imperative setProperty, event/attribute naming, namespace grouping, properties vs measurements) and link it from CONTRIBUTING. Align existing instrumentation with the OTel naming convention: - rename caller-supplied property/measurement keys from camelCase to snake_case (auth, ssh, workspace, activation, websocket, cli.download); - strip unit suffixes from exported metric names into the OTLP unit field (latency_ms -> metric latency, unit ms) so Prometheus suffixes cleanly; - group the token-refresh span under its namespace (auth.token_refreshed -> auth.token_refresh.completed) next to auth.token_refresh.deduped. Framework-managed result/durationMs are left unchanged.
1 parent 4ae51e1 commit 13ac79c

24 files changed

Lines changed: 289 additions & 151 deletions

CONTRIBUTING.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,15 @@ pnpm watch # Rebuild extension and webviews on changes
9494
Press F5 to launch the Extension Development Host. Use "Developer: Reload
9595
Webviews" to see webview changes.
9696

97+
## Telemetry
98+
99+
Local telemetry instrumentation follows a shared style: how spans are threaded,
100+
how events and properties are named, and properties vs measurements. Read this
101+
before adding new telemetry so it stays consistent across the codebase. It lives
102+
next to the code:
103+
104+
**[`src/instrumentation/CONVENTIONS.md`](src/instrumentation/CONVENTIONS.md)**
105+
97106
## Testing
98107

99108
There are a few ways you can test the "Open in VS Code" flow:

src/core/cliManager.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ export class CliManager {
529529
);
530530
verifySpan.setProperty("outcome", result.kind);
531531
if (result.kind === "sig_not_found") {
532-
verifySpan.setProperty("sigStatus", String(result.status));
532+
verifySpan.setProperty("sig_status", String(result.status));
533533
}
534534
});
535535
}
@@ -590,7 +590,7 @@ export class CliManager {
590590
}
591591
} finally {
592592
if (bytesWritten > 0) {
593-
downloadSpan.setMeasurement("downloadedBytes", bytesWritten);
593+
downloadSpan.setMeasurement("downloaded_bytes", bytesWritten);
594594
}
595595
await downloadProgress.clearProgress(progressLogPath);
596596
}

src/instrumentation/CONVENTIONS.md

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# Telemetry conventions
2+
3+
How to add telemetry so every instrumentation reads the same way. The framework
4+
lives in `src/telemetry`; the per-domain instrumentation business code talks to
5+
lives here in `src/instrumentation`. See `cli.ts` (spans + phases) and `ssh.ts`
6+
(point-in-time logs) as references.
7+
8+
## Checklist
9+
10+
- One instrumentation class per domain (`FooTelemetry`) wrapping
11+
`TelemetryService`; business code imports that, never a raw span.
12+
- Event name is `domain.snake_case`; point-in-time logs use past tense.
13+
- Event names and attribute keys follow OTel: lowercase, `.` for hierarchy, `_`
14+
to split words, never camelCase. Enumerated values are typed `snake_case`
15+
unions, never bare `string`.
16+
- Numbers go in `measurements` (raw), never pre-bucketed into string properties.
17+
- Set attributes imperatively with `setProperty`/`setMeasurement`; never add a
18+
return value that exists only to be logged.
19+
- No secrets, tokens, query strings, or unbounded values in properties; routes
20+
go through `normalizeRoute`.
21+
- Let the framework set `result`; add a domain `outcome` only when an operation
22+
has several success modes. Failures go to a typed union; non-error early exits
23+
call `markAborted`.
24+
25+
## Layers
26+
27+
- **Framework** (`src/telemetry`): `TelemetryService` (`trace`/`log`/`logError`)
28+
hands out `Span` handles and owns IDs, timing, `result`, level-gating, and the
29+
wire format. Telemetry-off is handled here (`NOOP_SPAN`), so instrumentation
30+
never checks whether telemetry is enabled.
31+
- **Instrumentation** (`src/instrumentation/*`): one typed class per domain, the
32+
only telemetry surface business code sees.
33+
34+
## Threading
35+
36+
Spans are passed **explicitly** as a callback argument; there is no
37+
ambient/active-span context. Two patterns keep telemetry out of business logic:
38+
39+
1. **Imperative attributes**`span.setProperty("outcome", "cache_hit")` at the
40+
point the value is known. This is the standard OpenTelemetry model.
41+
2. **Typed phases** — wrap an async step in `span.phase(...)` and read one
42+
property off its _natural_ return value, e.g.
43+
`trace.versionCheck(() => this.checkBinary(...))`. Extraction stays out of
44+
the business function.
45+
46+
Never return a value purely so a caller can log it; that couples the return type
47+
to observability. Returning is fine when the business uses the value too.
48+
49+
## Spans, phases, logs
50+
51+
- `trace(name, fn, props?, meas?)` — a span with framework-set `result` and
52+
`durationMs`. Use for an operation with a start and end.
53+
- `span.phase(name, fn, ...)` — the same, nested (composed as `parent.child`);
54+
child names cannot contain `.`.
55+
- `span.log(name, ...)` / `span.logError(name, error, ...)` — point-in-time
56+
events under a span, no `result`/`durationMs`. Use for discrete signals.
57+
58+
## Naming
59+
60+
| Thing | Convention | Examples |
61+
| -------------------------- | ------------------------------------------- | --------------------------------------------------------- |
62+
| Event name | `domain.snake_case` | `cli.resolve`, `remote.setup`, `connection.dropped` |
63+
| Point-in-time log | past tense | `connection.dropped`, `ssh.process.lost` |
64+
| Child phase | bare `snake_case` | `cache_lookup`, `version_check`, `connection_handoff` |
65+
| Property / measurement key | lowercase; `_` splits words, `.` for groups | `cache_source`, `failure_category`, `status.from` |
66+
| Enumerated value | typed `snake_case` union | `"cache_hit"`, `"session_token"`, `"unrecoverable_close"` |
67+
68+
This is the [OTel attribute convention](https://opentelemetry.io/docs/specs/semconv/general/naming/):
69+
`.` is the namespace delimiter, `_` joins words within a segment, never
70+
camelCase. Default to a flat `snake_case` key; use `.` only to group genuinely
71+
related attributes (a `status.from` / `status.to` pair). Keep phase names
72+
subject-first within a domain (`agent_resolve`, not `resolve_agent`).
73+
74+
**Grouping.** Group related events under a shared dotted namespace so a prefix
75+
query returns the whole family: `workspace.update.triggered` and
76+
`workspace.update.prompted` both sit under `workspace.update.*`, as do
77+
`auth.token_refresh.completed` / `.deduped` under `auth.token_refresh.*`. Keep
78+
the namespace node a pure prefix; don't also emit an event with that exact name,
79+
since dotted names otherwise read as span phases (`parent.child`). This is
80+
[OTel namespacing](https://opentelemetry.io/docs/specs/semconv/general/naming/),
81+
which exists precisely so related signals query together.
82+
83+
**Namespacing.** OTel suggests prefixing custom attributes (`com.acme.*`) to
84+
avoid clashing with future semconv. We don't namespace event-level attributes:
85+
each is already scoped by its (namespaced) event name and only flows into
86+
Coder's own pipeline, so a bare `cache_source` can't collide with a future OTel
87+
`cache.source`. Resource and provenance attributes stay namespaced
88+
(`coder.deployment.url`, `coder.event.*`).
89+
90+
## Properties vs measurements
91+
92+
- **Properties** are low-cardinality string dimensions (the framework stringifies
93+
`string | number | boolean`). Use them for what you group or filter by.
94+
- **Measurements** are raw numbers. Don't pre-bucket into string labels: both
95+
export as record attributes, and a query can bucket the raw number at read
96+
time. `result` and `durationMs` are framework-managed and cannot be set.
97+
- **Units.** There is no unit field at emit time, so put the unit in the
98+
measurement key as a `_ms` / `_mbits` suffix, the same way for every event.
99+
The OTLP exporter then resolves it per signal: for metric events
100+
(`http.requests`, `ssh.network.sampled`) it moves the suffix into the OTLP
101+
`unit` field and drops it from the metric name (`latency_ms` exports as metric
102+
`latency`, unit `ms`, which Prometheus then suffixes itself); log and span
103+
attributes keep the key as written. You always author `latency_ms`; only the
104+
exported metric name changes.
105+
106+
## Outcomes, failures, aborts
107+
108+
- The framework sets `result` (`success` / `error` / `aborted`) on every span;
109+
don't duplicate it.
110+
- Add a domain `outcome` property only when an operation has several success
111+
modes worth distinguishing (e.g. `cli.resolve`: `cache_hit`, `downloaded`).
112+
- Classify failures into a typed union (`failure_category`, or a domain
113+
`reason`) via a `categorize*Failure` helper rather than emitting raw error
114+
strings; the
115+
framework already captures the error block.
116+
- For a non-error early exit (cancelled, not-found), call `span.markAborted()`
117+
rather than throwing.
118+
119+
## Safety
120+
121+
Never put tokens, credentials, full URLs with query strings, or unbounded user
122+
input into properties. Routes go through `normalizeRoute`
123+
(`src/logging/routeNormalization.ts`). Prefer a closed union over a free-form
124+
string for any property a dashboard groups by.

src/instrumentation/activation.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,25 +20,25 @@ export interface ActivationTracer {
2020
}
2121

2222
/**
23-
* Emits `activation` with `authState`, plus a sibling `activation.deployment_init`
23+
* Emits `activation` with `auth_state`, plus a sibling `activation.deployment_init`
2424
* trace (sibling, not child, because deployment init outlives the activation span).
2525
*/
2626
export class ActivationTelemetry {
2727
public constructor(private readonly telemetry: TelemetryService) {}
2828

2929
public trace<T>(fn: (tracer: ActivationTracer) => Promise<T>): Promise<T> {
3030
return this.telemetry.trace("activation", async (span) => {
31-
span.setProperty("authState", "none");
31+
span.setProperty("auth_state", "none");
3232
return fn({
33-
setAuthState: (state) => span.setProperty("authState", state),
33+
setAuthState: (state) => span.setProperty("auth_state", state),
3434
traceDeploymentInit: (initFn) =>
3535
this.telemetry.trace(
3636
"activation.deployment_init",
3737
async (childSpan) => {
38-
childSpan.setProperty("authState", "unknown");
38+
childSpan.setProperty("auth_state", "unknown");
3939
const success = await initFn();
4040
childSpan.setProperty(
41-
"authState",
41+
"auth_state",
4242
success ? "valid_token" : "auth_failed",
4343
);
4444
return success;

src/instrumentation/auth.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export class AuthTelemetry {
2626
trigger: AuthTokenRefreshTrigger,
2727
fn: () => Promise<T>,
2828
): Promise<T> {
29-
return this.telemetry.trace("auth.token_refreshed", fn, { trigger });
29+
return this.telemetry.trace("auth.token_refresh.completed", fn, { trigger });
3030
}
3131

3232
/** Logged when a refresh call joins an in-flight refresh and emits no span of its own. */
@@ -48,9 +48,9 @@ export class AuthTelemetry {
4848
logReceived: () => span.log("received"),
4949
setRecovery: (recovery) => span.setProperty("recovery", recovery),
5050
setRefreshAttempted: (attempted) =>
51-
span.setProperty("refreshAttempted", attempted),
51+
span.setProperty("refresh_attempted", attempted),
5252
}),
53-
{ recovery: "none", refreshAttempted: false },
53+
{ recovery: "none", refresh_attempted: false },
5454
);
5555
}
5656

src/instrumentation/ssh.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ export class SshTelemetry {
5757
this.#telemetry.log(
5858
"ssh.process.lost",
5959
{ cause },
60-
{ uptimeMs: now - this.#processStartedAtMs },
60+
{ uptime_ms: now - this.#processStartedAtMs },
6161
);
6262
}
6363

@@ -68,7 +68,7 @@ export class SshTelemetry {
6868
this.#telemetry.log(
6969
"ssh.process.recovered",
7070
{},
71-
{ recoveryDurationMs: performance.now() - this.#processLostAtMs },
71+
{ recovery_duration_ms: performance.now() - this.#processLostAtMs },
7272
);
7373
this.#processLostAtMs = undefined;
7474
}
@@ -80,14 +80,14 @@ export class SshTelemetry {
8080
const now = performance.now();
8181
if (this.#processStartedAtMs !== undefined) {
8282
const measurements: Record<string, number> = {
83-
previousUptimeMs: now - this.#processStartedAtMs,
83+
previous_uptime_ms: now - this.#processStartedAtMs,
8484
};
8585
if (this.#processLostAtMs !== undefined) {
86-
measurements.lostDurationMs = now - this.#processLostAtMs;
86+
measurements.lost_duration_ms = now - this.#processLostAtMs;
8787
}
8888
this.#telemetry.log(
8989
"ssh.process.replaced",
90-
{ wasLost: this.#processLostAtMs !== undefined },
90+
{ was_lost: this.#processLostAtMs !== undefined },
9191
measurements,
9292
);
9393
}
@@ -105,8 +105,8 @@ export class SshTelemetry {
105105
const now = performance.now();
106106
this.#telemetry.log(
107107
"ssh.process.disposed",
108-
{ wasLost: this.#processLostAtMs !== undefined },
109-
{ uptimeMs: now - this.#processStartedAtMs },
108+
{ was_lost: this.#processLostAtMs !== undefined },
109+
{ uptime_ms: now - this.#processStartedAtMs },
110110
);
111111
this.#processStartedAtMs = undefined;
112112
this.#processLostAtMs = undefined;
@@ -130,12 +130,12 @@ export class SshTelemetry {
130130
"ssh.network.sampled",
131131
{
132132
p2p: network.p2p,
133-
preferredDerp: network.preferred_derp,
133+
preferred_derp: network.preferred_derp,
134134
},
135135
{
136-
latencyMs: network.latency,
137-
downloadMbits: bytesPerSecondToMbits(network.download_bytes_sec),
138-
uploadMbits: bytesPerSecondToMbits(network.upload_bytes_sec),
136+
latency_ms: network.latency,
137+
download_mbits: bytesPerSecondToMbits(network.download_bytes_sec),
138+
upload_mbits: bytesPerSecondToMbits(network.upload_bytes_sec),
139139
},
140140
);
141141
}

src/instrumentation/websocket.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ export class WebSocketTelemetry {
8585
this.#telemetry.log(
8686
"connection.opened",
8787
{ route: normalizeRoute(route) },
88-
{ connectDurationMs: now - start },
88+
{ connect_duration_ms: now - start },
8989
);
9090
this.#finishReconnect({ result: "success" });
9191
}
@@ -104,10 +104,10 @@ export class WebSocketTelemetry {
104104

105105
const properties: CallerProperties = { cause };
106106
if (closeCode !== undefined) {
107-
properties.closeCode = closeCode;
107+
properties.close_code = closeCode;
108108
}
109109
const measurements = {
110-
connectionDurationMs: performance.now() - openedAtMs,
110+
connection_duration_ms: performance.now() - openedAtMs,
111111
};
112112
if (error === undefined) {
113113
this.#telemetry.log("connection.dropped", properties, measurements);
@@ -163,11 +163,11 @@ export class WebSocketTelemetry {
163163
reason: cycle.reason,
164164
};
165165
if (outcome.result === "error") {
166-
properties.terminationReason = outcome.terminationReason;
166+
properties.termination_reason = outcome.terminationReason;
167167
}
168168
this.#telemetry.log("connection.reconnect_resolved", properties, {
169169
attempts: cycle.attempts,
170-
totalDurationMs: performance.now() - cycle.startMs,
170+
total_duration_ms: performance.now() - cycle.startMs,
171171
});
172172
}
173173
}

src/instrumentation/workspace.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ interface ObservedAgentState {
3939

4040
/**
4141
* Emits `workspace.state_transitioned` as a workspace progresses through
42-
* statuses, plus `observedBuildDurationMs` when a provisioner run resolves.
42+
* statuses, plus `observed_build_duration_ms` when a provisioner run resolves.
4343
* Construct one per workspace; `WorkspaceMonitor` is the sole call site.
4444
*/
4545
export class WorkspaceStateTelemetry {
@@ -69,7 +69,7 @@ export class WorkspaceStateTelemetry {
6969

7070
const now = performance.now();
7171
const measurements: Record<string, number> = previous
72-
? { observedDurationMs: now - previous.observedAtMs }
72+
? { observed_duration_ms: now - previous.observedAtMs }
7373
: {};
7474

7575
const wasProvisioning =
@@ -79,15 +79,15 @@ export class WorkspaceStateTelemetry {
7979
this.buildStartedAtMs ??= now;
8080
} else {
8181
if (wasProvisioning && this.buildStartedAtMs !== undefined) {
82-
measurements.observedBuildDurationMs = now - this.buildStartedAtMs;
82+
measurements.observed_build_duration_ms = now - this.buildStartedAtMs;
8383
}
8484
this.buildStartedAtMs = undefined;
8585
}
8686

8787
this.telemetry.log(
8888
"workspace.state_transitioned",
8989
{
90-
workspaceName: this.workspaceName,
90+
workspace_name: this.workspaceName,
9191
from: previous?.status ?? INITIAL_STATE,
9292
to: status,
9393
"build.transition": buildTransition,
@@ -131,14 +131,14 @@ export class WorkspaceAgentTelemetry {
131131
this.telemetry.log(
132132
"workspace.agent.state_transitioned",
133133
{
134-
workspaceName: this.workspaceName,
135-
agentName: agent.name,
134+
workspace_name: this.workspaceName,
135+
agent_name: agent.name,
136136
"status.from": previous?.status ?? INITIAL_STATE,
137137
"status.to": agent.status,
138138
"lifecycle_state.from": previous?.lifecycleState ?? INITIAL_STATE,
139139
"lifecycle_state.to": agent.lifecycle_state,
140140
},
141-
previous ? { observedDurationMs: now - previous.observedAtMs } : {},
141+
previous ? { observed_duration_ms: now - previous.observedAtMs } : {},
142142
);
143143
this.observed = {
144144
status: agent.status,
@@ -164,13 +164,13 @@ export class WorkspaceOperationTelemetry {
164164

165165
public traceUpdateTriggered<T>(fn: () => Promise<T>): Promise<T> {
166166
return this.telemetry.trace("workspace.update.triggered", fn, {
167-
workspaceName: this.workspaceName,
167+
workspace_name: this.workspaceName,
168168
});
169169
}
170170

171171
public traceStartTriggered<T>(fn: () => Promise<T>): Promise<T> {
172172
return this.telemetry.trace("workspace.start.triggered", fn, {
173-
workspaceName: this.workspaceName,
173+
workspace_name: this.workspaceName,
174174
});
175175
}
176176

@@ -196,7 +196,7 @@ export class WorkspaceOperationTelemetry {
196196
throw error;
197197
}
198198
},
199-
{ workspaceName: this.workspaceName },
199+
{ workspace_name: this.workspaceName },
200200
);
201201
if (cancel) throw cancel;
202202
return parameters;

0 commit comments

Comments
 (0)