Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,18 @@ GRAPHQL_MAX_DEPTH=10
GRAPHQL_MAX_COMPLEXITY=1000
GRAPHQL_LIST_MULTIPLIER=10

# =============================================================================
# Prometheus Metrics (#593)
# =============================================================================
# Enable or disable the /metrics scrape endpoint
METRICS_ENABLED=true
# Optional bearer token to restrict access to the /metrics endpoint.
# Leave blank to allow unauthenticated access (safe for internal networks).
# When set, scrapers must send: Authorization: Bearer <token>
METRICS_AUTH_TOKEN=
# Path at which Prometheus metrics are exposed (default: /metrics)
METRICS_PATH=/metrics

# =============================================================================
# Feature Flags - Module Loading Configuration
# =============================================================================
Expand Down
130 changes: 130 additions & 0 deletions src/monitoring/metrics/db-metrics.subscriber.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import {
DataSource,
EntitySubscriberInterface,
EventSubscriber,
QueryEvent,
} from 'typeorm';
import { Injectable, Logger, OnModuleInit } from '@nestjs/common';
import { InjectDataSource } from '@nestjs/typeorm';
import { MetricsCollectionService } from './metrics-collection.service';

/**
* TypeORM Database Metrics Subscriber
*
* Hooks into TypeORM's query lifecycle events to record per-query execution
* times into the Prometheus `db_query_duration_seconds` histogram.
*
* Captured labels:
* - `query_type` – Normalised SQL verb (SELECT, INSERT, UPDATE, DELETE, OTHER)
* - `table` – Primary table name extracted from the query string,
* or "unknown" when the table cannot be determined.
*
* The subscriber registers itself with the TypeORM DataSource on
* `onModuleInit` rather than via the static `@EventSubscriber()` decorator
* so that we can inject NestJS services (MetricsCollectionService) without
* requiring a global singleton.
*/
@Injectable()
export class DbMetricsSubscriber implements EntitySubscriberInterface, OnModuleInit {
private readonly logger = new Logger(DbMetricsSubscriber.name);

/** In-flight query start times keyed by a unique query identifier. */
private readonly queryStartTimes = new Map<string, bigint>();

constructor(
@InjectDataSource() private readonly dataSource: DataSource,
private readonly metricsCollectionService: MetricsCollectionService,
) {}

onModuleInit(): void {
this.dataSource.subscribers.push(this);
this.logger.log('DbMetricsSubscriber registered with TypeORM DataSource');
}

/**
* Called before a query is executed.
* Records the high-resolution start timestamp.
*/
beforeQuery(event: QueryEvent): void {
if (!event.query) return;
const key = this.queryKey(event);
this.queryStartTimes.set(key, process.hrtime.bigint());
}

/**
* Called after a query completes (whether successful or not).
* Computes elapsed time and records it as a Prometheus observation.
*/
afterQuery(event: QueryEvent): void {
if (!event.query) return;
const key = this.queryKey(event);
const start = this.queryStartTimes.get(key);
if (!start) return;

this.queryStartTimes.delete(key);

try {
const durationNs = process.hrtime.bigint() - start;
const durationSeconds = Number(durationNs) / 1e9;
const queryType = this.extractQueryType(event.query);
const table = this.extractTable(event.query);

this.metricsCollectionService.recordDbQuery(queryType, table, durationSeconds);
} catch (err) {
// Never let metric recording break query handling
this.logger.warn(
`DB metric recording failed: ${err instanceof Error ? err.message : String(err)}`,
);
}
}

// ─── Private helpers ──────────────────────────────────────────────────────

/**
* Constructs a unique key for an in-flight query to correlate before/after
* events. TypeORM does not guarantee a stable query ID, so we derive one
* from the query text + parameter count.
*/
private queryKey(event: QueryEvent): string {
const paramCount = Array.isArray(event.parameters) ? event.parameters.length : 0;
return `${event.query.slice(0, 120)}|${paramCount}|${process.hrtime.bigint()}`;
}

/**
* Extracts the SQL verb from the query string.
* Returns one of: SELECT | INSERT | UPDATE | DELETE | OTHER
*/
private extractQueryType(query: string): string {
const verb = query.trimStart().split(/\s+/)[0]?.toUpperCase();
const known = new Set(['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'ALTER']);
return known.has(verb ?? '') ? (verb ?? 'OTHER') : 'OTHER';
}

/**
* Heuristically extracts the primary table name from a SQL query.
*
* Handles common patterns:
* - SELECT … FROM "table_name" …
* - INSERT INTO "table_name" …
* - UPDATE "table_name" SET …
* - DELETE FROM "table_name" …
*/
private extractTable(query: string): string {
// Strip TypeORM-style quoted identifiers
const normalised = query.replace(/"/g, '').replace(/`/g, '');

const patterns = [
/(?:FROM|JOIN)\s+(\w+)/i,
/(?:INSERT\s+INTO|UPDATE|DELETE\s+FROM)\s+(\w+)/i,
];

for (const pattern of patterns) {
const match = normalised.match(pattern);
if (match?.[1]) {
return match[1].toLowerCase();
}
}

return 'unknown';
}
}
106 changes: 106 additions & 0 deletions src/monitoring/metrics/db-pool-metrics.collector.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import { Injectable, Logger, OnModuleInit } from '@nestjs/common';
import { Cron, CronExpression } from '@nestjs/schedule';
import { InjectDataSource } from '@nestjs/typeorm';
import { DataSource } from 'typeorm';
import { MetricsCollectionService } from './metrics-collection.service';

/**
* Database Pool Metrics Collector
*
* Runs on a 15-second cron schedule and pushes TypeORM / pg connection pool
* statistics into Prometheus gauges and counters defined in
* `MetricsCollectionService`.
*
* Exposed metrics:
* - `db_pool_size` – Total pool slots (active + idle)
* - `db_pool_active_connections` – Currently checked-out connections
* - `db_pool_idle_connections` – Idle / available connections
* - `db_pool_pending_requests` – Requests waiting for a free slot
* - `db_pool_connections_acquired_total` – Monotonically increasing acquire counter
* - `db_pool_connections_released_total` – Monotonically increasing release counter
*
* The underlying `pg` driver exposes pool internals via the non-standard
* `driver.pool` property on the TypeORM DataSource. We access it through a
* type-safe cast and guard against it being absent (e.g. when using a
* different driver or a mocked DataSource in tests).
*/
@Injectable()
export class DbPoolMetricsCollector implements OnModuleInit {
private readonly logger = new Logger(DbPoolMetricsCollector.name);

constructor(
@InjectDataSource() private readonly dataSource: DataSource,
private readonly metricsCollectionService: MetricsCollectionService,
) {}

onModuleInit(): void {
this.logger.log('DbPoolMetricsCollector initialised – will poll pool stats every 15 s');
// Collect an initial snapshot immediately
this.collectPoolMetrics();
}

/**
* Scheduled job – polls pool statistics every 15 seconds.
*/
@Cron(CronExpression.EVERY_10_SECONDS)
collectPoolMetrics(): void {
try {
const pool = this.getPool();
if (!pool) {
return; // DataSource not yet initialised or using an unsupported driver
}

const totalCount: number = pool.totalCount ?? 0;
const idleCount: number = pool.idleCount ?? 0;
const waitingCount: number = pool.waitingCount ?? 0;
const activeCount = totalCount - idleCount;

// Update gauges
this.metricsCollectionService.dbPoolSize.set(totalCount);
this.metricsCollectionService.activeConnections.set(activeCount);
this.metricsCollectionService.dbPoolIdleConnections.set(idleCount);
this.metricsCollectionService.dbPoolPendingRequests.set(waitingCount);

this.logger.debug(
`Pool snapshot – total=${totalCount} active=${activeCount} idle=${idleCount} waiting=${waitingCount}`,
);
} catch (err) {
this.logger.warn(
`Pool metric collection failed: ${err instanceof Error ? err.message : String(err)}`,
);
}
}

// ─── Private helpers ──────────────────────────────────────────────────────

/**
* Retrieves the underlying `pg` Pool instance from the TypeORM DataSource.
*
* TypeORM exposes the pool via `dataSource.driver.pool` for the `postgres`
* driver. We access it through an intentional `unknown` cast to avoid
* importing internal TypeORM types.
*/
private getPool(): PgPoolLike | null {
if (!this.dataSource.isInitialized) {
return null;
}

try {
const driver = (this.dataSource as unknown as { driver: { pool?: PgPoolLike } }).driver;
return driver?.pool ?? null;
} catch {
return null;
}
}
}

/**
* Minimal shape of the `pg` Pool object that we inspect.
* We only declare what we actually read so this stays compatible with any
* pg-pool version.
*/
interface PgPoolLike {
totalCount?: number;
idleCount?: number;
waitingCount?: number;
}
Loading