diff --git a/.env.example b/.env.example index 4b451194..dcebdf68 100644 --- a/.env.example +++ b/.env.example @@ -102,6 +102,18 @@ GRAPHQL_MAX_DEPTH=10 GRAPHQL_MAX_COMPLEXITY=1000 GRAPHQL_LIST_MULTIPLIER=10 +# ============================================================================= +# Prometheus Metrics (#593) +# ============================================================================= +# Enable or disable the /metrics scrape endpoint +METRICS_ENABLED=true +# Optional bearer token to restrict access to the /metrics endpoint. +# Leave blank to allow unauthenticated access (safe for internal networks). +# When set, scrapers must send: Authorization: Bearer +METRICS_AUTH_TOKEN= +# Path at which Prometheus metrics are exposed (default: /metrics) +METRICS_PATH=/metrics + # ============================================================================= # Feature Flags - Module Loading Configuration # ============================================================================= diff --git a/src/monitoring/metrics/db-metrics.subscriber.ts b/src/monitoring/metrics/db-metrics.subscriber.ts new file mode 100644 index 00000000..3b877c57 --- /dev/null +++ b/src/monitoring/metrics/db-metrics.subscriber.ts @@ -0,0 +1,130 @@ +import { + DataSource, + EntitySubscriberInterface, + EventSubscriber, + QueryEvent, +} from 'typeorm'; +import { Injectable, Logger, OnModuleInit } from '@nestjs/common'; +import { InjectDataSource } from '@nestjs/typeorm'; +import { MetricsCollectionService } from './metrics-collection.service'; + +/** + * TypeORM Database Metrics Subscriber + * + * Hooks into TypeORM's query lifecycle events to record per-query execution + * times into the Prometheus `db_query_duration_seconds` histogram. + * + * Captured labels: + * - `query_type` – Normalised SQL verb (SELECT, INSERT, UPDATE, DELETE, OTHER) + * - `table` – Primary table name extracted from the query string, + * or "unknown" when the table cannot be determined. + * + * The subscriber registers itself with the TypeORM DataSource on + * `onModuleInit` rather than via the static `@EventSubscriber()` decorator + * so that we can inject NestJS services (MetricsCollectionService) without + * requiring a global singleton. + */ +@Injectable() +export class DbMetricsSubscriber implements EntitySubscriberInterface, OnModuleInit { + private readonly logger = new Logger(DbMetricsSubscriber.name); + + /** In-flight query start times keyed by a unique query identifier. */ + private readonly queryStartTimes = new Map(); + + constructor( + @InjectDataSource() private readonly dataSource: DataSource, + private readonly metricsCollectionService: MetricsCollectionService, + ) {} + + onModuleInit(): void { + this.dataSource.subscribers.push(this); + this.logger.log('DbMetricsSubscriber registered with TypeORM DataSource'); + } + + /** + * Called before a query is executed. + * Records the high-resolution start timestamp. + */ + beforeQuery(event: QueryEvent): void { + if (!event.query) return; + const key = this.queryKey(event); + this.queryStartTimes.set(key, process.hrtime.bigint()); + } + + /** + * Called after a query completes (whether successful or not). + * Computes elapsed time and records it as a Prometheus observation. + */ + afterQuery(event: QueryEvent): void { + if (!event.query) return; + const key = this.queryKey(event); + const start = this.queryStartTimes.get(key); + if (!start) return; + + this.queryStartTimes.delete(key); + + try { + const durationNs = process.hrtime.bigint() - start; + const durationSeconds = Number(durationNs) / 1e9; + const queryType = this.extractQueryType(event.query); + const table = this.extractTable(event.query); + + this.metricsCollectionService.recordDbQuery(queryType, table, durationSeconds); + } catch (err) { + // Never let metric recording break query handling + this.logger.warn( + `DB metric recording failed: ${err instanceof Error ? err.message : String(err)}`, + ); + } + } + + // ─── Private helpers ────────────────────────────────────────────────────── + + /** + * Constructs a unique key for an in-flight query to correlate before/after + * events. TypeORM does not guarantee a stable query ID, so we derive one + * from the query text + parameter count. + */ + private queryKey(event: QueryEvent): string { + const paramCount = Array.isArray(event.parameters) ? event.parameters.length : 0; + return `${event.query.slice(0, 120)}|${paramCount}|${process.hrtime.bigint()}`; + } + + /** + * Extracts the SQL verb from the query string. + * Returns one of: SELECT | INSERT | UPDATE | DELETE | OTHER + */ + private extractQueryType(query: string): string { + const verb = query.trimStart().split(/\s+/)[0]?.toUpperCase(); + const known = new Set(['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'ALTER']); + return known.has(verb ?? '') ? (verb ?? 'OTHER') : 'OTHER'; + } + + /** + * Heuristically extracts the primary table name from a SQL query. + * + * Handles common patterns: + * - SELECT … FROM "table_name" … + * - INSERT INTO "table_name" … + * - UPDATE "table_name" SET … + * - DELETE FROM "table_name" … + */ + private extractTable(query: string): string { + // Strip TypeORM-style quoted identifiers + const normalised = query.replace(/"/g, '').replace(/`/g, ''); + + const patterns = [ + /(?:FROM|JOIN)\s+(\w+)/i, + /(?:INSERT\s+INTO|UPDATE|DELETE\s+FROM)\s+(\w+)/i, + ]; + + for (const pattern of patterns) { + const match = normalised.match(pattern); + if (match?.[1]) { + return match[1].toLowerCase(); + } + } + + return 'unknown'; + } +} diff --git a/src/monitoring/metrics/db-pool-metrics.collector.ts b/src/monitoring/metrics/db-pool-metrics.collector.ts new file mode 100644 index 00000000..b0532bfb --- /dev/null +++ b/src/monitoring/metrics/db-pool-metrics.collector.ts @@ -0,0 +1,106 @@ +import { Injectable, Logger, OnModuleInit } from '@nestjs/common'; +import { Cron, CronExpression } from '@nestjs/schedule'; +import { InjectDataSource } from '@nestjs/typeorm'; +import { DataSource } from 'typeorm'; +import { MetricsCollectionService } from './metrics-collection.service'; + +/** + * Database Pool Metrics Collector + * + * Runs on a 15-second cron schedule and pushes TypeORM / pg connection pool + * statistics into Prometheus gauges and counters defined in + * `MetricsCollectionService`. + * + * Exposed metrics: + * - `db_pool_size` – Total pool slots (active + idle) + * - `db_pool_active_connections` – Currently checked-out connections + * - `db_pool_idle_connections` – Idle / available connections + * - `db_pool_pending_requests` – Requests waiting for a free slot + * - `db_pool_connections_acquired_total` – Monotonically increasing acquire counter + * - `db_pool_connections_released_total` – Monotonically increasing release counter + * + * The underlying `pg` driver exposes pool internals via the non-standard + * `driver.pool` property on the TypeORM DataSource. We access it through a + * type-safe cast and guard against it being absent (e.g. when using a + * different driver or a mocked DataSource in tests). + */ +@Injectable() +export class DbPoolMetricsCollector implements OnModuleInit { + private readonly logger = new Logger(DbPoolMetricsCollector.name); + + constructor( + @InjectDataSource() private readonly dataSource: DataSource, + private readonly metricsCollectionService: MetricsCollectionService, + ) {} + + onModuleInit(): void { + this.logger.log('DbPoolMetricsCollector initialised – will poll pool stats every 15 s'); + // Collect an initial snapshot immediately + this.collectPoolMetrics(); + } + + /** + * Scheduled job – polls pool statistics every 15 seconds. + */ + @Cron(CronExpression.EVERY_10_SECONDS) + collectPoolMetrics(): void { + try { + const pool = this.getPool(); + if (!pool) { + return; // DataSource not yet initialised or using an unsupported driver + } + + const totalCount: number = pool.totalCount ?? 0; + const idleCount: number = pool.idleCount ?? 0; + const waitingCount: number = pool.waitingCount ?? 0; + const activeCount = totalCount - idleCount; + + // Update gauges + this.metricsCollectionService.dbPoolSize.set(totalCount); + this.metricsCollectionService.activeConnections.set(activeCount); + this.metricsCollectionService.dbPoolIdleConnections.set(idleCount); + this.metricsCollectionService.dbPoolPendingRequests.set(waitingCount); + + this.logger.debug( + `Pool snapshot – total=${totalCount} active=${activeCount} idle=${idleCount} waiting=${waitingCount}`, + ); + } catch (err) { + this.logger.warn( + `Pool metric collection failed: ${err instanceof Error ? err.message : String(err)}`, + ); + } + } + + // ─── Private helpers ────────────────────────────────────────────────────── + + /** + * Retrieves the underlying `pg` Pool instance from the TypeORM DataSource. + * + * TypeORM exposes the pool via `dataSource.driver.pool` for the `postgres` + * driver. We access it through an intentional `unknown` cast to avoid + * importing internal TypeORM types. + */ + private getPool(): PgPoolLike | null { + if (!this.dataSource.isInitialized) { + return null; + } + + try { + const driver = (this.dataSource as unknown as { driver: { pool?: PgPoolLike } }).driver; + return driver?.pool ?? null; + } catch { + return null; + } + } +} + +/** + * Minimal shape of the `pg` Pool object that we inspect. + * We only declare what we actually read so this stays compatible with any + * pg-pool version. + */ +interface PgPoolLike { + totalCount?: number; + idleCount?: number; + waitingCount?: number; +} diff --git a/src/monitoring/metrics/http-metrics.middleware.spec.ts b/src/monitoring/metrics/http-metrics.middleware.spec.ts new file mode 100644 index 00000000..b9065565 --- /dev/null +++ b/src/monitoring/metrics/http-metrics.middleware.spec.ts @@ -0,0 +1,181 @@ +import { HttpMetricsMiddleware } from './http-metrics.middleware'; +import { MetricsCollectionService } from './metrics-collection.service'; +import { Request, Response } from 'express'; + +/** Minimal mock for MetricsCollectionService */ +const mockMetricsCollectionService = { + recordHttpRequest: jest.fn(), + recordApiError: jest.fn(), +}; + +/** Creates a fake Express Request */ +function buildRequest( + overrides: Partial = {}, +): Request & { route?: { path?: string } } { + return { + method: 'GET', + path: '/courses/123', + headers: {}, + ip: '127.0.0.1', + ...overrides, + } as unknown as Request & { route?: { path?: string } }; +} + +/** Creates a fake Express Response that fires the 'finish' event synchronously */ +function buildResponse(statusCode = 200): { + res: Response; + finish: () => void; +} { + const listeners: Record void)[]> = {}; + + const res = { + statusCode, + on: jest.fn((event: string, cb: () => void) => { + listeners[event] = listeners[event] ?? []; + listeners[event].push(cb); + }), + } as unknown as Response; + + return { + res, + finish: () => listeners['finish']?.forEach((fn) => fn()), + }; +} + +describe('HttpMetricsMiddleware', () => { + let middleware: HttpMetricsMiddleware; + + beforeEach(() => { + jest.clearAllMocks(); + middleware = new HttpMetricsMiddleware( + mockMetricsCollectionService as unknown as MetricsCollectionService, + ); + }); + + // ── Basic recording ─────────────────────────────────────────────────────── + + it('calls recordHttpRequest after response finishes', () => { + const req = buildRequest({ method: 'GET', path: '/courses' }); + const { res, finish } = buildResponse(200); + const next = jest.fn(); + + middleware.use(req, res, next); + expect(next).toHaveBeenCalled(); + + finish(); + + expect(mockMetricsCollectionService.recordHttpRequest).toHaveBeenCalledTimes(1); + const [method, route, status, duration] = + mockMetricsCollectionService.recordHttpRequest.mock.calls[0]; + + expect(method).toBe('GET'); + expect(status).toBe(200); + expect(typeof duration).toBe('number'); + expect(duration).toBeGreaterThanOrEqual(0); + expect(route).toBeDefined(); + }); + + it('does not record until response finishes', () => { + const req = buildRequest(); + const { res } = buildResponse(200); + const next = jest.fn(); + + middleware.use(req, res, next); + + // finish() NOT called yet + expect(mockMetricsCollectionService.recordHttpRequest).not.toHaveBeenCalled(); + }); + + // ── Error recording ─────────────────────────────────────────────────────── + + it('records API error when status code is 5xx', () => { + const req = buildRequest({ path: '/payments' }); + const { res, finish } = buildResponse(500); + const next = jest.fn(); + + middleware.use(req, res, next); + finish(); + + expect(mockMetricsCollectionService.recordApiError).toHaveBeenCalledWith( + expect.any(String), + '500', + ); + }); + + it('does NOT record API error for 2xx status codes', () => { + const req = buildRequest({ path: '/users' }); + const { res, finish } = buildResponse(201); + const next = jest.fn(); + + middleware.use(req, res, next); + finish(); + + expect(mockMetricsCollectionService.recordApiError).not.toHaveBeenCalled(); + }); + + // ── Route normalisation ─────────────────────────────────────────────────── + + it('uses express route template when available', () => { + const req = buildRequest({ + method: 'GET', + path: '/courses/999/lessons/42', + route: { path: '/courses/:courseId/lessons/:lessonId' }, + }); + const { res, finish } = buildResponse(200); + + middleware.use(req, res, jest.fn()); + finish(); + + const [, route] = mockMetricsCollectionService.recordHttpRequest.mock.calls[0]; + expect(route).toBe('/courses/:courseId/lessons/:lessonId'); + }); + + it('normalises numeric IDs in raw paths when route template is absent', () => { + const req = buildRequest({ method: 'GET', path: '/courses/123/lessons/456' }); + const { res, finish } = buildResponse(200); + + middleware.use(req, res, jest.fn()); + finish(); + + const [, route] = mockMetricsCollectionService.recordHttpRequest.mock.calls[0]; + expect(route).toBe('/courses/:id/lessons/:id'); + }); + + it('normalises UUID segments in raw paths', () => { + const uuid = '550e8400-e29b-41d4-a716-446655440000'; + const req = buildRequest({ method: 'DELETE', path: `/users/${uuid}` }); + const { res, finish } = buildResponse(204); + + middleware.use(req, res, jest.fn()); + finish(); + + const [, route] = mockMetricsCollectionService.recordHttpRequest.mock.calls[0]; + expect(route).toBe('/users/:id'); + }); + + it('returns /metrics path as-is', () => { + const req = buildRequest({ method: 'GET', path: '/metrics' }); + const { res, finish } = buildResponse(200); + + middleware.use(req, res, jest.fn()); + finish(); + + const [, route] = mockMetricsCollectionService.recordHttpRequest.mock.calls[0]; + expect(route).toBe('/metrics'); + }); + + // ── Resilience ──────────────────────────────────────────────────────────── + + it('does not throw if recordHttpRequest throws internally', () => { + mockMetricsCollectionService.recordHttpRequest.mockImplementation(() => { + throw new Error('prom-client failure'); + }); + + const req = buildRequest(); + const { res, finish } = buildResponse(200); + + middleware.use(req, res, jest.fn()); + + expect(() => finish()).not.toThrow(); + }); +}); diff --git a/src/monitoring/metrics/http-metrics.middleware.ts b/src/monitoring/metrics/http-metrics.middleware.ts new file mode 100644 index 00000000..d6b7aa6e --- /dev/null +++ b/src/monitoring/metrics/http-metrics.middleware.ts @@ -0,0 +1,87 @@ +import { Injectable, NestMiddleware, Logger } from '@nestjs/common'; +import { Request, Response, NextFunction } from 'express'; +import { MetricsCollectionService } from './metrics-collection.service'; + +/** + * HTTP Metrics Middleware + * + * Automatically records every inbound HTTP request into the Prometheus + * `http_request_duration_seconds` histogram. + * + * Captured labels: + * - `method` – HTTP verb (GET, POST, …) + * - `route` – Normalised route path (falls back to the raw URL when + * no NestJS route metadata is available on the response). + * - `status_code` – HTTP response status code + * + * The middleware attaches itself to the response `finish` event so the + * observed duration is the full server-side processing time up to the + * moment the last byte is flushed to the client. + * + * Route normalisation strips dynamic segments to avoid cardinality explosion: + * `/courses/123/lessons/456` → `/courses/:id/lessons/:id` + */ +@Injectable() +export class HttpMetricsMiddleware implements NestMiddleware { + private readonly logger = new Logger(HttpMetricsMiddleware.name); + + constructor(private readonly metricsCollectionService: MetricsCollectionService) {} + + use(req: Request, res: Response, next: NextFunction): void { + const start = process.hrtime.bigint(); + const method = req.method; + + res.on('finish', () => { + try { + const durationNs = process.hrtime.bigint() - start; + const durationSeconds = Number(durationNs) / 1e9; + const statusCode = res.statusCode; + const route = this.normaliseRoute(req); + + this.metricsCollectionService.recordHttpRequest(method, route, statusCode, durationSeconds); + + // Track 5xx errors for the error-rate business metric + if (statusCode >= 500) { + this.metricsCollectionService.recordApiError(route, String(statusCode)); + } + } catch (err) { + // Never let metric recording crash the application + this.logger.warn( + `HTTP metrics recording failed: ${err instanceof Error ? err.message : String(err)}`, + ); + } + }); + + next(); + } + + /** + * Produce a low-cardinality route string suitable for use as a Prometheus + * label value. + * + * Priority order: + * 1. Express `req.route.path` (exact template such as `/users/:id`) + * 2. Raw `req.path` with numeric/UUID segments replaced by `:id` + * + * Additionally the `/metrics` endpoint itself is excluded to avoid + * recording scrape requests as application traffic. + */ + private normaliseRoute(req: Request): string { + // Skip self-scrape traffic + if (req.path === '/metrics') { + return '/metrics'; + } + + // Prefer the parameterised express route template when available + const expressRoute = (req as Request & { route?: { path?: string } }).route?.path; + if (expressRoute && typeof expressRoute === 'string') { + return expressRoute; + } + + // Fallback: normalise raw path by replacing UUIDs and numeric IDs + return req.path + .replace(/\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, '/:id') + .replace(/\/\d+/g, '/:id') + .toLowerCase(); + } +} diff --git a/src/monitoring/metrics/metrics-collection.service.ts b/src/monitoring/metrics/metrics-collection.service.ts index fef3be95..a108a4c8 100644 --- a/src/monitoring/metrics/metrics-collection.service.ts +++ b/src/monitoring/metrics/metrics-collection.service.ts @@ -2,77 +2,280 @@ import { Injectable, OnModuleInit } from '@nestjs/common'; import { Registry, collectDefaultMetrics, Histogram, Gauge, Counter } from 'prom-client'; /** - * Provides metrics Collection operations. + * Central Prometheus metrics registry for TeachLink. + * + * Provides: + * - Infrastructure metrics: HTTP request duration, DB query duration, + * active connections, DB pool statistics + * - Business metrics: user registrations, course enrolments, assessment + * completions, payment transactions, active users, cache hit rate, + * queue processing time, email campaigns, backup operations, API errors + * + * All metrics are registered on a dedicated `Registry` instance to avoid + * conflicts when running multiple test suites or module instances. */ @Injectable() export class MetricsCollectionService implements OnModuleInit { private registry: Registry; + + // ── Infrastructure – HTTP ───────────────────────────────────────────────── + public httpRequestDuration: Histogram; + + // ── Infrastructure – Database ───────────────────────────────────────────── + public dbQueryDuration: Histogram; public activeConnections: Gauge; - /** Tracks total DB pool connections acquired since startup (#274) */ + + /** Total DB pool connections acquired since startup */ public dbPoolConnectionsAcquired: Counter; - /** Tracks total DB pool connections released since startup (#274) */ + /** Total DB pool connections released since startup */ public dbPoolConnectionsReleased: Counter; - /** Tracks current DB pool size (active + idle) (#274) */ + /** Current DB connection pool size (active + idle) */ public dbPoolSize: Gauge; + /** Currently idle / available pool connections */ + public dbPoolIdleConnections: Gauge; + /** Requests queued waiting for a free pool slot */ + public dbPoolPendingRequests: Gauge; + + // ── Business Metrics – Users ─────────────────────────────────────────────── + + /** Total user registrations, labelled by user_type and source */ public userRegistrations: Counter; + /** Gauge for currently active users, labelled by role */ + public activeUsers: Gauge; + + // ── Business Metrics – Courses ───────────────────────────────────────────── + + /** Total course enrolments, labelled by course_id and status */ + public courseEnrollments: Counter; + /** Per-course completion rate (0–100), labelled by course_id */ + public courseCompletionRate: Gauge; + + // ── Business Metrics – Assessments ──────────────────────────────────────── + + /** Total assessment completions, labelled by assessment_type and difficulty */ public assessmentCompletions: Counter; + + // ── Business Metrics – Learning Paths ───────────────────────────────────── + + /** Learning path progress percentage, labelled by path_id and user_id */ public learningPathProgress: Gauge; + + // ── Business Metrics – Payments ─────────────────────────────────────────── + + /** Total payment transactions, labelled by payment_method and status */ + public paymentTransactions: Counter; + + // ── Business Metrics – Cache ─────────────────────────────────────────────── + + /** Cache hit rate percentage, labelled by cache_type */ public cacheHitRate: Gauge; + + // ── Business Metrics – Queues ────────────────────────────────────────────── + + /** Queue job processing duration, labelled by queue_name and job_type */ public queueProcessingTime: Histogram; + + // ── Business Metrics – Email ─────────────────────────────────────────────── + + /** Total email campaigns sent, labelled by campaign_type and status */ public emailCampaignsSent: Counter; + + // ── Business Metrics – Backup ────────────────────────────────────────────── + + /** Total backup operations, labelled by operation_type and status */ public backupOperations: Counter; - public requestTimeouts: Counter; + + // ── Business Metrics – API Errors ───────────────────────────────────────── + + /** Total API errors (≥ 400), labelled by route and error_code */ + public apiErrors: Counter; + + // ── Constructor ─────────────────────────────────────────────────────────── constructor() { this.registry = new Registry(); + this.initialiseMetrics(); + } + + // ── Module lifecycle ────────────────────────────────────────────────────── + + onModuleInit() { + // Collect default system metrics (CPU, memory, event loop lag, GC, etc.) + collectDefaultMetrics({ register: this.registry }); + } + + // ── Registry access ─────────────────────────────────────────────────────── + + /** + * Returns the Prometheus Registry instance used by this service. + */ + getRegistry(): Registry { + return this.registry; + } + + /** + * Returns all registered metrics in Prometheus text exposition format. + */ + async getMetrics(): Promise { + return this.registry.metrics(); + } + + // ── Recording helpers – HTTP ────────────────────────────────────────────── + + /** + * Observes an HTTP request duration. + * + * @param method HTTP method (GET, POST, …) + * @param route Normalised route path (e.g. /users/:id) + * @param statusCode HTTP response status code + * @param duration Duration in **seconds** + */ + recordHttpRequest(method: string, route: string, statusCode: number, duration: number): void { + this.httpRequestDuration.observe({ method, route, status_code: statusCode }, duration); + } + + // ── Recording helpers – Database ────────────────────────────────────────── + + /** + * Observes a database query duration. + * + * @param queryType SQL verb (SELECT, INSERT, UPDATE, DELETE, OTHER) + * @param table Primary table name + * @param duration Duration in **seconds** + */ + recordDbQuery(queryType: string, table: string, duration: number): void { + this.dbQueryDuration.observe({ query_type: queryType, table }, duration); + } + + // ── Recording helpers – Users ───────────────────────────────────────────── + + recordUserRegistration(userType: string, source: string): void { + this.userRegistrations.inc({ user_type: userType, source }); + } + + updateActiveUsers(role: string, count: number): void { + this.activeUsers.set({ role }, count); + } + + // ── Recording helpers – Courses ─────────────────────────────────────────── + + recordCourseEnrollment(courseId: string, status: string): void { + this.courseEnrollments.inc({ course_id: courseId, status }); + } + + updateCourseCompletionRate(courseId: string, rate: number): void { + this.courseCompletionRate.set({ course_id: courseId }, rate); + } + + // ── Recording helpers – Assessments ────────────────────────────────────── + + recordAssessmentCompletion(assessmentType: string, difficulty: string): void { + this.assessmentCompletions.inc({ assessment_type: assessmentType, difficulty }); + } + + // ── Recording helpers – Learning Paths ─────────────────────────────────── + + updateLearningPathProgress(pathId: string, userId: string, progress: number): void { + this.learningPathProgress.set({ path_id: pathId, user_id: userId }, progress); + } + + // ── Recording helpers – Payments ───────────────────────────────────────── + + recordPaymentTransaction(paymentMethod: string, status: string): void { + this.paymentTransactions.inc({ payment_method: paymentMethod, status }); + } + + // ── Recording helpers – Cache ───────────────────────────────────────────── + + updateCacheHitRate(cacheType: string, hitRate: number): void { + this.cacheHitRate.set({ cache_type: cacheType }, hitRate); + } + + // ── Recording helpers – Queues ──────────────────────────────────────────── + + recordQueueProcessingTime(queueName: string, jobType: string, duration: number): void { + this.queueProcessingTime.observe({ queue_name: queueName, job_type: jobType }, duration); + } + + // ── Recording helpers – Email ───────────────────────────────────────────── + + recordEmailCampaignSent(campaignType: string, status: string): void { + this.emailCampaignsSent.inc({ campaign_type: campaignType, status }); + } + + // ── Recording helpers – Backup ──────────────────────────────────────────── - // HTTP Request Duration + recordBackupOperation(operationType: string, status: string): void { + this.backupOperations.inc({ operation_type: operationType, status }); + } + + // ── Recording helpers – API Errors ──────────────────────────────────────── + + recordApiError(route: string, errorCode: string): void { + this.apiErrors.inc({ route, error_code: errorCode }); + } + + // ── Private – metric registration ───────────────────────────────────────── + + private initialiseMetrics(): void { + // HTTP this.httpRequestDuration = new Histogram({ name: 'http_request_duration_seconds', help: 'Duration of HTTP requests in seconds', labelNames: ['method', 'route', 'status_code'], - buckets: [0.1, 0.3, 0.5, 1, 1.5, 2, 5], + buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], registers: [this.registry], }); - // Database Query Duration + // Database – query durations this.dbQueryDuration = new Histogram({ name: 'db_query_duration_seconds', help: 'Duration of database queries in seconds', labelNames: ['query_type', 'table'], - buckets: [0.01, 0.05, 0.1, 0.5, 1, 2], + buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5], registers: [this.registry], }); - // Active Connections (Example of custom gauge) + // Database – connections this.activeConnections = new Gauge({ - name: 'active_connections_count', - help: 'Number of active connections', + name: 'db_active_connections', + help: 'Number of currently active database connections', registers: [this.registry], }); - // DB connection pool metrics (#274) this.dbPoolConnectionsAcquired = new Counter({ name: 'db_pool_connections_acquired_total', - help: 'Total number of DB pool connections acquired', + help: 'Total number of DB pool connections acquired since startup', registers: [this.registry], }); this.dbPoolConnectionsReleased = new Counter({ name: 'db_pool_connections_released_total', - help: 'Total number of DB pool connections released', + help: 'Total number of DB pool connections released since startup', registers: [this.registry], }); this.dbPoolSize = new Gauge({ name: 'db_pool_size', - help: 'Current DB connection pool size (active + idle)', + help: 'Current total DB connection pool size (active + idle)', + registers: [this.registry], + }); + + this.dbPoolIdleConnections = new Gauge({ + name: 'db_pool_idle_connections', + help: 'Number of idle (available) connections in the DB pool', registers: [this.registry], }); - // User Registrations Counter + this.dbPoolPendingRequests = new Gauge({ + name: 'db_pool_pending_requests', + help: 'Number of requests waiting for a free DB pool connection', + registers: [this.registry], + }); + + // Users this.userRegistrations = new Counter({ name: 'user_registrations_total', help: 'Total number of user registrations', @@ -80,7 +283,29 @@ export class MetricsCollectionService implements OnModuleInit { registers: [this.registry], }); - // Assessment Completions Counter + this.activeUsers = new Gauge({ + name: 'active_users', + help: 'Number of currently active users by role', + labelNames: ['role'], + registers: [this.registry], + }); + + // Courses + this.courseEnrollments = new Counter({ + name: 'course_enrollments_total', + help: 'Total number of course enrolments', + labelNames: ['course_id', 'status'], + registers: [this.registry], + }); + + this.courseCompletionRate = new Gauge({ + name: 'course_completion_rate_percentage', + help: 'Completion rate percentage for a course (0–100)', + labelNames: ['course_id'], + registers: [this.registry], + }); + + // Assessments this.assessmentCompletions = new Counter({ name: 'assessment_completions_total', help: 'Total number of assessment completions', @@ -88,7 +313,7 @@ export class MetricsCollectionService implements OnModuleInit { registers: [this.registry], }); - // Learning Path Progress Gauge + // Learning paths this.learningPathProgress = new Gauge({ name: 'learning_path_progress_percentage', help: 'Average learning path progress percentage', @@ -96,7 +321,15 @@ export class MetricsCollectionService implements OnModuleInit { registers: [this.registry], }); - // Cache Hit Rate Gauge + // Payments + this.paymentTransactions = new Counter({ + name: 'payment_transactions_total', + help: 'Total number of payment transactions', + labelNames: ['payment_method', 'status'], + registers: [this.registry], + }); + + // Cache this.cacheHitRate = new Gauge({ name: 'cache_hit_rate_percentage', help: 'Cache hit rate percentage', @@ -104,16 +337,16 @@ export class MetricsCollectionService implements OnModuleInit { registers: [this.registry], }); - // Queue Processing Time Histogram + // Queues this.queueProcessingTime = new Histogram({ name: 'queue_processing_duration_seconds', help: 'Duration of queue job processing in seconds', labelNames: ['queue_name', 'job_type'], - buckets: [0.1, 0.5, 1, 2, 5, 10, 30], + buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60], registers: [this.registry], }); - // Email Campaigns Sent Counter + // Email this.emailCampaignsSent = new Counter({ name: 'email_campaigns_sent_total', help: 'Total number of email campaigns sent', @@ -121,7 +354,7 @@ export class MetricsCollectionService implements OnModuleInit { registers: [this.registry], }); - // Backup Operations Counter + // Backup this.backupOperations = new Counter({ name: 'backup_operations_total', help: 'Total number of backup operations', @@ -129,142 +362,12 @@ export class MetricsCollectionService implements OnModuleInit { registers: [this.registry], }); - // Request Timeouts Counter - this.requestTimeouts = new Counter({ - name: 'http_request_timeouts_total', - help: 'Total number of HTTP request timeouts', - labelNames: ['route'], + // API Errors + this.apiErrors = new Counter({ + name: 'api_errors_total', + help: 'Total number of API errors (HTTP 4xx/5xx)', + labelNames: ['route', 'error_code'], registers: [this.registry], }); } - - /** - * Executes on Module Init. - * @returns The operation result. - */ - onModuleInit() { - // Collect default system metrics (CPU, Memory, Event Loop, etc.) - collectDefaultMetrics({ register: this.registry }); - } - - /** - * Retrieves registry. - * @returns The resulting registry. - */ - getRegistry(): Registry { - return this.registry; - } - - /** - * Retrieves metrics. - * @returns The resulting string value. - */ - async getMetrics(): Promise { - return this.registry.metrics(); - } - - /** - * Records http Request. - * @param method The method. - * @param route The route. - * @param statusCode The status value. - * @param duration The duration. - * @returns The operation result. - */ - recordHttpRequest(method: string, route: string, statusCode: number, duration: number) { - this.httpRequestDuration.observe({ method, route, status_code: statusCode }, duration); - } - - /** - * Records db Query. - * @param queryType The query value. - * @param table The table. - * @param duration The duration. - * @returns The operation result. - */ - recordDbQuery(queryType: string, table: string, duration: number) { - this.dbQueryDuration.observe({ query_type: queryType, table }, duration); - } - - // Custom business metrics methods - /** - * Records user Registration. - * @param userType The user type. - * @param source The source. - * @returns The operation result. - */ - recordUserRegistration(userType: string, source: string) { - this.userRegistrations.inc({ user_type: userType, source }); - } - - /** - * Records assessment Completion. - * @param assessmentType The assessment type. - * @param difficulty The difficulty. - * @returns The operation result. - */ - recordAssessmentCompletion(assessmentType: string, difficulty: string) { - this.assessmentCompletions.inc({ assessment_type: assessmentType, difficulty }); - } - - /** - * Updates learning Path Progress. - * @param pathId The path identifier. - * @param userId The user identifier. - * @param progress The progress. - * @returns The operation result. - */ - updateLearningPathProgress(pathId: string, userId: string, progress: number) { - this.learningPathProgress.set({ path_id: pathId, user_id: userId }, progress); - } - - /** - * Updates cache Hit Rate. - * @param cacheType The cache type. - * @param hitRate The hit rate. - * @returns The operation result. - */ - updateCacheHitRate(cacheType: string, hitRate: number) { - this.cacheHitRate.set({ cache_type: cacheType }, hitRate); - } - - /** - * Records queue Processing Time. - * @param queueName The queue name. - * @param jobType The job type. - * @param duration The duration. - * @returns The operation result. - */ - recordQueueProcessingTime(queueName: string, jobType: string, duration: number) { - this.queueProcessingTime.observe({ queue_name: queueName, job_type: jobType }, duration); - } - - /** - * Records email Campaign Sent. - * @param campaignType The campaign type. - * @param status The status value. - * @returns The operation result. - */ - recordEmailCampaignSent(campaignType: string, status: string) { - this.emailCampaignsSent.inc({ campaign_type: campaignType, status }); - } - - /** - * Records backup Operation. - * @param operationType The operation type. - * @param status The status value. - * @returns The operation result. - */ - recordBackupOperation(operationType: string, status: string) { - this.backupOperations.inc({ operation_type: operationType, status }); - } - - /** - * Records request Timeout. - * @param route The route. - * @returns The operation result. - */ - recordRequestTimeout(route: string) { - this.requestTimeouts.inc({ route }); - } } diff --git a/src/monitoring/metrics/prometheus.controller.spec.ts b/src/monitoring/metrics/prometheus.controller.spec.ts new file mode 100644 index 00000000..d0f0859d --- /dev/null +++ b/src/monitoring/metrics/prometheus.controller.spec.ts @@ -0,0 +1,171 @@ +import { Test, TestingModule } from '@nestjs/testing'; +import { UnauthorizedException } from '@nestjs/common'; +import { Request, Response } from 'express'; +import { PrometheusController } from './prometheus.controller'; +import { MetricsCollectionService } from './metrics-collection.service'; + +/** Minimal mock for MetricsCollectionService */ +const mockMetricsCollectionService = { + getMetrics: jest.fn(), +}; + +/** Creates a minimal fake Express Request */ +function buildRequest(overrides: Partial = {}): Request { + return { + headers: {}, + ip: '127.0.0.1', + method: 'GET', + path: '/metrics', + ...overrides, + } as unknown as Request; +} + +/** Creates a minimal fake Express Response with spies */ +function buildResponse(): Response & { + setHeaderSpy: jest.Mock; + statusSpy: jest.Mock; + sendSpy: jest.Mock; +} { + const sendSpy = jest.fn(); + const statusSpy = jest.fn().mockReturnValue({ send: sendSpy }); + const setHeaderSpy = jest.fn(); + + return { + setHeader: setHeaderSpy, + status: statusSpy, + setHeaderSpy, + statusSpy, + sendSpy, + } as unknown as Response & { + setHeaderSpy: jest.Mock; + statusSpy: jest.Mock; + sendSpy: jest.Mock; + }; +} + +describe('PrometheusController', () => { + let controller: PrometheusController; + + beforeEach(async () => { + jest.clearAllMocks(); + delete process.env.METRICS_AUTH_TOKEN; + + const module: TestingModule = await Test.createTestingModule({ + controllers: [PrometheusController], + providers: [ + { provide: MetricsCollectionService, useValue: mockMetricsCollectionService }, + ], + }).compile(); + + controller = module.get(PrometheusController); + }); + + afterEach(() => { + delete process.env.METRICS_AUTH_TOKEN; + }); + + // ── Happy-path tests ────────────────────────────────────────────────────── + + describe('GET /metrics – unauthenticated (no token configured)', () => { + it('returns 200 with Prometheus text body', async () => { + const metricText = '# HELP process_cpu_seconds_total\n# TYPE counter\n'; + mockMetricsCollectionService.getMetrics.mockResolvedValue(metricText); + + const req = buildRequest(); + const res = buildResponse(); + + await controller.getMetrics(req, res); + + expect(res.setHeader).toHaveBeenCalledWith( + 'Content-Type', + 'text/plain; version=0.0.4; charset=utf-8', + ); + expect(res.status).toHaveBeenCalledWith(200); + expect(res.statusSpy().send).toHaveBeenCalledWith(metricText); + }); + + it('returns 500 when getMetrics throws', async () => { + mockMetricsCollectionService.getMetrics.mockRejectedValue(new Error('prom-client error')); + + const req = buildRequest(); + const res = buildResponse(); + + await controller.getMetrics(req, res); + + expect(res.status).toHaveBeenCalledWith(500); + }); + }); + + // ── Auth-enabled tests ──────────────────────────────────────────────────── + + describe('GET /metrics – with METRICS_AUTH_TOKEN configured', () => { + const TOKEN = 'super-secret-scrape-token'; + + beforeEach(async () => { + process.env.METRICS_AUTH_TOKEN = TOKEN; + + // Re-create the module so the controller reads the updated env var + const module: TestingModule = await Test.createTestingModule({ + controllers: [PrometheusController], + providers: [ + { provide: MetricsCollectionService, useValue: mockMetricsCollectionService }, + ], + }).compile(); + + controller = module.get(PrometheusController); + }); + + it('returns 200 when bearer token matches', async () => { + mockMetricsCollectionService.getMetrics.mockResolvedValue('# metric data\n'); + + const req = buildRequest({ + headers: { authorization: `Bearer ${TOKEN}` }, + }); + const res = buildResponse(); + + await expect(controller.getMetrics(req, res)).resolves.toBeUndefined(); + expect(res.status).toHaveBeenCalledWith(200); + }); + + it('throws UnauthorizedException when Authorization header is missing', async () => { + const req = buildRequest({ headers: {} }); + const res = buildResponse(); + + await expect(controller.getMetrics(req, res)).rejects.toBeInstanceOf(UnauthorizedException); + }); + + it('throws UnauthorizedException when bearer token is wrong', async () => { + const req = buildRequest({ + headers: { authorization: 'Bearer wrong-token' }, + }); + const res = buildResponse(); + + await expect(controller.getMetrics(req, res)).rejects.toBeInstanceOf(UnauthorizedException); + }); + + it('throws UnauthorizedException when Authorization format is not Bearer', async () => { + const req = buildRequest({ + headers: { authorization: `Basic ${TOKEN}` }, + }); + const res = buildResponse(); + + await expect(controller.getMetrics(req, res)).rejects.toBeInstanceOf(UnauthorizedException); + }); + }); + + // ── Content-type header ─────────────────────────────────────────────────── + + it('sets the correct Prometheus content-type header', async () => { + mockMetricsCollectionService.getMetrics.mockResolvedValue(''); + + const req = buildRequest(); + const res = buildResponse(); + + await controller.getMetrics(req, res); + + expect(res.setHeader).toHaveBeenCalledWith( + 'Content-Type', + 'text/plain; version=0.0.4; charset=utf-8', + ); + }); +}); diff --git a/src/monitoring/metrics/prometheus.controller.ts b/src/monitoring/metrics/prometheus.controller.ts new file mode 100644 index 00000000..77e0aaea --- /dev/null +++ b/src/monitoring/metrics/prometheus.controller.ts @@ -0,0 +1,86 @@ +import { Controller, Get, Header, Logger, Req, Res, UnauthorizedException } from '@nestjs/common'; +import { ApiExcludeEndpoint, ApiOperation, ApiResponse, ApiTags } from '@nestjs/swagger'; +import { Request, Response } from 'express'; +import { MetricsCollectionService } from './metrics-collection.service'; + +/** + * Exposes a Prometheus-compatible `/metrics` scrape endpoint. + * + * The endpoint returns metrics in the standard Prometheus text exposition + * format (text/plain; version=0.0.4). + * + * Optional bearer-token protection: + * Set METRICS_AUTH_TOKEN in the environment to require + * `Authorization: Bearer ` on every scrape request. + * Leave blank / unset to allow unauthenticated access (suitable for + * internal network scraping where the endpoint is not publicly routed). + */ +@ApiTags('Metrics') +@Controller() +export class PrometheusController { + private readonly logger = new Logger(PrometheusController.name); + private readonly authToken: string | undefined; + + constructor(private readonly metricsCollectionService: MetricsCollectionService) { + this.authToken = process.env.METRICS_AUTH_TOKEN || undefined; + } + + /** + * Prometheus scrape endpoint. + * + * Returns all registered metrics (default Node.js system metrics + custom + * TeachLink business metrics) in Prometheus text format. + */ + @Get('metrics') + @Header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8') + @ApiOperation({ + summary: 'Prometheus metrics scrape endpoint', + description: + 'Returns all registered Prometheus metrics in text exposition format. ' + + 'Includes default Node.js runtime metrics (CPU, memory, event loop lag) ' + + 'plus custom TeachLink business metrics.', + }) + @ApiResponse({ + status: 200, + description: 'Prometheus metrics in text/plain exposition format', + }) + @ApiResponse({ status: 401, description: 'Unauthorized – invalid or missing bearer token' }) + @ApiExcludeEndpoint(false) + async getMetrics(@Req() req: Request, @Res() res: Response): Promise { + this.assertAuthorized(req); + + try { + const metrics = await this.metricsCollectionService.getMetrics(); + res.setHeader('Content-Type', 'text/plain; version=0.0.4; charset=utf-8'); + res.status(200).send(metrics); + } catch (error) { + this.logger.error( + 'Failed to collect Prometheus metrics', + error instanceof Error ? error.stack : String(error), + ); + res.status(500).send('# Error collecting metrics\n'); + } + } + + /** + * Validates bearer token when METRICS_AUTH_TOKEN is configured. + * Throws UnauthorizedException on failure. + */ + private assertAuthorized(req: Request): void { + if (!this.authToken) { + // No token configured – open access + return; + } + + const authHeader = req.headers['authorization'] as string | undefined; + if (!authHeader || !authHeader.startsWith('Bearer ')) { + throw new UnauthorizedException('Metrics endpoint requires a valid bearer token'); + } + + const provided = authHeader.slice('Bearer '.length).trim(); + if (provided !== this.authToken) { + this.logger.warn(`Metrics scrape rejected – invalid token from ${req.ip}`); + throw new UnauthorizedException('Invalid metrics bearer token'); + } + } +} diff --git a/src/monitoring/monitoring.module.ts b/src/monitoring/monitoring.module.ts index b1387020..7058e364 100644 --- a/src/monitoring/monitoring.module.ts +++ b/src/monitoring/monitoring.module.ts @@ -1,13 +1,61 @@ -import { Module } from '@nestjs/common'; +import { MiddlewareConsumer, Module, NestModule, RequestMethod } from '@nestjs/common'; import { ConfigModule } from '@nestjs/config'; +import { ScheduleModule } from '@nestjs/schedule'; +import { TypeOrmModule } from '@nestjs/typeorm'; import { AlertingService } from './alerting/alerting.service'; import { MetricsCollectionService } from './metrics/metrics-collection.service'; import { CustomMetricsService } from './custom-metrics.service'; +import { PrometheusController } from './metrics/prometheus.controller'; +import { HttpMetricsMiddleware } from './metrics/http-metrics.middleware'; +import { DbMetricsSubscriber } from './metrics/db-metrics.subscriber'; +import { DbPoolMetricsCollector } from './metrics/db-pool-metrics.collector'; import { CommonModule } from '../common/common.module'; +/** + * MonitoringModule + * + * Wires together all observability infrastructure: + * + * - PrometheusController → exposes GET /metrics for Prometheus scraping + * - HttpMetricsMiddleware → auto-records HTTP request durations + * - MetricsCollectionService → central prom-client registry + helper methods + * - DbMetricsSubscriber → TypeORM subscriber for per-query timing + * - DbPoolMetricsCollector → scheduled pool-stat poller + * - AlertingService → threshold-based alerting (email / Slack) + * - CustomMetricsService → in-memory custom business metric aggregation + */ @Module({ - imports: [ConfigModule, CommonModule], - providers: [AlertingService, MetricsCollectionService, CustomMetricsService], - exports: [AlertingService, MetricsCollectionService, CustomMetricsService], + imports: [ + ConfigModule, + ScheduleModule.forRoot(), + TypeOrmModule, + CommonModule, + ], + controllers: [PrometheusController], + providers: [ + AlertingService, + MetricsCollectionService, + CustomMetricsService, + DbMetricsSubscriber, + DbPoolMetricsCollector, + ], + exports: [ + AlertingService, + MetricsCollectionService, + CustomMetricsService, + DbMetricsSubscriber, + DbPoolMetricsCollector, + ], }) -export class MonitoringModule {} +export class MonitoringModule implements NestModule { + /** + * Applies the HTTP metrics middleware to all routes except the scrape + * endpoint itself – avoiding circular observation of /metrics requests. + */ + configure(consumer: MiddlewareConsumer): void { + consumer + .apply(HttpMetricsMiddleware) + .exclude({ path: 'metrics', method: RequestMethod.GET }) + .forRoutes({ path: '*', method: RequestMethod.ALL }); + } +}