Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions .github/workflows/frontend-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
name: Frontend CI

on:
  push:
    branches: [main, develop]
    paths:
      - 'app/frontend/**'
      - '.github/workflows/frontend-ci.yml'
  pull_request:
    branches: [main, develop]
    paths:
      - 'app/frontend/**'
      - '.github/workflows/frontend-ci.yml'

# Least privilege: this workflow only needs to read the repository.
permissions:
  contents: read

jobs:
  test:
    name: Frontend tests & accessibility audit
    runs-on: ubuntu-latest
    defaults:
      run:
        # Applies to `run:` steps only. `uses:` steps and expression
        # functions such as hashFiles() still resolve paths from the
        # repository root.
        working-directory: app/frontend

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Setup pnpm
        uses: pnpm/action-setup@v4
        with:
          version: 9

      - name: Get pnpm store directory
        id: pnpm-cache
        run: echo "store-dir=$(pnpm store path --silent)" >> "$GITHUB_OUTPUT"

      - name: Cache pnpm modules
        uses: actions/cache@v4
        with:
          path: ${{ steps.pnpm-cache.outputs.store-dir }}
          # hashFiles() is evaluated relative to the workspace root, so the
          # lockfile must be addressed with its full in-repo path. A bare
          # 'pnpm-lock.yaml' matches nothing here and hashes to an empty
          # string, which would freeze the cache key forever.
          key: ${{ runner.os }}-pnpm-${{ hashFiles('app/frontend/pnpm-lock.yaml') }}
          restore-keys: |
            ${{ runner.os }}-pnpm-

      - name: Install dependencies
        run: pnpm install --frozen-lockfile

      - name: Type check
        run: pnpm type-check

      - name: Lint
        run: pnpm lint --max-warnings=0

      - name: Run unit & jest-axe accessibility tests
        # --ci stops Jest from writing new snapshots automatically, so a
        # missing snapshot fails the run instead of being silently created.
        run: pnpm jest --ci --colors=false

      - name: Upload test summary
        # Runs even when an earlier step failed so results stay inspectable.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: frontend-test-summary
          # Artifact paths are workspace-relative (not working-directory
          # relative). Nothing is uploaded, without error, when the
          # directory does not exist.
          path: |
            app/frontend/coverage
          if-no-files-found: ignore
          retention-days: 7
83 changes: 83 additions & 0 deletions .github/workflows/lighthouse.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
name: Lighthouse CI (Scheduled)

on:
  schedule:
    # 00:30 UTC sidesteps the global GitHub Actions queue spike at
    # midnight and ensures log rotations have completed first.
    - cron: '30 0 * * *'
  # Allow ops to trigger a fresh run on demand from the Actions tab
  # without waiting for the next scheduled window.
  workflow_dispatch:

# Cancel any in-flight run if the schedule slips or someone hit
# "Re-run all jobs" twice.
concurrency:
  group: scheduled-lighthouse
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  audit:
    name: Daily Lighthouse profile
    runs-on: ubuntu-latest

    env:
      NEXT_TELEMETRY_DISABLED: '1'
      # NODE_ENV is deliberately NOT set at job level: pnpm skips
      # devDependencies when NODE_ENV=production, which would leave the
      # build and audit tooling uninstalled. It is set per step below.

    defaults:
      run:
        working-directory: app/frontend

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        id: node-version
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Setup pnpm
        uses: pnpm/action-setup@v4
        with:
          version: 9

      - name: Get pnpm store directory
        id: pnpm-cache
        run: echo "store-dir=$(pnpm store path --silent)" >> "$GITHUB_OUTPUT"

      - name: Cache pnpm modules
        uses: actions/cache@v4
        with:
          path: ${{ steps.pnpm-cache.outputs.store-dir }}
          # Node version is part of the key so a Node upgrade doesn't poison
          # the restore-keys fallback with a stale store layout.
          key: ${{ runner.os }}-node${{ steps.node-version.outputs.node-version }}-pnpm-lighthouse-${{ hashFiles('app/frontend/pnpm-lock.yaml') }}
          restore-keys: |
            ${{ runner.os }}-node${{ steps.node-version.outputs.node-version }}-pnpm-lighthouse-

      - name: Install dependencies
        # Runs without NODE_ENV=production so devDependencies are installed.
        run: pnpm install --frozen-lockfile

      - name: Build production bundle
        # pnpm start will refuse to boot if .next/ is missing, so a
        # successful build step is a hard precondition.
        env:
          NODE_ENV: production
        run: pnpm build

      - name: Run Lighthouse CI
        # Lighthouse CI drives a locally installed Chrome rather than
        # bundling a browser; this relies on the Chrome that the
        # ubuntu-latest runner image provides, so no separate browser
        # setup step is needed.
        env:
          NODE_ENV: production
        run: pnpm lhci:autorun

      - name: Upload Lighthouse reports
        # Keep reports even when the audit step fails its assertions.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: lighthouse-reports
          path: app/frontend/lhci-reports
          retention-days: 14
          if-no-files-found: warn
31 changes: 31 additions & 0 deletions app/ai-service/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,37 @@
INFERENCE_LATENCY = Histogram('inference_latency_seconds', 'Inference latency in seconds', ['task_type'])
PIPELINE_STEP_LATENCY = Histogram('pipeline_step_latency_seconds', 'Pipeline step latency in seconds', ['step_name'])

# Numeric encoding of the breaker states. Exported so callers (and tests)
# can compare against the gauge value without hard-coding literals.
CIRCUIT_STATE_CLOSED = 0     # healthy
CIRCUIT_STATE_HALF_OPEN = 1  # probing
CIRCUIT_STATE_OPEN = 2       # failing fast

# Circuit breaker metrics, each labelled by 'breaker_name'.

# Current state as a number (see the constants above) so it can be plotted
# over time.
CIRCUIT_STATE = Gauge(
    'circuit_breaker_state',
    'Circuit breaker state (0=CLOSED, 1=HALF_OPEN, 2=OPEN)',
    ['breaker_name'],
)

# Running total of failures recorded by a breaker.
CIRCUIT_FAILURE_COUNT = Counter(
    'circuit_breaker_failure_count_total',
    'Total failures recorded by the circuit breaker',
    ['breaker_name'],
)

# Distribution of how long a breaker stayed OPEN before moving to HALF_OPEN.
CIRCUIT_RECOVERY_TIME = Histogram(
    'circuit_breaker_recovery_time_seconds',
    'Time spent in the OPEN state before transitioning to HALF_OPEN',
    ['breaker_name'],
)


def set_circuit_state(breaker_name: str, state_value: int) -> None:
    """
    Write ``state_value`` to the circuit-state gauge for one breaker.

    Args:
        breaker_name: Value of the ``breaker_name`` label selecting the series.
        state_value: Number to store on the gauge; the module's
            ``CIRCUIT_STATE_*`` constants define the encoding (0/1/2).
    """
    labelled_gauge = CIRCUIT_STATE.labels(breaker_name=breaker_name)
    labelled_gauge.set(state_value)


def check_system_resources(memory_threshold_percent: float = 90.0) -> bool:
"""
Check if system RAM or VRAM is above threshold.
Expand Down
32 changes: 30 additions & 2 deletions app/ai-service/services/circuit_breaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,49 @@
import logging
from threading import Lock

from metrics import (
CIRCUIT_STATE_CLOSED,
CIRCUIT_STATE_HALF_OPEN,
CIRCUIT_STATE_OPEN,
CIRCUIT_FAILURE_COUNT,
CIRCUIT_RECOVERY_TIME,
set_circuit_state,
)

logger = logging.getLogger(__name__)


class CircuitBreaker:
"""
A thread-safe implementation of the Circuit Breaker pattern.

States:
- CLOSED: Normal operation. Requests flow through.
- OPEN: Service is failing. Requests fail-fast (return False/raise error).
- HALF_OPEN: Recovery window elapsed. Allow a request to test downstream health.

The breaker publishes Prometheus metrics on every state change:
- CIRCUIT_STATE (Gauge): current state, encoded as 0/1/2.
- CIRCUIT_FAILURE_COUNT (Counter): cumulative failure count.
- CIRCUIT_RECOVERY_TIME (Histogram): time spent OPEN before HALF_OPEN.
Metric updates happen inside the same lock that guards state, so the
exported values can never diverge from the underlying state.
"""

def __init__(self, name: str, failure_threshold: int = 3, recovery_timeout: float = 30.0):
    """
    Create a breaker that starts in the CLOSED state.

    Args:
        name: Identifier for this breaker; also used as the metric label.
        failure_threshold: Failure count at which the breaker opens.
        recovery_timeout: Seconds to stay OPEN before a probe is allowed.
    """
    # Fixed configuration.
    self.name = name
    self.failure_threshold = failure_threshold
    self.recovery_timeout = recovery_timeout

    # Mutable runtime state; self._lock is the lock intended to guard it.
    self._lock = Lock()
    self.failure_count = 0
    self.state = "CLOSED"  # one of: CLOSED, OPEN, HALF_OPEN
    self.last_state_change = time.time()

    # Export the starting state right away so the gauge has a value for
    # this breaker before any request has been handled.
    set_circuit_state(name, CIRCUIT_STATE_CLOSED)

def allow_request(self) -> bool:
"""
Check if a request is allowed to proceed.
Expand All @@ -34,6 +54,9 @@ def allow_request(self) -> bool:
now = time.time()
if self.state == "OPEN":
if now - self.last_state_change >= self.recovery_timeout:
# Capture recovery time BEFORE updating last_state_change,
# so the histogram reflects how long we were actually OPEN.
recovery_seconds = now - self.last_state_change
logger.info(
"Circuit breaker for provider '%s' transitioning from OPEN to HALF_OPEN "
"(recovery timeout %ss elapsed)",
Expand All @@ -42,6 +65,8 @@ def allow_request(self) -> bool:
)
self.state = "HALF_OPEN"
self.last_state_change = now
set_circuit_state(self.name, CIRCUIT_STATE_HALF_OPEN)
CIRCUIT_RECOVERY_TIME.labels(breaker_name=self.name).observe(recovery_seconds)
return True
return False
return True
Expand All @@ -62,6 +87,7 @@ def record_success(self) -> None:
self.state = "CLOSED"
self.failure_count = 0
self.last_state_change = now
set_circuit_state(self.name, CIRCUIT_STATE_CLOSED)
elif self.state == "CLOSED":
self.failure_count = 0

Expand All @@ -73,6 +99,7 @@ def record_failure(self) -> None:
with self._lock:
now = time.time()
self.failure_count += 1
CIRCUIT_FAILURE_COUNT.labels(breaker_name=self.name).inc()
if self.state == "HALF_OPEN" or self.failure_count >= self.failure_threshold:
logger.warning(
"Circuit breaker for provider '%s' transitioning from %s to OPEN "
Expand All @@ -84,3 +111,4 @@ def record_failure(self) -> None:
)
self.state = "OPEN"
self.last_state_change = now
set_circuit_state(self.name, CIRCUIT_STATE_OPEN)
Loading