diff --git a/services/otel-gateway/Dockerfile b/services/otel-gateway/Dockerfile index 1db2661..354d6c7 100644 --- a/services/otel-gateway/Dockerfile +++ b/services/otel-gateway/Dockerfile @@ -1,10 +1,12 @@ # we use the contrib image for basicauth and other bits # # This image is build FROM scratch, so doesn't include *any* tools: -# +# # https://github.com/open-telemetry/opentelemetry-collector-releases/blob/main/distributions/otelcol/Dockerfile # -FROM otel/opentelemetry-collector-contrib:0.110.0 +FROM otel/opentelemetry-collector-contrib:0.146.0 AS otelcol-base + +FROM otelcol-base AS otel-gateway LABEL org.opencontainers.image.authors="tech@opensafely.org" \ org.opencontainers.image.url="opensafely.org" \ @@ -14,11 +16,18 @@ LABEL org.opencontainers.image.authors="tech@opensafely.org" \ # default config, can be overridden at runtime ENV HONEYCOMB_ENDPOINT="https://api.honeycomb.io" ENV METRICS_DATASET="jobrunner-metrics" +# log level for the collector's telemetry ENV LOG_LEVEL="info" +# verbosity level for the debug exporter +ENV LOG_VERBOSITY="normal" # these must be provided at runtime # ENV HONEYCOMB_KEY # ENV BASIC_AUTH_USER # ENV BASIC_AUTH_PASSWORD +# app.json must be in the WORKDIR, so make the implicit /app explicit +WORKDIR /app + +COPY app.json /app/app.json COPY config.yaml /etc/otelcol-contrib/config.yaml diff --git a/services/otel-gateway/README.md b/services/otel-gateway/README.md index 4203c35..4dcab85 100644 --- a/services/otel-gateway/README.md +++ b/services/otel-gateway/README.md @@ -83,6 +83,19 @@ dokku git:from-image otel-gateway ghcr.io/opensafely-core/otel-gateway:latest dokku logs otel-gateway ``` +## Health checks + +Dokku deploy checks are configured in [`app.json`](app.json) using the collector +health endpoint at `:13133/healthz`. The file is copied into the image during +build so `dokku git:from-image` can use it. 
+ +Useful commands on dokku: + +```bash +dokku checks:report otel-gateway +dokku checks:run otel-gateway +``` + ## Dokku app set up diff --git a/services/otel-gateway/app.json b/services/otel-gateway/app.json new file mode 100644 index 0000000..fbc3483 --- /dev/null +++ b/services/otel-gateway/app.json @@ -0,0 +1,14 @@ +{ + "healthchecks": { + "web": [ + { + "type": "startup", + "path": "/healthz", + "port": 13133, + "wait": 2, + "timeout": 5, + "attempts": 15 + } + ] + } +} diff --git a/services/otel-gateway/config.yaml b/services/otel-gateway/config.yaml index 37a2ead..c78f74a 100644 --- a/services/otel-gateway/config.yaml +++ b/services/otel-gateway/config.yaml @@ -5,6 +5,9 @@ extensions: htpasswd: inline: | ${BASIC_AUTH_USER}:${BASIC_AUTH_PASSWORD} + health_check: + endpoint: 0.0.0.0:13133 + path: /healthz receivers: otlp: @@ -18,8 +21,12 @@ processors: batch: exporters: - logging: - loglevel: "${LOG_LEVEL}" + # The debug exporter replaces the old logging exporter and takes a verbosity + # argument instead of a log level (detailed/normal/basic vs debug/info/warn). 
+ # https://github.com/open-telemetry/opentelemetry-collector/issues/11337 + # https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter/debugexporter + debug: + verbosity: "${LOG_VERBOSITY}" otlphttp/traces: endpoint: "${HONEYCOMB_ENDPOINT}" @@ -39,13 +46,13 @@ service: telemetry: logs: level: "${LOG_LEVEL}" - extensions: [basicauth/server] + extensions: [basicauth/server, health_check] pipelines: traces: receivers: [otlp] processors: [batch] - exporters: [otlphttp/traces, logging] + exporters: [otlphttp/traces, debug] metrics: receivers: [otlp] processors: [batch] - exporters: [otlphttp/metrics, logging] + exporters: [otlphttp/metrics, debug] diff --git a/services/otel-gateway/docker-compose.yml b/services/otel-gateway/docker-compose.yml new file mode 100644 index 0000000..e8be830 --- /dev/null +++ b/services/otel-gateway/docker-compose.yml @@ -0,0 +1,39 @@ +services: + otel-gateway: + build: + context: . + target: otel-gateway + image: otel-gateway + environment: + BASIC_AUTH_USER: ${BASIC_AUTH_USER} + BASIC_AUTH_PASSWORD: ${BASIC_AUTH_PASSWORD} + HONEYCOMB_KEY: ${HONEYCOMB_KEY} + HONEYCOMB_ENDPOINT: ${HONEYCOMB_ENDPOINT:-https://api.honeycomb.io} + LOG_LEVEL: ${LOG_LEVEL:-info} + LOG_VERBOSITY: ${LOG_VERBOSITY:-normal} + ports: + # otlp + - "4318:4318" + # http health check + - "13133:13133" + extra_hosts: + - "host.docker.internal:host-gateway" + networks: + - otel-test-net + + mock-honeycomb: + build: + context: . 
+ target: otelcol-base + image: otel-gateway-mock + user: "${LOCAL_UID:-1000}:${LOCAL_GID:-1000}" + volumes: + - ./mock-honeycomb-config.yaml:/etc/otelcol-contrib/config.yaml:ro + - ./exported/honeycomb:/exported + ports: + - "4319:4318" + networks: + - otel-test-net + +networks: + otel-test-net: diff --git a/services/otel-gateway/justfile b/services/otel-gateway/justfile index f025082..03aba87 100644 --- a/services/otel-gateway/justfile +++ b/services/otel-gateway/justfile @@ -1,8 +1,11 @@ set dotenv-load := true -export IMAGE_NAME := "otel-gateway" export DOCKER_BUILDKIT := "1" +# when running via just, ensure we have detailed logs for debugging tests +export LOG_LEVEL := "debug" +export LOG_VERBOSITY := "detailed" + # list available commands default: @"{{ just_executable() }}" --list @@ -94,66 +97,67 @@ fix: # build the docker image build: _dotenv - docker build . -t $IMAGE_NAME + docker compose build otel-gateway # run the gateway, in the foreground by default. run *args: _checkenv build - docker run --rm --name otel-gateway \ - -e BASIC_AUTH_USER=$BASIC_AUTH_USER \ - -e BASIC_AUTH_PASSWORD=$BASIC_AUTH_PASSWORD \ - -e HONEYCOMB_KEY \ - --network=otel-test-net \ - {{ args }} {{ IMAGE_NAME }} + #!/bin/bash + set -euo pipefail + + docker compose up --force-recreate --no-deps {{ args }} otel-gateway # run integration test. 
You will need a HONEYCOMB_KEY set in the environment test-integration: _checkenv #!/bin/bash set -euo pipefail - {{ just_executable() }} run -d -e LOG_LEVEL=debug -p 4318:4318 + trap 'docker compose stop otel-gateway >/dev/null 2>&1 || true' EXIT + + docker compose up -d --build --force-recreate --no-deps otel-gateway + docker compose ps --status running --services | grep -qx otel-gateway || { docker compose logs otel-gateway; exit 1; } + {{ just_executable() }} _wait_for_health otel-gateway "http://127.0.0.1:13133/healthz" {{ just_executable() }} run-python tests.py echo "Data sent to honeycomb" echo "https://ui.honeycomb.io/bennett-institute-for-applied-data-science/environments/development/datasets/otel-gateway-tests?query=%7B%22time_range%22%3A600%2C%22granularity%22%3A0%2C%22breakdowns%22%3A%5B%5D%2C%22calculations%22%3A%5B%5D%2C%22orders%22%3A%5B%5D%2C%22havings%22%3A%5B%5D%2C%22limit%22%3A100%7D" +_wait_for_health service url: + #!/bin/bash + set -euo pipefail + + if ! curl -fsS --retry 20 --retry-delay 1 --retry-connrefused --retry-all-errors "{{ url }}" >/dev/null 2>&1; then + echo "{{ service }} did not become healthy in time: {{ url }}" + docker compose logs "{{ service }}" + exit 1 + fi + _mock_honeycomb_start: #!/bin/bash set -euo pipefail mkdir -p exported/honeycomb - # run a different instance of a collector as a test endpoint - docker network create otel-test-net 2>/dev/null || true - docker run --rm -d -p 4319:4318 \ - --name mock-honeycomb -u "$(id -u):$(id -g)" \ - -v $PWD/mock-honeycomb-config.yaml:/etc/otelcol-contrib/config.yaml \ - -v $PWD/exported/honeycomb:/exported \ - --network=otel-test-net \ - otel/opentelemetry-collector-contrib:0.62.1 - test "$(docker inspect mock-honeycomb -f '{{{{.State.Status}}')" == "running" || { docker logs mock-honeycomb; exit 1; } + # we need these so we can read/write the files as the right user + export LOCAL_UID="$(id -u)" + export LOCAL_GID="$(id -g)" + + # point otel-gateway at the mock instance. 
+ export HONEYCOMB_ENDPOINT="http://mock-honeycomb:4318" + + docker compose up -d --build --force-recreate mock-honeycomb otel-gateway + docker compose ps --status running --services | grep -qx mock-honeycomb || { docker compose logs mock-honeycomb; exit 1; } + docker compose ps --status running --services | grep -qx otel-gateway || { docker compose logs otel-gateway; exit 1; } + {{ just_executable() }} _wait_for_health otel-gateway "http://127.0.0.1:13133/healthz" _mock_honeycomb_stop: - docker stop mock-honeycomb - docker network remove otel-test-net + docker compose stop otel-gateway mock-honeycomb # run tests against mock upstream servers test-ci: _checkenv _mock_honeycomb_start && _mock_honeycomb_stop #!/bin/bash set -euo pipefail - export HONEYCOMB_ENDPOINT="http://mock-honeycomb:4318" - - # run otel-gateway pointing at the test endpoint - {{ just_executable() }} run -d \ - -e HONEYCOMB_ENDPOINT \ - -e LOG_LEVEL=debug \ - -p 4318:4318 \ - --add-host=host.docker.internal:host-gateway - - test "$(docker inspect otel-gateway -f '{{{{.State.Status}}')" == "running" || { docker logs otel-gateway; exit 1; } {{ just_executable() }} run-python -m pytest tests.py - docker stop otel-gateway - # run a python script in the correct environment run-python *args: _checkenv #!/bin/bash @@ -161,7 +165,6 @@ run-python *args: _checkenv TOKEN=$(echo -n "$BASIC_AUTH_USER:$BASIC_AUTH_PASSWORD" | base64) - export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:4318" export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic%20$TOKEN" export OTEL_SERVICE_NAME="otel-gateway-tests" diff --git a/services/otel-gateway/mock-honeycomb-config.yaml b/services/otel-gateway/mock-honeycomb-config.yaml index 7898935..43e1734 100644 --- a/services/otel-gateway/mock-honeycomb-config.yaml +++ b/services/otel-gateway/mock-honeycomb-config.yaml @@ -2,13 +2,14 @@ receivers: otlp: protocols: http: + endpoint: 0.0.0.0:4318 processors: batch: exporters: - logging: - logLevel: debug + debug: + verbosity: 
detailed file/traces: path: /exported/traces.json file/metrics: @@ -22,8 +23,8 @@ service: traces: receivers: [otlp] processors: [batch] - exporters: [file/traces, logging] + exporters: [file/traces, debug] metrics: receivers: [otlp] processors: [batch] - exporters: [file/metrics, logging] + exporters: [file/metrics, debug] diff --git a/services/otel-gateway/tests.py b/services/otel-gateway/tests.py index 5aeb97e..c6ecfca 100644 --- a/services/otel-gateway/tests.py +++ b/services/otel-gateway/tests.py @@ -54,9 +54,9 @@ def generate_test_metric(): def get_output(path): - # wait for file to be written to, typically a few hundred 100ms + # Wait for the exporter to create and write the file. timeout_count = 0 - while path.exists() and path.stat().st_size == 0: + while (not path.exists()) or path.stat().st_size == 0: time.sleep(0.01) timeout_count = timeout_count + 1 if timeout_count > 500: @@ -64,7 +64,11 @@ def get_output(path): "Test timed out - no output written to file after 5 seconds" ) - return json.loads(path.read_text()) + # file exporter writes one JSON object per line (ndjson); read the latest. + lines = [line for line in path.read_text().splitlines() if line.strip()] + if not lines: + raise Exception("Test timed out - output file was empty") + return json.loads(lines[-1]) def service_name_helper(resource_attributes):