diff --git a/CMakeLists.txt b/CMakeLists.txt index 86eaef64720..6df450c510c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,8 @@ project(YDB-CPP-SDK VERSION ${YDB_SDK_VERSION} LANGUAGES C CXX ASM) option(YDB_SDK_INSTALL "Install YDB C++ SDK" Off) option(YDB_SDK_TESTS "Build YDB C++ SDK tests" Off) option(YDB_SDK_EXAMPLES "Build YDB C++ SDK examples" On) +option(YDB_SDK_ENABLE_OTEL_METRICS "Build OpenTelemetry metrics plugin" Off) +option(YDB_SDK_ENABLE_OTEL_TRACE "Build OpenTelemetry trace plugin" Off) set(YDB_SDK_GOOGLE_COMMON_PROTOS_TARGET "" CACHE STRING "Name of cmake target preparing google common proto library") option(YDB_SDK_USE_RAPID_JSON "Search for rapid json library in system" ON) @@ -58,6 +60,7 @@ add_subdirectory(library/cpp) add_subdirectory(include/ydb-cpp-sdk/client) add_subdirectory(src) add_subdirectory(util) +add_subdirectory(plugins) #_ydb_sdk_validate_public_headers() diff --git a/cmake/external_libs.cmake b/cmake/external_libs.cmake index dc46fdb1d5e..4560fd662b3 100644 --- a/cmake/external_libs.cmake +++ b/cmake/external_libs.cmake @@ -14,6 +14,11 @@ find_package(Brotli 1.1.0 REQUIRED) find_package(jwt-cpp REQUIRED) find_package(double-conversion REQUIRED) +# OpenTelemetry +if (YDB_SDK_ENABLE_OTEL_METRICS OR YDB_SDK_ENABLE_OTEL_TRACE) + find_package(opentelemetry-cpp REQUIRED) +endif() + # RapidJSON if (YDB_SDK_USE_RAPID_JSON) find_package(RapidJSON REQUIRED) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9607701b4f6..380b49e8e99 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -9,3 +9,7 @@ add_subdirectory(topic_writer/transaction) add_subdirectory(ttl) add_subdirectory(vector_index) add_subdirectory(vector_index_builtin) + +if (YDB_SDK_ENABLE_OTEL_TRACE AND YDB_SDK_ENABLE_OTEL_METRICS) + add_subdirectory(otel_tracing) +endif() diff --git a/examples/otel_tracing/CMakeLists.txt b/examples/otel_tracing/CMakeLists.txt new file mode 100644 index 00000000000..b826c66688b --- /dev/null +++ 
b/examples/otel_tracing/CMakeLists.txt @@ -0,0 +1,41 @@ +add_executable(otel_tracing_example) + +target_link_libraries(otel_tracing_example PUBLIC + yutil + getopt + YDB-CPP-SDK::Query + YDB-CPP-SDK::Table + YDB-CPP-SDK::Params + YDB-CPP-SDK::Driver + YDB-CPP-SDK::OpenTelemetryTrace + YDB-CPP-SDK::OpenTelemetryMetrics + opentelemetry-cpp::otlp_http_exporter + opentelemetry-cpp::otlp_http_metric_exporter +) + +target_sources(otel_tracing_example PRIVATE + main.cpp +) + +vcs_info(otel_tracing_example) + +if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") + target_link_libraries(otel_tracing_example PUBLIC + cpuid_check + ) +endif() + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_options(otel_tracing_example PRIVATE + -ldl + -lrt + -Wl,--no-as-needed + -lpthread + ) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + target_link_options(otel_tracing_example PRIVATE + -Wl,-platform_version,macos,11.0,11.0 + -framework + CoreFoundation + ) +endif() diff --git a/examples/otel_tracing/README.md b/examples/otel_tracing/README.md new file mode 100644 index 00000000000..5d4c750d057 --- /dev/null +++ b/examples/otel_tracing/README.md @@ -0,0 +1,186 @@ +# YDB C++ SDK — OpenTelemetry Demo + +Демонстрация трассировки и метрик операций QueryService и TableService +с визуализацией в **Grafana**, **Jaeger** и **Prometheus**. + +## Архитектура + +``` +┌──────────────┐ OTLP/HTTP ┌──────────────────┐ +│ C++ demo │ ──────────────────> │ OTel Collector │ +│ application │ │ :4328 (HTTP) │ +└──────────────┘ └────────┬──────────┘ + │ │ + traces │ │ metrics + ▼ ▼ + ┌──────────┐ ┌────────────┐ + │ Jaeger │ │ Prometheus │ + │ :16686 │ │ :9090 │ + └─────┬─────┘ └──────┬──────┘ + │ │ + └───────┬───────┘ + ▼ + ┌──────────┐ + │ Grafana │ + │ :3000 │ + └──────────┘ +``` + +## Быстрый старт + +### 1. 
Запустить инфраструктуру + +```bash +cd examples/otel_tracing +docker compose up -d +``` + +Дождитесь готовности YDB: + +```bash +docker compose logs ydb -f +# Ждите строку "Database started successfully" +``` + +### 2. Собрать SDK с OTel и тестами + +Из корня репозитория: + +```bash +mkdir -p build && cd build + +cmake .. \ + -DYDB_SDK_TESTS=ON \ + -DYDB_SDK_ENABLE_OTEL_TRACE=ON \ + -DYDB_SDK_ENABLE_OTEL_METRICS=ON + +cmake --build . --target otel_tracing_example -j$(nproc) +``` + +### 3. Запустить демо + +```bash +./examples/otel_tracing/otel_tracing_example \ + --endpoint localhost:2136 \ + --database /local \ + --otlp http://localhost:4328 \ + --iterations 20 \ + --retry-workers 6 \ + --retry-ops 30 +``` + +#### Доступные флаги + +| Флаг | По умолчанию | Описание | +|--------------------|---------------------------|--------------------------------------------------------------------------| +| `--endpoint`, `-e` | `localhost:2136` | gRPC-эндпоинт YDB | +| `--database`, `-d` | `/local` | Имя базы | +| `--otlp` | `http://localhost:4328` | OTLP/HTTP endpoint коллектора | +| `--iterations`,`-n`| `20` | Итераций в Query- и Table-нагрузке | +| `--retry-workers` | `6` | Параллельных воркеров в retry-нагрузке (`0` чтобы пропустить) | +| `--retry-ops` | `30` | Операций на каждого retry-воркера | + +#### Демонстрация реальных ретраев + +Третий встроенный сценарий — `RunRetryWorkload` — намеренно провоцирует +**SERIALIZABLE-конфликты**: N параллельных воркеров делают +`SELECT → sleep → UPSERT → COMMIT` на одной и той же «горячей» строке +(`id = 9999`) внутри `RetryQuerySync`. YDB возвращает `ABORTED` +проигравшим транзакциям, и SDK прозрачно ретраит их. + +В трейсах появятся: + +``` +ydb.RunWithRetry (INTERNAL) +├── ydb.Try attempt=0 (INTERNAL, backoff_ms=0) +│ ├── ydb.CreateSession +│ ├── ydb.ExecuteQuery +│ └── ydb.Commit status=ABORTED, error.type=ABORTED, exception event +├── ydb.Try attempt=1 (INTERNAL, backoff_ms=...) +│ └── ... 
status=ABORTED +└── ydb.Try attempt=N (INTERNAL) + └── ... status=SUCCESS +``` + +Для усиления конфликтов поднимите воркеров и операций: + +```bash +./examples/otel_tracing/otel_tracing_example \ + --retry-workers 12 --retry-ops 80 +``` + +В конце программа печатает счётчик наблюдённых абортов — каждый из них +соответствует одному автоматическому ретраю SDK. + +> **Важно:** для статуса `ABORTED` SDK использует политику +> `RetryImmediately` (см. `src/client/impl/internal/retry/retry.h`), +> поэтому атрибут `ydb.retry.backoff_ms` будет равен `0` — +> это by design. Чтобы увидеть `backoff_ms > 0`, нужны статусы +> `UNAVAILABLE` (FastBackoff, slot 5 ms) или `OVERLOADED` / +> `CLIENT_RESOURCE_EXHAUSTED` (SlowBackoff, slot 1 s). Самый простой способ +> их получить — кратковременно перезапустить YDB во время работы примера: +> +> ```bash +> ./examples/otel_tracing/otel_tracing_example --retry-workers 8 --retry-ops 100 & +> sleep 5 +> docker compose -f examples/otel_tracing/docker-compose.yml restart ydb +> wait +> ``` + +### 4. Открыть дашборды + +| Сервис | URL | Описание | +|-----------|------------------------------|---------------------------------| +| Grafana | http://localhost:3000 | Дашборд "YDB QueryService" | +| Jaeger | http://localhost:16686 | Поиск трейсов по сервису | +| Prometheus| http://localhost:9090 | Метрики `db_client_operation_*` | + +**Grafana**: логин `admin` / пароль `admin`. + +### 5. Что смотреть + +#### В Grafana (дашборд "YDB QueryService"): +- **Request Rate by Operation** — RPS по операциям (ExecuteQuery, ExecuteDataQuery, CreateSession, Commit, Rollback) +- **Error Rate by Operation** — частота ошибок +- **Duration p50/p95/p99** — распределение длительности операций +- **Error Ratio** — процент ошибок +- **Recent Traces** — таблица трейсов из Jaeger + +#### В Jaeger UI: +- Выберите сервис `ydb-cpp-sdk-demo`. 
+- RPC-спаны (`SpanKind = CLIENT`): + `ydb.CreateSession`, `ydb.ExecuteQuery`, `ydb.ExecuteDataQuery`, + `ydb.BeginTransaction`, `ydb.Commit`, `ydb.Rollback`, + `ydb.ExecuteSchemeQuery`, `ydb.BulkUpsert`. +- Retry-спаны (`SpanKind = INTERNAL`): `ydb.RunWithRetry`, + `ydb.Try` (по одному на каждую попытку, с атрибутами + `ydb.retry.attempt`, `ydb.retry.backoff_ms`). +- Общие атрибуты на всех YDB-спанах: + - `db.system.name = ydb` + - `db.namespace` (имя базы) + - `server.address`, `server.port` (эндпоинт балансера) + - `network.peer.address`, `network.peer.port` (фактический узел кластера) +- На ошибках добавляются: + - `db.response.status_code` — строковый статус YDB (например, `ABORTED`) + - `error.type` — тот же строковый статус + - событие `exception` с `exception.type` и `exception.message` + +#### В Prometheus: +- `db_client_operation_duration_seconds_bucket` — гистограмма длительности + (OTel Semantic Conventions). Лейблы (в самом Prometheus точки в именах заменяются на `_`, например `db_operation_name`): `db.system.name`, `db.namespace`, + `db.operation.name` (с префиксом `ydb.`), `ydb.client.api` + (`Query` / `Table`). Для ошибок добавляются `db.response.status_code` + и `error.type`. +- `db_client_operation_requests_total` — счётчик начатых операций + (включая каждую попытку ретрая). +- `db_client_operation_errors_total` — счётчик неуспешных попыток. + Полезно сравнивать с `requests_total`: для retry-нагрузки на той же + «горячей» строке коэффициент ошибок будет очень высоким — это и есть + индикатор работы ретраев. + +### 6. 
Остановить + +```bash +cd examples/otel_tracing +docker compose down -v +``` diff --git a/examples/otel_tracing/docker-compose.yml b/examples/otel_tracing/docker-compose.yml new file mode 100644 index 00000000000..9d01c8fa823 --- /dev/null +++ b/examples/otel_tracing/docker-compose.yml @@ -0,0 +1,70 @@ +services: + ydb: + image: cr.yandex/yc/yandex-docker-local-ydb:latest + platform: linux/amd64 + ports: + - "2136:2136" + - "8765:8765" + environment: + - GRPC_TLS_PORT=2135 + - GRPC_PORT=2136 + - MON_PORT=8765 + - YDB_DEFAULT_LOG_LEVEL=NOTICE + - YDB_USE_IN_MEMORY_PDISKS=true + volumes: + - ydb-data:/ydb_data + healthcheck: + test: /bin/sh -c "/ydb -e grpc://localhost:2136 -d /local scheme ls" + interval: 5s + timeout: 5s + retries: 20 + + jaeger: + image: jaegertracing/all-in-one:1.76.0 + ports: + - "16686:16686" + - "4317:4317" + - "4318:4318" + environment: + - COLLECTOR_OTLP_ENABLED=true + + prometheus: + image: prom/prometheus:v2.53.0 + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + depends_on: + - otel-collector + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + ports: + - "4327:4317" + - "4328:4318" + - "8889:8889" + volumes: + - ./otel-collector/config.yml:/etc/otelcol-contrib/config.yaml:ro + depends_on: + - jaeger + + grafana: + image: grafana/grafana:latest + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + depends_on: + - jaeger + - prometheus + +volumes: + ydb-data: + grafana-data: diff --git a/examples/otel_tracing/grafana/dashboards/ydb-query-service.json b/examples/otel_tracing/grafana/dashboards/ydb-query-service.json new file mode 100644 index 00000000000..12a38d99f15 --- /dev/null +++ 
b/examples/otel_tracing/grafana/dashboards/ydb-query-service.json @@ -0,0 +1,129 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "title": "Request Rate by Operation", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${prometheus_ds}" }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { "drawStyle": "line", "fillOpacity": 10 } + } + }, + "targets": [ + { + "expr": "rate(db_client_operation_requests_total[1m])", + "legendFormat": "{{db_operation_name}}" + } + ] + }, + { + "title": "Error Rate by Operation", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${prometheus_ds}" }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { "drawStyle": "line", "fillOpacity": 10 }, + "color": { "mode": "palette-classic" } + } + }, + "targets": [ + { + "expr": "rate(db_client_operation_errors_total[1m])", + "legendFormat": "{{db_operation_name}}" + } + ] + }, + { + "title": "Duration p50 / p95 / p99 (s)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "${prometheus_ds}" }, + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 5 } + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(db_client_operation_duration_seconds_bucket[1m]))", + "legendFormat": "p50 {{db_operation_name}}" + }, + { + "expr": "histogram_quantile(0.95, rate(db_client_operation_duration_seconds_bucket[1m]))", + "legendFormat": "p95 {{db_operation_name}}" + }, + { + "expr": "histogram_quantile(0.99, rate(db_client_operation_duration_seconds_bucket[1m]))", + "legendFormat": "p99 {{db_operation_name}}" + } + ] + }, + { + "title": "Error Ratio (%)", + "type": "stat", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 
}, + "datasource": { "type": "prometheus", "uid": "${prometheus_ds}" }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + } + } + }, + "targets": [ + { + "expr": "sum(rate(db_client_operation_errors_total[5m])) by (db_operation_name) / sum(rate(db_client_operation_requests_total[5m])) by (db_operation_name)", + "legendFormat": "{{db_operation_name}}" + } + ] + }, + { + "title": "Recent Traces", + "type": "table", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 16 }, + "datasource": { "type": "jaeger", "uid": "${jaeger_ds}" }, + "targets": [ + { + "query": "ydb-cpp-sdk-demo", + "queryType": "search", + "service": "ydb-cpp-sdk-demo" + } + ] + } + ], + "schemaVersion": 39, + "templating": { + "list": [ + { + "name": "prometheus_ds", + "type": "datasource", + "query": "prometheus", + "current": { "text": "Prometheus", "value": "Prometheus" } + }, + { + "name": "jaeger_ds", + "type": "datasource", + "query": "jaeger", + "current": { "text": "Jaeger", "value": "Jaeger" } + } + ] + }, + "time": { "from": "now-30m", "to": "now" }, + "title": "YDB QueryService", + "uid": "ydb-query-service" +} diff --git a/examples/otel_tracing/grafana/provisioning/dashboards/dashboards.yml b/examples/otel_tracing/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000000..8336756c137 --- /dev/null +++ b/examples/otel_tracing/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: "YDB" + orgId: 1 + folder: "YDB" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/examples/otel_tracing/grafana/provisioning/datasources/datasources.yml b/examples/otel_tracing/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 00000000000..428e06210ab --- 
/dev/null +++ b/examples/otel_tracing/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,16 @@ +apiVersion: 1 + +datasources: + - name: Jaeger + type: jaeger + access: proxy + url: http://jaeger:16686 + isDefault: false + editable: true + + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true diff --git a/examples/otel_tracing/main.cpp b/examples/otel_tracing/main.cpp new file mode 100644 index 00000000000..ca7cbf5b7ed --- /dev/null +++ b/examples/otel_tracing/main.cpp @@ -0,0 +1,442 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +namespace nostd = opentelemetry::nostd; +namespace sdktrace = opentelemetry::sdk::trace; +namespace sdkmetrics = opentelemetry::sdk::metrics; +namespace otlp = opentelemetry::exporter::otlp; +namespace resource = opentelemetry::sdk::resource; + +using namespace NYdb; +using namespace NYdb::NStatusHelpers; + +struct TConfig { + std::string Endpoint = "localhost:2136"; + std::string Database = "/local"; + std::string OtlpEndpoint = "http://localhost:4328"; + int Iterations = 20; + int RetryWorkers = 6; + int RetryOps = 30; +}; + +nostd::shared_ptr InitTracing(const TConfig& cfg) { + otlp::OtlpHttpExporterOptions opts; + opts.url = cfg.OtlpEndpoint + "/v1/traces"; + + auto exporter = otlp::OtlpHttpExporterFactory::Create(opts); + auto processor = sdktrace::SimpleSpanProcessorFactory::Create(std::move(exporter)); + + auto res = resource::Resource::Create({ + {"service.name", "ydb-cpp-sdk-demo"}, + {"service.version", "1.0.0"}, + }); + + std::shared_ptr provider = + std::make_shared(std::move(processor), res); + return nostd::shared_ptr(provider); +} + +nostd::shared_ptr InitMetrics(const TConfig& cfg) { + otlp::OtlpHttpMetricExporterOptions opts; + opts.url = 
cfg.OtlpEndpoint + "/v1/metrics"; + + auto exporter = otlp::OtlpHttpMetricExporterFactory::Create(opts); + + sdkmetrics::PeriodicExportingMetricReaderOptions readerOpts; + readerOpts.export_interval_millis = std::chrono::milliseconds(5000); + readerOpts.export_timeout_millis = std::chrono::milliseconds(3000); + + auto reader = sdkmetrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOpts); + + auto res = resource::Resource::Create({ + {"service.name", "ydb-cpp-sdk-demo"}, + {"service.version", "1.0.0"}, + }); + + auto rawProvider = std::make_shared( + std::unique_ptr(new sdkmetrics::ViewRegistry()), res); + rawProvider->AddMetricReader(std::move(reader)); + + std::shared_ptr provider = rawProvider; + return nostd::shared_ptr(provider); +} + +nostd::shared_ptr GetAppTracer() { + return opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("ydb-demo-app", "1.0.0"); +} + +void RunQueryWorkload(NQuery::TQueryClient& client, int iterations) { + std::cout << "\n=== Query Service workload ===" << std::endl; + + auto tracer = GetAppTracer(); + + { + auto ddlSpan = tracer->StartSpan("QueryService.DDL"); + auto scope = opentelemetry::trace::Scope(ddlSpan); + + ThrowOnError(client.RetryQuerySync([](NQuery::TSession session) { + return session.ExecuteQuery(R"( + CREATE TABLE IF NOT EXISTS otel_demo ( + id Uint64, + value Utf8, + PRIMARY KEY (id) + ) + )", NQuery::TTxControl::NoTx()).GetValueSync(); + })); + + ddlSpan->SetStatus(opentelemetry::trace::StatusCode::kOk); + } + + for (int i = 0; i < iterations; ++i) { + auto iterSpan = tracer->StartSpan("QueryService.Iteration"); + auto scope = opentelemetry::trace::Scope(iterSpan); + iterSpan->SetAttribute("iteration", static_cast(i + 1)); + + std::cout << " [Query] Iteration " << (i + 1) << "/" << iterations << std::endl; + + ThrowOnError(client.RetryQuerySync([i](NQuery::TSession session) { + auto params = TParamsBuilder() + .AddParam("$id").Uint64(i).Build() + .AddParam("$val").Utf8("query_" 
+ std::to_string(i)).Build() + .Build(); + + return session.ExecuteQuery(R"( + DECLARE $id AS Uint64; + DECLARE $val AS Utf8; + UPSERT INTO otel_demo (id, value) VALUES ($id, $val) + )", NQuery::TTxControl::BeginTx(NQuery::TTxSettings::SerializableRW()).CommitTx(), + params).GetValueSync(); + })); + + ThrowOnError(client.RetryQuerySync([i](NQuery::TSession session) { + auto params = TParamsBuilder() + .AddParam("$id").Uint64(i).Build() + .Build(); + + return session.ExecuteQuery(R"( + DECLARE $id AS Uint64; + SELECT id, value FROM otel_demo WHERE id = $id + )", NQuery::TTxControl::BeginTx(NQuery::TTxSettings::SerializableRW()).CommitTx(), + params).GetValueSync(); + })); + + if (i % 5 == 4) { + ThrowOnError(client.RetryQuerySync([](NQuery::TQueryClient client) -> TStatus { + // Return a failed session-creation result as a status so the retry policy can handle it, instead of calling GetSession() on an unsuccessful result. + auto sessionResult = client.GetSession().GetValueSync(); + if (!sessionResult.IsSuccess()) { + return sessionResult; + } + auto session = sessionResult.GetSession(); + auto beginResult = session.BeginTransaction(NQuery::TTxSettings::SerializableRW()).GetValueSync(); + if (!beginResult.IsSuccess()) { + return beginResult; + } + auto tx = beginResult.GetTransaction(); + + auto result = session.ExecuteQuery(R"( + SELECT COUNT(*) AS cnt FROM otel_demo + )", NQuery::TTxControl::Tx(tx)).GetValueSync(); + + if (!result.IsSuccess()) { + return result; + } + + return tx.Commit().GetValueSync(); + })); + } + + iterSpan->SetStatus(opentelemetry::trace::StatusCode::kOk); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } +} + +void RunTableWorkload(NTable::TTableClient& client, int iterations) { + std::cout << "\n=== Table Service workload ===" << std::endl; + + auto tracer = GetAppTracer(); + + for (int i = 0; i < iterations; ++i) { + int id = 1000 + i; + + auto iterSpan = tracer->StartSpan("TableService.Iteration"); + auto scope = opentelemetry::trace::Scope(iterSpan); + iterSpan->SetAttribute("iteration", static_cast(i + 1)); + + std::cout << " [Table] Iteration " << (i + 1) << "/" << iterations << std::endl; + + ThrowOnError(client.RetryOperationSync([id](NTable::TSession session) { + 
auto params = session.GetParamsBuilder() + .AddParam("$id").Uint64(id).Build() + .AddParam("$val").Utf8("table_" + std::to_string(id)).Build() + .Build(); + + return session.ExecuteDataQuery(R"( + DECLARE $id AS Uint64; + DECLARE $val AS Utf8; + UPSERT INTO otel_demo (id, value) VALUES ($id, $val) + )", NTable::TTxControl::BeginTx(NTable::TTxSettings::SerializableRW()).CommitTx(), + std::move(params)).GetValueSync(); + })); + + ThrowOnError(client.RetryOperationSync([id](NTable::TSession session) { + auto params = session.GetParamsBuilder() + .AddParam("$id").Uint64(id).Build() + .Build(); + + return session.ExecuteDataQuery(R"( + DECLARE $id AS Uint64; + SELECT id, value FROM otel_demo WHERE id = $id + )", NTable::TTxControl::BeginTx(NTable::TTxSettings::SerializableRW()).CommitTx(), + std::move(params)).GetValueSync(); + })); + + ThrowOnError(client.RetryOperationSync([](NTable::TSession session) -> TStatus { + auto beginResult = session.BeginTransaction(NTable::TTxSettings::SerializableRW()).GetValueSync(); + if (!beginResult.IsSuccess()) { + return beginResult; + } + auto tx = beginResult.GetTransaction(); + + auto result = session.ExecuteDataQuery(R"( + SELECT COUNT(*) AS cnt FROM otel_demo + )", NTable::TTxControl::Tx(tx)).GetValueSync(); + + if (!result.IsSuccess()) { + return result; + } + + return tx.Commit().GetValueSync(); + })); + + if (i % 5 == 4) { + auto rollbackResult = client.RetryOperationSync([](NTable::TSession session) -> TStatus { + auto beginResult = session.BeginTransaction(NTable::TTxSettings::SerializableRW()).GetValueSync(); + if (!beginResult.IsSuccess()) { + return beginResult; + } + auto tx = beginResult.GetTransaction(); + return tx.Rollback().GetValueSync(); + }); + if (!rollbackResult.IsSuccess()) { + std::cerr << " Rollback status: " << static_cast(rollbackResult.GetStatus()) << std::endl; + } + } + + iterSpan->SetStatus(opentelemetry::trace::StatusCode::kOk); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } +} + 
+void RunRetryWorkload(NQuery::TQueryClient& client, int workers, int opsPerWorker) { + std::cout << "\n=== Retry workload (SERIALIZABLE conflicts) ===" + << " workers=" << workers << " ops=" << opsPerWorker << std::endl; + + auto tracer = GetAppTracer(); + + { + auto seedSpan = tracer->StartSpan("RetryWorkload.Seed"); + auto scope = opentelemetry::trace::Scope(seedSpan); + + ThrowOnError(client.RetryQuerySync([](NQuery::TSession session) { + return session.ExecuteQuery(R"( + UPSERT INTO otel_demo (id, value) VALUES (9999u, "seed") + )", NQuery::TTxControl::BeginTx(NQuery::TTxSettings::SerializableRW()).CommitTx()).GetValueSync(); + })); + } + + std::atomic conflicts{0}; + std::atomic successes{0}; + std::vector threads; + threads.reserve(workers); + + for (int w = 0; w < workers; ++w) { + threads.emplace_back([&, w]() { + auto workerTracer = GetAppTracer(); + for (int i = 0; i < opsPerWorker; ++i) { + auto iterSpan = workerTracer->StartSpan("RetryWorkload.Op"); + auto scope = opentelemetry::trace::Scope(iterSpan); + iterSpan->SetAttribute("worker", static_cast(w)); + iterSpan->SetAttribute("op", static_cast(i)); + + auto status = client.RetryQuerySync( + [w, i, &conflicts](NQuery::TQueryClient client) -> TStatus { + auto sessionRes = client.GetSession().GetValueSync(); + if (!sessionRes.IsSuccess()) { + return sessionRes; + } + auto session = sessionRes.GetSession(); + + auto beginRes = session.BeginTransaction( + NQuery::TTxSettings::SerializableRW()).GetValueSync(); + if (!beginRes.IsSuccess()) { + return beginRes; + } + auto tx = beginRes.GetTransaction(); + + auto readRes = session.ExecuteQuery(R"( + SELECT value FROM otel_demo WHERE id = 9999u + )", NQuery::TTxControl::Tx(tx)).GetValueSync(); + if (!readRes.IsSuccess()) { + if (readRes.GetStatus() == EStatus::ABORTED) { + conflicts.fetch_add(1); + } + return readRes; + } + + std::this_thread::sleep_for( + std::chrono::milliseconds(5 + (w * 7 + i * 3) % 20)); + + auto params = TParamsBuilder() + 
.AddParam("$v").Utf8("w" + std::to_string(w) + + "_i" + std::to_string(i)).Build() + .Build(); + + auto writeRes = session.ExecuteQuery(R"( + DECLARE $v AS Utf8; + UPSERT INTO otel_demo (id, value) VALUES (9999u, $v) + )", NQuery::TTxControl::Tx(tx), params).GetValueSync(); + if (!writeRes.IsSuccess()) { + if (writeRes.GetStatus() == EStatus::ABORTED) { + conflicts.fetch_add(1); + } + return writeRes; + } + + auto commitRes = tx.Commit().GetValueSync(); + if (!commitRes.IsSuccess() + && commitRes.GetStatus() == EStatus::ABORTED) { + conflicts.fetch_add(1); + } + return commitRes; + }); + + if (status.IsSuccess()) { + successes.fetch_add(1); + iterSpan->SetStatus(opentelemetry::trace::StatusCode::kOk); + } else { + iterSpan->SetStatus(opentelemetry::trace::StatusCode::kError, + std::string(ToString(status.GetStatus()))); + std::cerr << " [retry-wl] worker=" << w << " op=" << i + << " final_status=" << static_cast(status.GetStatus()) + << std::endl; + } + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + std::cout << " Retry workload done." 
+ << " successes=" << successes.load() + << " observed_aborts=" << conflicts.load() + << " (each abort triggers one SDK retry attempt)" << std::endl; +} + +int main(int argc, char** argv) { + TConfig cfg; + + NLastGetopt::TOpts opts; + opts.AddLongOption('e', "endpoint", "YDB endpoint") + .DefaultValue(cfg.Endpoint).StoreResult(&cfg.Endpoint); + opts.AddLongOption('d', "database", "YDB database") + .DefaultValue(cfg.Database).StoreResult(&cfg.Database); + opts.AddLongOption("otlp", "OTLP HTTP endpoint") + .DefaultValue(cfg.OtlpEndpoint).StoreResult(&cfg.OtlpEndpoint); + opts.AddLongOption('n', "iterations", "Number of iterations") + .DefaultValue(std::to_string(cfg.Iterations)).StoreResult(&cfg.Iterations); + opts.AddLongOption("retry-workers", "Concurrent workers for retry workload (0 to skip)") + .DefaultValue(std::to_string(cfg.RetryWorkers)).StoreResult(&cfg.RetryWorkers); + opts.AddLongOption("retry-ops", "Operations per retry worker") + .DefaultValue(std::to_string(cfg.RetryOps)).StoreResult(&cfg.RetryOps); + + NLastGetopt::TOptsParseResult parsedOpts(&opts, argc, argv); + + if (cfg.Endpoint.rfind("grpc://", 0) == 0) { + cfg.Endpoint.erase(0, 7); + } else if (cfg.Endpoint.rfind("grpcs://", 0) == 0) { + cfg.Endpoint.erase(0, 8); + } + + std::cout << "Initializing OpenTelemetry..." 
<< std::endl; + std::cout << " OTLP endpoint: " << cfg.OtlpEndpoint << std::endl; + + auto tracerProvider = InitTracing(cfg); + auto meterProvider = InitMetrics(cfg); + + auto ydbTraceProvider = NTrace::CreateOtelTraceProvider(tracerProvider); + auto ydbMetricRegistry = NMetrics::CreateOtelMetricRegistry(meterProvider); + + std::cout << "Connecting to YDB at " << cfg.Endpoint << cfg.Database << std::endl; + + auto driverConfig = TDriverConfig() + .SetEndpoint(cfg.Endpoint) + .SetDatabase(cfg.Database) + .SetDiscoveryMode(EDiscoveryMode::Off) + .SetTraceProvider(ydbTraceProvider) + .SetMetricRegistry(ydbMetricRegistry); + + TDriver driver(driverConfig); + NQuery::TQueryClient queryClient(driver); + NTable::TTableClient tableClient(driver); + + try { + RunQueryWorkload(queryClient, cfg.Iterations); + RunTableWorkload(tableClient, cfg.Iterations); + + if (cfg.RetryWorkers > 0 && cfg.RetryOps > 0) { + RunRetryWorkload(queryClient, cfg.RetryWorkers, cfg.RetryOps); + } + + std::cout << "\n=== Cleanup ===" << std::endl; + ThrowOnError(queryClient.RetryQuerySync([](NQuery::TSession session) { + return session.ExecuteQuery( + "DROP TABLE otel_demo", NQuery::TTxControl::NoTx()).GetValueSync(); + })); + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + } + + std::cout << "Flushing telemetry..." << std::endl; + + driver.Stop(true); + + if (auto* sdkTracerProvider = dynamic_cast(tracerProvider.get())) { + sdkTracerProvider->ForceFlush(); + } + if (auto* sdkMeterProvider = dynamic_cast(meterProvider.get())) { + sdkMeterProvider->ForceFlush(); + } + + std::this_thread::sleep_for(std::chrono::seconds(3)); + + std::cout << "Done. 
Open Grafana at http://localhost:3000" << std::endl; + std::cout << " Jaeger UI at http://localhost:16686" << std::endl; + std::cout << " Prometheus at http://localhost:9090" << std::endl; + + return 0; +} diff --git a/examples/otel_tracing/otel-collector/config.yml b/examples/otel_tracing/otel-collector/config.yml new file mode 100644 index 00000000000..9589c9cd4ee --- /dev/null +++ b/examples/otel_tracing/otel-collector/config.yml @@ -0,0 +1,32 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +exporters: + otlp/jaeger: + endpoint: jaeger:4317 + tls: + insecure: true + + prometheus: + endpoint: 0.0.0.0:8889 + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/jaeger] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus] diff --git a/examples/otel_tracing/prometheus/prometheus.yml b/examples/otel_tracing/prometheus/prometheus.yml new file mode 100644 index 00000000000..faeda702aff --- /dev/null +++ b/examples/otel_tracing/prometheus/prometheus.yml @@ -0,0 +1,8 @@ +global: + scrape_interval: 5s + evaluation_interval: 5s + +scrape_configs: + - job_name: "otel-collector" + static_configs: + - targets: ["otel-collector:8889"] diff --git a/include/ydb-cpp-sdk/client/driver/driver.h b/include/ydb-cpp-sdk/client/driver/driver.h index 72aa008ccca..8d5ab1fac2b 100644 --- a/include/ydb-cpp-sdk/client/driver/driver.h +++ b/include/ydb-cpp-sdk/client/driver/driver.h @@ -3,6 +3,8 @@ #include "fwd.h" #include +#include +#include #include #include #include @@ -153,6 +155,12 @@ class TDriverConfig { //! If not set, default executor will be used. TDriverConfig& SetExecutor(std::shared_ptr executor); + //! Set external metrics registry implementation. + TDriverConfig& SetMetricRegistry(std::shared_ptr registry); + + //! Set external trace provider implementation. 
+ TDriverConfig& SetTraceProvider(std::shared_ptr provider); + private: class TImpl; std::shared_ptr Impl_; diff --git a/include/ydb-cpp-sdk/client/metrics/metrics.h b/include/ydb-cpp-sdk/client/metrics/metrics.h new file mode 100644 index 00000000000..5faa930ed50 --- /dev/null +++ b/include/ydb-cpp-sdk/client/metrics/metrics.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace NYdb::inline V3::NMetrics { + +using TLabels = std::map; + +class ICounter { +public: + virtual ~ICounter() = default; + virtual void Inc() = 0; +}; + +class IGauge { +public: + virtual ~IGauge() = default; + virtual void Add(double delta) = 0; + virtual void Set(double value) = 0; +}; + +class IHistogram { +public: + virtual ~IHistogram() = default; + virtual void Record(double value) = 0; +}; + +class IMetricRegistry { +public: + virtual ~IMetricRegistry() = default; + + virtual std::shared_ptr Counter( + const std::string& name, + const TLabels& labels = {}, + const std::string& description = {}, + const std::string& unit = {} + ) = 0; + virtual std::shared_ptr Gauge( + const std::string& name, + const TLabels& labels = {}, + const std::string& description = {}, + const std::string& unit = {} + ) = 0; + virtual std::shared_ptr Histogram( + const std::string& name, + const std::vector& buckets, + const TLabels& labels = {}, + const std::string& description = {}, + const std::string& unit = {} + ) = 0; +}; + +} // namespace NYdb::NMetrics diff --git a/include/ydb-cpp-sdk/client/trace/trace.h b/include/ydb-cpp-sdk/client/trace/trace.h new file mode 100644 index 00000000000..a39d488af77 --- /dev/null +++ b/include/ydb-cpp-sdk/client/trace/trace.h @@ -0,0 +1,60 @@ +#pragma once + +#include +#include +#include +#include + +namespace NYdb::inline V3::NTrace { + +enum class ESpanKind { + INTERNAL, + SERVER, + CLIENT, + PRODUCER, + CONSUMER +}; + +class IScope { +public: + virtual ~IScope() = default; +}; + +class ISpan { +public: + virtual ~ISpan() = 
default; + virtual void End() = 0; + virtual void SetAttribute(const std::string& key, const std::string& value) = 0; + virtual void SetAttribute(const std::string& key, int64_t value) = 0; + virtual void AddEvent(const std::string& name, const std::map& attributes = {}) = 0; + virtual std::unique_ptr Activate() = 0; + + virtual void RecordException( + const std::string& type, + const std::string& message, + const std::string& stacktrace = {} + ) { + std::map attrs{ + {"exception.type", type}, + {"exception.message", message}, + }; + if (!stacktrace.empty()) { + attrs.emplace("exception.stacktrace", stacktrace); + } + AddEvent("exception", attrs); + } +}; + +class ITracer { +public: + virtual ~ITracer() = default; + virtual std::shared_ptr StartSpan(const std::string& name, ESpanKind kind = ESpanKind::INTERNAL) = 0; +}; + +class ITraceProvider { +public: + virtual ~ITraceProvider() = default; + virtual std::shared_ptr GetTracer(const std::string& name) = 0; +}; + +} // namespace NYdb::NTrace diff --git a/plugins/CMakeLists.txt b/plugins/CMakeLists.txt new file mode 100644 index 00000000000..0d232800455 --- /dev/null +++ b/plugins/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(metrics) +add_subdirectory(trace) diff --git a/plugins/metrics/CMakeLists.txt b/plugins/metrics/CMakeLists.txt new file mode 100644 index 00000000000..6d50a5111e7 --- /dev/null +++ b/plugins/metrics/CMakeLists.txt @@ -0,0 +1,3 @@ +if (YDB_SDK_ENABLE_OTEL_METRICS) + add_subdirectory(otel EXCLUDE_FROM_ALL) +endif() diff --git a/plugins/metrics/otel/CMakeLists.txt b/plugins/metrics/otel/CMakeLists.txt new file mode 100644 index 00000000000..e26b1931984 --- /dev/null +++ b/plugins/metrics/otel/CMakeLists.txt @@ -0,0 +1,17 @@ +_ydb_sdk_add_library(open_telemetry_metrics) +target_sources(open_telemetry_metrics PRIVATE + src/metrics.cpp +) +target_include_directories(open_telemetry_metrics PUBLIC + $ + $ +) +target_link_libraries(open_telemetry_metrics PUBLIC + client-metrics + client-resources + 
opentelemetry-cpp::api + opentelemetry-cpp::metrics +) +_ydb_sdk_make_client_component(OpenTelemetryMetrics open_telemetry_metrics) + +_ydb_sdk_install_headers(${CMAKE_INSTALL_INCLUDEDIR} DIRECTORY include/) diff --git a/plugins/metrics/otel/include/ydb-cpp-sdk/open_telemetry/metrics.h b/plugins/metrics/otel/include/ydb-cpp-sdk/open_telemetry/metrics.h new file mode 100644 index 00000000000..f992c577bf6 --- /dev/null +++ b/plugins/metrics/otel/include/ydb-cpp-sdk/open_telemetry/metrics.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +#include +#include + +namespace NYdb::inline V3::NMetrics { + +std::shared_ptr CreateOtelMetricRegistry( + opentelemetry::nostd::shared_ptr meterProvider); + +} // namespace NYdb::NMetrics diff --git a/plugins/metrics/otel/src/metrics.cpp b/plugins/metrics/otel/src/metrics.cpp new file mode 100644 index 00000000000..f883fae7946 --- /dev/null +++ b/plugins/metrics/otel/src/metrics.cpp @@ -0,0 +1,175 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace NYdb::inline V3::NMetrics { + +namespace { + +namespace otel_metrics = opentelemetry::metrics; +namespace otel_nostd = opentelemetry::nostd; +namespace otel_common = opentelemetry::common; +namespace otel_context = opentelemetry::context; +namespace otel_sdk_metrics = opentelemetry::sdk::metrics; + +otel_common::KeyValueIterableView MakeAttributes(const TLabels& labels) { + return otel_common::KeyValueIterableView(labels); +} + +class TOtelCounter : public ICounter { +public: + TOtelCounter(otel_nostd::shared_ptr> counter, const TLabels& labels) + : Counter_(std::move(counter)) + , Labels_(labels) + {} + + void Inc() override { + Counter_->Add(1, MakeAttributes(Labels_), otel_context::RuntimeContext::GetCurrent()); + } + +private: + otel_nostd::shared_ptr> Counter_; + TLabels Labels_; +}; + +class TOtelUpDownCounterGauge : public IGauge { +public: + TOtelUpDownCounterGauge(otel_nostd::shared_ptr> counter, const TLabels& 
labels) + : Counter_(std::move(counter)) + , Labels_(labels) + {} + + void Add(double delta) override { + Counter_->Add(delta, MakeAttributes(Labels_), otel_context::RuntimeContext::GetCurrent()); + Value_ += delta; + } + + void Set(double value) override { + Counter_->Add(value - Value_, MakeAttributes(Labels_), otel_context::RuntimeContext::GetCurrent()); + Value_ = value; + } + +private: + otel_nostd::shared_ptr> Counter_; + TLabels Labels_; + double Value_ = 0; +}; + +class TOtelHistogram : public IHistogram { +public: + TOtelHistogram(otel_nostd::shared_ptr> histogram, const TLabels& labels) + : Histogram_(std::move(histogram)) + , Labels_(labels) + {} + + void Record(double value) override { + Histogram_->Record(value, MakeAttributes(Labels_), otel_context::RuntimeContext::GetCurrent()); + } + +private: + otel_nostd::shared_ptr> Histogram_; + TLabels Labels_; +}; + +class TOtelMetricRegistry : public IMetricRegistry { +public: + TOtelMetricRegistry(otel_nostd::shared_ptr meterProvider) + : MeterProvider_(std::move(meterProvider)) + , Meter_(MeterProvider_->GetMeter("ydb-cpp-sdk", GetSdkSemver())) + {} + + std::shared_ptr Counter(const std::string& name + , const TLabels& labels + , const std::string& description + , const std::string& unit + ) override { + auto counter = Meter_->CreateUInt64Counter(name, description, unit); + return std::make_shared(std::move(counter), labels); + } + + std::shared_ptr Gauge(const std::string& name + , const TLabels& labels + , const std::string& description + , const std::string& unit + ) override { + auto counter = Meter_->CreateDoubleUpDownCounter(name, description, unit); + return std::make_shared(std::move(counter), labels); + } + + std::shared_ptr Histogram(const std::string& name + , const std::vector& buckets + , const TLabels& labels + , const std::string& description + , const std::string& unit + ) override { + ConfigureHistogramBuckets(name, unit, buckets); + auto histogram = Meter_->CreateDoubleHistogram(name, 
description, unit); + return std::make_shared(std::move(histogram), labels); + } + +private: + void ConfigureHistogramBuckets(const std::string& name, const std::string& unit, const std::vector& buckets) { + if (buckets.empty()) { + return; + } + + auto* sdkProvider = dynamic_cast(MeterProvider_.get()); + if (!sdkProvider) { + return; + } + + { + std::lock_guard lock(HistogramViewsLock_); + if (!HistogramViews_.insert(name).second) { + return; + } + } + + auto selector = std::make_unique( + otel_sdk_metrics::InstrumentType::kHistogram, + name, + unit + ); + auto meterSelector = std::make_unique( + std::string("ydb-cpp-sdk"), + std::string(GetSdkSemver()), + std::string() + ); + + auto histogramConfig = std::make_shared(); + histogramConfig->boundaries_ = buckets; + + auto view = std::make_unique( + std::string(), + std::string(), + otel_sdk_metrics::AggregationType::kHistogram, + histogramConfig + ); + + sdkProvider->AddView(std::move(selector), std::move(meterSelector), std::move(view)); + } + + otel_nostd::shared_ptr MeterProvider_; + otel_nostd::shared_ptr Meter_; + std::mutex HistogramViewsLock_; + std::unordered_set HistogramViews_; +}; + +} // namespace + +std::shared_ptr CreateOtelMetricRegistry( + opentelemetry::nostd::shared_ptr meterProvider) +{ + return std::make_shared(std::move(meterProvider)); +} + +} // namespace NYdb::NMetrics diff --git a/plugins/trace/CMakeLists.txt b/plugins/trace/CMakeLists.txt new file mode 100644 index 00000000000..ef231ab7103 --- /dev/null +++ b/plugins/trace/CMakeLists.txt @@ -0,0 +1,3 @@ +if (YDB_SDK_ENABLE_OTEL_TRACE) + add_subdirectory(otel EXCLUDE_FROM_ALL) +endif() diff --git a/plugins/trace/otel/CMakeLists.txt b/plugins/trace/otel/CMakeLists.txt new file mode 100644 index 00000000000..6816d8ff7c6 --- /dev/null +++ b/plugins/trace/otel/CMakeLists.txt @@ -0,0 +1,16 @@ +_ydb_sdk_add_library(open_telemetry_trace) +target_sources(open_telemetry_trace PRIVATE + src/trace.cpp +) 
+target_include_directories(open_telemetry_trace PUBLIC + $ + $ +) +target_link_libraries(open_telemetry_trace PUBLIC + client-trace + opentelemetry-cpp::api + opentelemetry-cpp::trace +) +_ydb_sdk_make_client_component(OpenTelemetryTrace open_telemetry_trace) + +_ydb_sdk_install_headers(${CMAKE_INSTALL_INCLUDEDIR} DIRECTORY include/) diff --git a/plugins/trace/otel/include/ydb-cpp-sdk/open_telemetry/trace.h b/plugins/trace/otel/include/ydb-cpp-sdk/open_telemetry/trace.h new file mode 100644 index 00000000000..64c8fe4abba --- /dev/null +++ b/plugins/trace/otel/include/ydb-cpp-sdk/open_telemetry/trace.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +#include +#include + +namespace NYdb::inline V3::NTrace { + +std::shared_ptr CreateOtelTraceProvider( + opentelemetry::nostd::shared_ptr tracerProvider); + +} // namespace NYdb::NTrace diff --git a/plugins/trace/otel/src/trace.cpp b/plugins/trace/otel/src/trace.cpp new file mode 100644 index 00000000000..85b315ccdf4 --- /dev/null +++ b/plugins/trace/otel/src/trace.cpp @@ -0,0 +1,139 @@ +#include + +#include +#include +#include +#include +#include + +namespace NYdb::inline V3::NTrace { + +namespace { + +namespace otel_trace = opentelemetry::trace; +namespace otel_nostd = opentelemetry::nostd; +namespace otel_common = opentelemetry::common; + +otel_trace::SpanKind MapSpanKind(ESpanKind kind) { + switch (kind) { + case ESpanKind::INTERNAL: return otel_trace::SpanKind::kInternal; + case ESpanKind::SERVER: return otel_trace::SpanKind::kServer; + case ESpanKind::CLIENT: return otel_trace::SpanKind::kClient; + case ESpanKind::PRODUCER: return otel_trace::SpanKind::kProducer; + case ESpanKind::CONSUMER: return otel_trace::SpanKind::kConsumer; + } + return otel_trace::SpanKind::kInternal; +} + +class TOtelScope : public IScope { +public: + TOtelScope(otel_nostd::shared_ptr span) + : Scope_(std::move(span)) + {} + +private: + otel_trace::Scope Scope_; +}; + +class TOtelSpan : public ISpan { +public: + 
TOtelSpan(otel_nostd::shared_ptr span) + : Span_(std::move(span)) + {} + + void End() override { + Span_->End(); + } + + void SetAttribute(const std::string& key, const std::string& value) override { + Span_->SetAttribute(key, value); + } + + void SetAttribute(const std::string& key, int64_t value) override { + Span_->SetAttribute(key, value); + } + + void AddEvent(const std::string& name, const std::map& attributes) override { + if (attributes.empty()) { + Span_->AddEvent(name); + } else { + std::vector> attrs; + attrs.reserve(attributes.size()); + for (const auto& [k, v] : attributes) { + attrs.emplace_back(otel_nostd::string_view(k), otel_common::AttributeValue(otel_nostd::string_view(v))); + } + Span_->AddEvent(name, attrs); + } + } + + std::unique_ptr Activate() override { + return std::make_unique(Span_); + } + + void RecordException(const std::string& type + , const std::string& message + , const std::string& stacktrace + ) override { + std::vector> attrs; + attrs.reserve(3); + attrs.emplace_back( + otel_nostd::string_view("exception.type"), + otel_common::AttributeValue(otel_nostd::string_view(type)) + ); + attrs.emplace_back( + otel_nostd::string_view("exception.message"), + otel_common::AttributeValue(otel_nostd::string_view(message)) + ); + if (!stacktrace.empty()) { + attrs.emplace_back( + otel_nostd::string_view("exception.stacktrace"), + otel_common::AttributeValue(otel_nostd::string_view(stacktrace)) + ); + } + Span_->AddEvent("exception", attrs); + Span_->SetStatus(otel_trace::StatusCode::kError, message); + } + +private: + otel_nostd::shared_ptr Span_; +}; + +class TOtelTracer : public ITracer { +public: + TOtelTracer(otel_nostd::shared_ptr tracer) + : Tracer_(std::move(tracer)) + {} + + std::shared_ptr StartSpan(const std::string& name, ESpanKind kind) override { + otel_trace::StartSpanOptions options; + options.kind = MapSpanKind(kind); + return std::make_shared(Tracer_->StartSpan(name, options)); + } + +private: + otel_nostd::shared_ptr Tracer_; 
+}; + +class TOtelTraceProvider : public ITraceProvider { +public: + TOtelTraceProvider(otel_nostd::shared_ptr tracerProvider) + : TracerProvider_(std::move(tracerProvider)) + {} + + std::shared_ptr GetTracer(const std::string& name) override { + return std::make_shared(TracerProvider_->GetTracer(name)); + } + +private: + otel_nostd::shared_ptr TracerProvider_; +}; + +} // namespace + +std::shared_ptr CreateOtelTraceProvider( + opentelemetry::nostd::shared_ptr tracerProvider) +{ + return std::make_shared(std::move(tracerProvider)); +} + +} // namespace NYdb::NTrace diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3dff7094058..b251a041380 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,3 +1,3 @@ add_subdirectory(api) add_subdirectory(client) -add_subdirectory(library) \ No newline at end of file +add_subdirectory(library) diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt index e7f448e8675..ce5e4938058 100644 --- a/src/client/CMakeLists.txt +++ b/src/client/CMakeLists.txt @@ -12,6 +12,7 @@ add_subdirectory(iam) add_subdirectory(iam_private) add_subdirectory(impl) add_subdirectory(import) +add_subdirectory(metrics) add_subdirectory(monitoring) add_subdirectory(operation) add_subdirectory(params) @@ -25,5 +26,6 @@ add_subdirectory(scheme) add_subdirectory(ss_tasks) add_subdirectory(table) add_subdirectory(topic) +add_subdirectory(trace) add_subdirectory(types) add_subdirectory(value) diff --git a/src/client/driver/driver.cpp b/src/client/driver/driver.cpp index 207c67b6d5f..7bdf3a79bbc 100644 --- a/src/client/driver/driver.cpp +++ b/src/client/driver/driver.cpp @@ -51,6 +51,8 @@ class TDriverConfig::TImpl : public IConnectionsParams { uint64_t GetMaxMessageSize() const override { return MaxMessageSize; } const TLog& GetLog() const override { return Log; } std::shared_ptr GetExecutor() const override { return Executor; } + std::shared_ptr GetExternalMetricRegistry() const override { return MetricRegistry; } + std::shared_ptr 
#ifdef LOG_LAZY
#error log macro redefinition
#endif

// Writes `message` to `log` at `priority`, but only when the log is open and
// its filtration level admits that priority. `message` is evaluated at most
// once, and only if the record is actually written, so callers may build an
// expensive string inline.
//
// The expansion is a single statement (do/while(false)), so the macro is safe
// as the body of an unbraced `if ... else`; invoke it with a trailing `;`.
// Every argument is parenthesized so that operator expressions such as
// `cond ? A : B` are not re-associated with the comparison below.
// Note: `log` and `priority` are evaluated more than once; pass expressions
// without side effects.
#define LOG_LAZY(log, priority, message)                                   \
    do {                                                                   \
        if ((log).IsOpen() && (log).FiltrationLevel() >= (priority)) {     \
            (log).Write((priority), (message));                            \
        }                                                                  \
    } while (false)
= std::move(cb); diff --git a/src/client/impl/internal/grpc_connections/grpc_connections.h b/src/client/impl/internal/grpc_connections/grpc_connections.h index 756d2f0d957..c6ef8686c36 100644 --- a/src/client/impl/internal/grpc_connections/grpc_connections.h +++ b/src/client/impl/internal/grpc_connections/grpc_connections.h @@ -18,6 +18,14 @@ namespace NYdb::inline V3 { +namespace NMetrics { + class IMetricRegistry; +} // namespace NMetrics + +namespace NTrace { + class ITraceProvider; +} // namespace NTrace + constexpr TDeadline::Duration GRPC_KEEP_ALIVE_TIMEOUT_FOR_DISCOVERY = std::chrono::seconds(10); constexpr TDeadline::Duration INITIAL_DEFERRED_CALL_DELAY = std::chrono::milliseconds(10); // The delay before first deferred service call constexpr TDeadline::Duration GET_ENDPOINTS_TIMEOUT = std::chrono::seconds(10); // Time wait for ListEndpoints request, after this time we pass error to client @@ -581,6 +589,9 @@ class TGRpcConnectionsImpl ::NMonitoring::TMetricRegistry* GetMetricRegistry() override; void RegisterExtension(IExtension* extension); void RegisterExtensionApi(IExtensionApi* api); + std::shared_ptr GetExternalMetricRegistry() const override; + std::shared_ptr GetTraceProvider() const; + void SetDiscoveryMutator(IDiscoveryMutatorApi::TMutatorCb&& cb); const TLog& GetLog() const override; @@ -714,6 +725,8 @@ class TGRpcConnectionsImpl std::vector> Extensions_; std::vector> ExtensionApis_; + std::shared_ptr MetricRegistry_; + std::shared_ptr TraceProvider_; IDiscoveryMutatorApi::TMutatorCb DiscoveryMutatorCb; diff --git a/src/client/impl/internal/grpc_connections/params.h b/src/client/impl/internal/grpc_connections/params.h index 2bc9f4567c5..50c90211603 100644 --- a/src/client/impl/internal/grpc_connections/params.h +++ b/src/client/impl/internal/grpc_connections/params.h @@ -11,6 +11,14 @@ namespace NYdb::inline V3 { +namespace NMetrics { + class IMetricRegistry; +} // namespace NMetrics + +namespace NTrace { + class ITraceProvider; +} // namespace 
NTrace + class IConnectionsParams { public: virtual ~IConnectionsParams() = default; @@ -36,6 +44,8 @@ class IConnectionsParams { virtual uint64_t GetMaxOutboundMessageSize() const = 0; virtual uint64_t GetMaxMessageSize() const = 0; virtual std::shared_ptr GetExecutor() const = 0; + virtual std::shared_ptr GetExternalMetricRegistry() const = 0; + virtual std::shared_ptr GetTraceProvider() const = 0; }; } // namespace NYdb diff --git a/src/client/impl/internal/internal_client/client.h b/src/client/impl/internal/internal_client/client.h index 3e52f984480..406a8b7103c 100644 --- a/src/client/impl/internal/internal_client/client.h +++ b/src/client/impl/internal/internal_client/client.h @@ -14,6 +14,10 @@ namespace NMonitoring { class TMetricRegistry; } +namespace NYdb::inline V3::NMetrics { + class IMetricRegistry; +} + namespace NYdb::inline V3 { class TDbDriverState; @@ -29,6 +33,7 @@ class IInternalClient { virtual TBalancingPolicy::TImpl GetBalancingSettings() const = 0; virtual bool StartStatCollecting(::NMonitoring::IMetricRegistry* sensorsRegistry) = 0; virtual ::NMonitoring::TMetricRegistry* GetMetricRegistry() = 0; + virtual std::shared_ptr GetExternalMetricRegistry() const = 0; virtual const TLog& GetLog() const = 0; }; diff --git a/src/client/impl/internal/retry/retry.cpp b/src/client/impl/internal/retry/retry.cpp index 73880d0e5c6..5dad5df9bbf 100644 --- a/src/client/impl/internal/retry/retry.cpp +++ b/src/client/impl/internal/retry/retry.cpp @@ -28,8 +28,10 @@ TBackoffDuration CalcBackoffTime(const TBackoffSettings& settings, std::uint32_t } -void Backoff(const NRetry::TBackoffSettings& settings, std::uint32_t retryNumber) { - std::this_thread::sleep_for(CalcBackoffTime(settings, retryNumber)); +TDuration Backoff(const NRetry::TBackoffSettings& settings, std::uint32_t retryNumber) { + const auto duration = CalcBackoffTime(settings, retryNumber); + std::this_thread::sleep_for(duration); + return TDuration::MicroSeconds(static_cast(duration.count())); } 
void AsyncBackoff(std::shared_ptr client, const TBackoffSettings& settings, diff --git a/src/client/impl/internal/retry/retry.h b/src/client/impl/internal/retry/retry.h index 6fe090409c8..5d1dffc0aca 100644 --- a/src/client/impl/internal/retry/retry.h +++ b/src/client/impl/internal/retry/retry.h @@ -21,7 +21,7 @@ class IClientImplCommon; namespace NYdb::inline V3::NRetry { -void Backoff(const NRetry::TBackoffSettings& settings, std::uint32_t retryNumber); +TDuration Backoff(const NRetry::TBackoffSettings& settings, std::uint32_t retryNumber); void AsyncBackoff(std::shared_ptr client, const TBackoffSettings& settings, std::uint32_t retryNumber, const std::function& fn); diff --git a/src/client/impl/internal/retry/retry_sync.h b/src/client/impl/internal/retry/retry_sync.h index beefcb27714..94581e41ae4 100644 --- a/src/client/impl/internal/retry/retry_sync.h +++ b/src/client/impl/internal/retry/retry_sync.h @@ -1,9 +1,14 @@ #pragma once #include +#include #include #include +#include + +#include +#include namespace NYdb::inline V3::NRetry::Sync { @@ -13,32 +18,42 @@ class TRetryContext : public TRetryContextBase { TClient& Client_; public: + using TAttemptSpanFactory = std::function< + std::shared_ptr(std::uint32_t attempt, std::int64_t backoffMs)>; + TStatusType Execute() { this->RetryStartTime_ = TInstant::Now(); - TStatusType status = Retry(); // first attempt + std::int64_t lastBackoffMs = 0; + + TStatusType status = RunAttempt(lastBackoffMs); for (this->RetryNumber_ = 0; this->RetryNumber_ <= this->Settings_.MaxRetries_;) { auto nextStep = this->GetNextStep(status); + TDuration backoff = TDuration::Zero(); switch (nextStep) { case NextStep::RetryImmediately: break; case NextStep::RetryFastBackoff: - DoBackoff(true); + backoff = DoBackoff(true); break; case NextStep::RetrySlowBackoff: - DoBackoff(false); + backoff = DoBackoff(false); break; case NextStep::Finish: return status; } - // make next retry this->RetryNumber_++; this->LogRetry(status); 
this->Client_.Impl_->CollectRetryStatSync(status.GetStatus()); - status = Retry(); + lastBackoffMs = static_cast(backoff.MilliSeconds()); + status = RunAttempt(lastBackoffMs); } return status; } + void SetAttemptSpanFactory(TAttemptSpanFactory factory) { + AttemptSpanFactory_ = std::move(factory); + } + protected: TRetryContext(TClient& client, const TRetryOperationSettings& settings) : TRetryContextBase(settings) @@ -49,11 +64,32 @@ class TRetryContext : public TRetryContextBase { virtual TStatusType RunOperation() = 0; - void DoBackoff(bool fast) { + TDuration DoBackoff(bool fast) { const auto &settings = fast ? this->Settings_.FastBackoffSettings_ : this->Settings_.SlowBackoffSettings_; - Backoff(settings, this->RetryNumber_); + return Backoff(settings, this->RetryNumber_); } + +private: + TStatusType RunAttempt(std::int64_t backoffMs) { + std::shared_ptr attemptSpan; + std::unique_ptr scope; + if (AttemptSpanFactory_) { + attemptSpan = AttemptSpanFactory_(this->RetryNumber_, backoffMs); + if (attemptSpan) { + scope = attemptSpan->Activate(); + } + } + + TStatusType status = Retry(); + + if (attemptSpan) { + attemptSpan->End(status.GetStatus()); + } + return status; + } + + TAttemptSpanFactory AttemptSpanFactory_; }; template> @@ -135,4 +171,28 @@ class TRetryWithSession : public TRetryContext, public TRe } }; +// Wraps a sync retry loop with the required OpenTelemetry spans: +// ydb.RunWithRetry (INTERNAL, created here) +// └─ ydb.Try (INTERNAL, one per attempt, with retry.attempt/backoff_ms) +// └─ +template +TStatus RunSyncRetryWithParentSpan( + const std::shared_ptr& impl + , TCtx&& ctx +) { + auto parentSpan = impl->CreateRetryRootSpan(); + auto scope = parentSpan ? 
parentSpan->Activate() : nullptr; + + auto attemptSpanFactory = [impl](std::uint32_t attempt, std::int64_t backoffMs) { + return impl->CreateRetryAttemptSpan(attempt, backoffMs); + }; + ctx.SetAttemptSpanFactory(std::move(attemptSpanFactory)); + + auto status = ctx.Execute(); + if (parentSpan) { + parentSpan->End(status.GetStatus()); + } + return status; +} + } // namespace NYdb::NRetry::Sync diff --git a/src/client/impl/observability/CMakeLists.txt b/src/client/impl/observability/CMakeLists.txt new file mode 100644 index 00000000000..5238ee63535 --- /dev/null +++ b/src/client/impl/observability/CMakeLists.txt @@ -0,0 +1,16 @@ +_ydb_sdk_add_library(impl-observability) + +target_link_libraries(impl-observability PUBLIC + yutil + client-metrics + client-impl-ydb_stats + client-types +) + +target_sources(impl-observability PRIVATE + metrics.cpp + observation.cpp + span.cpp +) + +_ydb_sdk_install_targets(TARGETS impl-observability) diff --git a/src/client/impl/observability/metrics.cpp b/src/client/impl/observability/metrics.cpp new file mode 100644 index 00000000000..8443c27f173 --- /dev/null +++ b/src/client/impl/observability/metrics.cpp @@ -0,0 +1,76 @@ +#include "metrics.h" + +#include "operation_name.h" + +#include + +#include + +namespace NYdb::inline V3::NObservability { + +namespace { + +void SafeLogRequestMetricsError(TLog& log, const char* message, std::exception_ptr exception) noexcept { + try { + if (!exception) { + LOG_LAZY(log, TLOG_ERR, std::string("TRequestMetrics: ") + message + ": (no active exception)"); + return; + } + try { + std::rethrow_exception(exception); + } catch (const std::exception& e) { + LOG_LAZY(log, TLOG_ERR, std::string("TRequestMetrics: ") + message + ": " + e.what()); + return; + } catch (...) { + } + LOG_LAZY(log, TLOG_ERR, std::string("TRequestMetrics: ") + message + ": (unknown)"); + } catch (...) 
{ + } +} + +} // namespace + +TRequestMetrics::TRequestMetrics(NSdkStats::TStatCollector::TClientOperationStatCollector* operationCollector + , const std::string& requestName + , const TLog& log +) : Collector_(operationCollector) + , RequestName_(NormalizeOperationName(requestName)) + , Log_(log) +{ + if (!Collector_) { + return; + } + try { + Collector_->IncRequestCount(RequestName_); + StartTime_ = std::chrono::steady_clock::now(); + } catch (...) { + SafeLogRequestMetricsError(Log_, "failed to initialize metrics", std::current_exception()); + Collector_ = nullptr; + } +} + +TRequestMetrics::~TRequestMetrics() noexcept { + End(EStatus::CLIENT_INTERNAL_ERROR); +} + +void TRequestMetrics::End(EStatus status) noexcept { + if (Ended_) { + return; + } + Ended_ = true; + + if (!Collector_) { + return; + } + + try { + auto elapsed = std::chrono::steady_clock::now() - StartTime_; + double durationSec = std::chrono::duration(elapsed).count(); + Collector_->RecordLatency(RequestName_, durationSec, status); + Collector_->IncErrorCount(RequestName_, status); + } catch (...) 
{ + SafeLogRequestMetricsError(Log_, "failed to record metrics", std::current_exception()); + } +} + +} // namespace NYdb::NObservability diff --git a/src/client/impl/observability/metrics.h b/src/client/impl/observability/metrics.h new file mode 100644 index 00000000000..07c91a08f30 --- /dev/null +++ b/src/client/impl/observability/metrics.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include + +#include + +#include +#include + +namespace NYdb::inline V3::NObservability { + +class TRequestMetrics { +public: + TRequestMetrics(NSdkStats::TStatCollector::TClientOperationStatCollector* operationCollector + , const std::string& requestName + , const TLog& log + ); + ~TRequestMetrics() noexcept; + + void End(EStatus status) noexcept; + +private: + NSdkStats::TStatCollector::TClientOperationStatCollector* Collector_ = nullptr; + std::string RequestName_; + std::chrono::steady_clock::time_point StartTime_{}; + bool Ended_ = false; + TLog Log_; +}; + +} // namespace NYdb::NObservability diff --git a/src/client/impl/observability/observation.cpp b/src/client/impl/observability/observation.cpp new file mode 100644 index 00000000000..6e7d9dee2d1 --- /dev/null +++ b/src/client/impl/observability/observation.cpp @@ -0,0 +1,44 @@ +#include "observation.h" + +#define INCLUDE_YDB_INTERNAL_H +#include +#undef INCLUDE_YDB_INTERNAL_H + +namespace NYdb::inline V3::NObservability { + +TRequestObservation::TRequestObservation(const std::string& ydbClientType + , NSdkStats::TStatCollector::TClientOperationStatCollector* operationCollector + , std::shared_ptr tracer + , const std::string& operationName + , const std::shared_ptr& dbDriverState +) : Span_( + std::make_shared(ydbClientType + , std::move(tracer) + , operationName + , dbDriverState + ) + ), Metrics_( + std::make_shared(operationCollector, operationName, dbDriverState->Log) + ) +{} + +void TRequestObservation::SetPeerEndpoint(const std::string& endpoint) noexcept { + if (Span_) { + Span_->SetPeerEndpoint(endpoint); + } +} + 
// Returns `requestName` in its canonical "ydb."-prefixed form.
//
// A name that already starts with "ydb." is returned unchanged; any other
// name (including the empty string) gets the prefix prepended.
//
// Not `noexcept`: the function allocates the returned string, so an
// allocation failure is reported to the caller as an exception instead of
// terminating the process. Callers that must not throw should invoke it
// inside their own try/catch.
inline std::string NormalizeOperationName(const std::string& requestName) {
    static constexpr std::string_view kPrefix = "ydb.";

    // compare() clips the count to the string length, so names shorter than
    // the prefix simply compare unequal.
    if (requestName.compare(0, kPrefix.size(), kPrefix) == 0) {
        return requestName;
    }

    std::string normalized;
    normalized.reserve(kPrefix.size() + requestName.size());
    normalized.append(kPrefix);
    normalized.append(requestName);
    return normalized;
}
file mode 100644 index 00000000000..73bbd42036d --- /dev/null +++ b/src/client/impl/observability/span.cpp @@ -0,0 +1,240 @@ +#include "span.h" + +#include "operation_name.h" + +#include + +#define INCLUDE_YDB_INTERNAL_H +#include +#undef INCLUDE_YDB_INTERNAL_H + +#include + +#include + +namespace NYdb::inline V3::NObservability { + +namespace { + +constexpr int DefaultGrpcPort = 2135; +constexpr const char* kRetryRootSpanName = "ydb.RunWithRetry"; +constexpr const char* kRetryAttemptSpanName = "ydb.Try"; + +std::string YdbClientApiAttributeValue(const std::string& clientType) noexcept { + return clientType.empty() ? std::string("Unspecified") : clientType; +} + +void ParseEndpoint(const std::string& endpoint, std::string& host, int& port) { + port = DefaultGrpcPort; + + if (endpoint.empty()) { + host = endpoint; + return; + } + + if (endpoint.front() == '[') { + auto bracketEnd = endpoint.find(']'); + if (bracketEnd != std::string::npos) { + host = endpoint.substr(1, bracketEnd - 1); + if (bracketEnd + 2 < endpoint.size() && endpoint[bracketEnd + 1] == ':') { + try { + port = std::stoi(endpoint.substr(bracketEnd + 2)); + } catch (...) {} + } + return; + } + } + + auto pos = endpoint.rfind(':'); + if (pos != std::string::npos) { + host = endpoint.substr(0, pos); + try { + port = std::stoi(endpoint.substr(pos + 1)); + } catch (...) {} + } else { + host = endpoint; + } +} + +void SafeLogRequestSpanError(TLog& log, const char* message, std::exception_ptr exception) noexcept { + try { + if (!exception) { + LOG_LAZY(log, TLOG_ERR, std::string("TRequestSpan: ") + message + ": (no active exception)"); + return; + } + try { + std::rethrow_exception(exception); + } catch (const std::exception& e) { + LOG_LAZY(log, TLOG_ERR, std::string("TRequestSpan: ") + message + ": " + e.what()); + return; + } catch (...) { + } + LOG_LAZY(log, TLOG_ERR, std::string("TRequestSpan: ") + message + ": (unknown)"); + } catch (...) 
{ + } +} + +} // namespace + +std::shared_ptr TRequestSpan::CreateForClientRetry(const std::string& ydbClientType + , std::shared_ptr tracer + , const std::shared_ptr& dbDriverState +) { + return std::make_shared( + ydbClientType, + std::move(tracer), + kRetryRootSpanName, + dbDriverState, + NTrace::ESpanKind::INTERNAL + ); +} + +std::shared_ptr TRequestSpan::CreateForRetryAttempt(const std::string& ydbClientType + , std::shared_ptr tracer + , const std::shared_ptr& dbDriverState + , std::uint32_t attempt + , std::int64_t backoffMs +) { + auto span = std::make_shared( + ydbClientType, + std::move(tracer), + kRetryAttemptSpanName, + dbDriverState, + NTrace::ESpanKind::INTERNAL + ); + if (span && span->Span_) { + try { + span->Span_->SetAttribute("ydb.retry.attempt", static_cast(attempt)); + span->Span_->SetAttribute("ydb.retry.backoff_ms", backoffMs); + } catch (...) { + SafeLogRequestSpanError(span->Log_, "failed to set retry attributes", std::current_exception()); + } + } + return span; +} + +TRequestSpan::TRequestSpan(const std::string& ydbClientType + , std::shared_ptr tracer + , const std::string& requestName + , const std::string& discoveryEndpoint + , const std::string& database + , const TLog& log + , NTrace::ESpanKind kind +) : Log_(log) { + if (!tracer) { + return; + } + + std::string host; + int port; + ParseEndpoint(discoveryEndpoint, host, port); + + try { + const auto operationName = NormalizeOperationName(requestName); + Span_ = tracer->StartSpan(operationName, kind); + if (!Span_) { + return; + } + Span_->SetAttribute("db.system.name", "ydb"); + Span_->SetAttribute("db.namespace", database); + Span_->SetAttribute("db.operation.name", operationName); + Span_->SetAttribute("ydb.client.api", YdbClientApiAttributeValue(ydbClientType)); + Span_->SetAttribute("server.address", host); + Span_->SetAttribute("server.port", static_cast(port)); + } catch (...) 
{ + SafeLogRequestSpanError(Log_, "failed to initialize span", std::current_exception()); + Span_.reset(); + } +} + +TRequestSpan::TRequestSpan(const std::string& ydbClientType + , std::shared_ptr tracer + , const std::string& requestName + , const std::shared_ptr& dbDriverState + , NTrace::ESpanKind kind +): TRequestSpan(ydbClientType, + std::move(tracer), + requestName, + dbDriverState->DiscoveryEndpoint, + dbDriverState->Database, + dbDriverState->Log, + kind +) {} + +TRequestSpan::~TRequestSpan() noexcept { + if (Span_) { + try { + Span_->End(); + } catch (...) { + SafeLogRequestSpanError(Log_, "failed to end span", std::current_exception()); + } + } +} + +void TRequestSpan::SetPeerEndpoint(const std::string& endpoint) noexcept { + if (!Span_ || endpoint.empty()) { + return; + } + try { + std::string host; + int port; + ParseEndpoint(endpoint, host, port); + Span_->SetAttribute("network.peer.address", host); + Span_->SetAttribute("network.peer.port", static_cast(port)); + } catch (...) { + SafeLogRequestSpanError(Log_, "failed to set peer endpoint", std::current_exception()); + } +} + +void TRequestSpan::AddEvent(const std::string& name, const std::map& attributes) noexcept { + if (!Span_) { + return; + } + try { + Span_->AddEvent(name, attributes); + } catch (...) { + SafeLogRequestSpanError(Log_, "failed to add event", std::current_exception()); + } +} + +void TRequestSpan::RecordException(const std::string& type, const std::string& message, const std::string& stacktrace) noexcept { + if (!Span_) { + return; + } + try { + Span_->RecordException(type, message, stacktrace); + } catch (...) { + SafeLogRequestSpanError(Log_, "failed to record exception", std::current_exception()); + } +} + +std::unique_ptr TRequestSpan::Activate() noexcept { + if (!Span_) { + return nullptr; + } + try { + return Span_->Activate(); + } catch (...) 
{ + SafeLogRequestSpanError(Log_, "failed to activate span", std::current_exception()); + return nullptr; + } +} + +void TRequestSpan::End(EStatus status) noexcept { + if (Span_) { + try { + if (status != EStatus::SUCCESS) { + const auto statusName = ToString(status); + Span_->SetAttribute("db.response.status_code", statusName); + Span_->SetAttribute("error.type", statusName); + Span_->RecordException(statusName, statusName); + } + Span_->End(); + } catch (...) { + SafeLogRequestSpanError(Log_, "failed to finalize span", std::current_exception()); + } + Span_.reset(); + } +} + +} // namespace NYdb::NObservability diff --git a/src/client/impl/observability/span.h b/src/client/impl/observability/span.h new file mode 100644 index 00000000000..2d19280392f --- /dev/null +++ b/src/client/impl/observability/span.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace NYdb::inline V3 { + +class TDbDriverState; + +} // namespace NYdb::inline V3 + +namespace NYdb::inline V3::NObservability { + +class TRequestSpan { +public: + TRequestSpan(const std::string& ydbClientType + , std::shared_ptr tracer + , const std::string& requestName + , const std::string& discoveryEndpoint + , const std::string& database + , const TLog& log = TLog() + , NTrace::ESpanKind kind = NTrace::ESpanKind::CLIENT + ); + + TRequestSpan(const std::string& ydbClientType + , std::shared_ptr tracer + , const std::string& requestName + , const std::shared_ptr& dbDriverState + , NTrace::ESpanKind kind = NTrace::ESpanKind::CLIENT + ); + + static std::shared_ptr CreateForClientRetry( + const std::string& ydbClientType + , std::shared_ptr tracer + , const std::shared_ptr& dbDriverState + ); + + static std::shared_ptr CreateForRetryAttempt(const std::string& ydbClientType + , std::shared_ptr tracer + , const std::shared_ptr& dbDriverState + , std::uint32_t attempt + , std::int64_t backoffMs + ); + + ~TRequestSpan() noexcept; + + void 
SetPeerEndpoint(const std::string& endpoint) noexcept; + void AddEvent(const std::string& name, const std::map& attributes = {}) noexcept; + void RecordException(const std::string& type, const std::string& message, const std::string& stacktrace = {}) noexcept; + std::unique_ptr Activate() noexcept; + + void End(EStatus status) noexcept; + +private: + TLog Log_; + std::shared_ptr Span_; +}; + +} // namespace NYdb::NObservability diff --git a/src/client/impl/session/session_pool.cpp b/src/client/impl/session/session_pool.cpp index 877566a34fe..21774148587 100644 --- a/src/client/impl/session/session_pool.cpp +++ b/src/client/impl/session/session_pool.cpp @@ -202,6 +202,7 @@ void TSessionPool::ClearOldWaiters() { for (auto& waiter : oldWaiters) { FakeSessionsCounter_.Inc(); + ExternalStatCollector_.IncConnectionTimeouts(); waiter->ReplyError(CLIENT_RESOURCE_EXHAUSTED_ACTIVE_SESSION_LIMIT); } @@ -338,6 +339,7 @@ TPeriodicCb TSessionPool::CreatePeriodicTask(std::weak_ptr weakC for (auto& waiter : waitersToReplyError) { FakeSessionsCounter_.Inc(); + ExternalStatCollector_.IncConnectionTimeouts(); waiter->ReplyError(CLIENT_RESOURCE_EXHAUSTED_ACTIVE_SESSION_LIMIT); } } @@ -392,12 +394,26 @@ void TSessionPool::SetStatCollector(NSdkStats::TStatCollector::TSessionPoolStatC InPoolSessionsCounter_.Set(statCollector.InPoolSessions); FakeSessionsCounter_.Set(statCollector.FakeSessions); SessionWaiterCounter_.Set(statCollector.Waiters); + ExternalStatCollector_ = std::move(statCollector); + // Publish an initial zeroed state for OTel gauges so that users see the series + // appear as soon as a client starts, even before any session activity. 
+ ExternalStatCollector_.UpdateConnectionCount(ActiveSessions_ + static_cast(Sessions_.size())); + ExternalStatCollector_.UpdatePendingRequests(WaitersQueue_.Size()); +} + +void TSessionPool::RecordConnectionCreateTime(double seconds) { + ExternalStatCollector_.RecordConnectionCreateTime(seconds); } void TSessionPool::UpdateStats() { ActiveSessionsCounter_.Apply(ActiveSessions_); InPoolSessionsCounter_.Apply(Sessions_.size()); SessionWaiterCounter_.Apply(WaitersQueue_.Size()); + // Export connection pool state via OpenTelemetry metrics as well. + // session == connection in YDB, so connection.count includes both in-use + // (active) sessions and idle sessions still living in the pool. + ExternalStatCollector_.UpdateConnectionCount(ActiveSessions_ + static_cast(Sessions_.size())); + ExternalStatCollector_.UpdatePendingRequests(WaitersQueue_.Size()); } } diff --git a/src/client/impl/session/session_pool.h b/src/client/impl/session/session_pool.h index 2499515fd9d..c3abf1b988c 100644 --- a/src/client/impl/session/session_pool.h +++ b/src/client/impl/session/session_pool.h @@ -128,6 +128,10 @@ class TSessionPool : public IServerCloseHandler { void Drain(std::function&&)> cb, bool close); void SetStatCollector(NSdkStats::TStatCollector::TSessionPoolStatCollector collector); + // Records time spent creating a new connection (session). To be called by the + // client after a CreateSession RPC completes. 
+ void RecordConnectionCreateTime(double seconds); + void OnCloseSession(const TKqpSessionCommon*, std::shared_ptr client) override; private: @@ -146,6 +150,7 @@ class TSessionPool : public IServerCloseHandler { NSdkStats::TSessionCounter InPoolSessionsCounter_; NSdkStats::TSessionCounter SessionWaiterCounter_; NSdkStats::TAtomicCounter<::NMonitoring::TRate> FakeSessionsCounter_; + NSdkStats::TStatCollector::TSessionPoolStatCollector ExternalStatCollector_; }; } diff --git a/src/client/impl/stats/CMakeLists.txt b/src/client/impl/stats/CMakeLists.txt index 498104196cd..15866af4bc6 100644 --- a/src/client/impl/stats/CMakeLists.txt +++ b/src/client/impl/stats/CMakeLists.txt @@ -4,6 +4,7 @@ target_link_libraries(client-impl-ydb_stats PUBLIC yutil grpc-client monlib-metrics + client-metrics ) target_sources(client-impl-ydb_stats PRIVATE diff --git a/src/client/impl/stats/stats.h b/src/client/impl/stats/stats.h index d545764c887..d3e53950153 100644 --- a/src/client/impl/stats/stats.h +++ b/src/client/impl/stats/stats.h @@ -1,17 +1,24 @@ #pragma once #include +#include #include #include #include #include +#include #include +#include namespace NYdb::inline V3 { namespace NSdkStats { +inline std::string YdbClientApiAttributeValue(const std::string& clientType) { + return clientType.empty() ? 
std::string("Unspecified") : clientType; +} + // works only for case normal (foo_bar) underscore inline std::string UnderscoreToUpperCamel(const std::string& in) { @@ -179,17 +186,93 @@ struct TStatCollector { TSessionPoolStatCollector(::NMonitoring::TIntGauge* activeSessions = nullptr , ::NMonitoring::TIntGauge* inPoolSessions = nullptr , ::NMonitoring::TRate* fakeSessions = nullptr - , ::NMonitoring::TIntGauge* waiters = nullptr) + , ::NMonitoring::TIntGauge* waiters = nullptr + , std::shared_ptr externalRegistry = {} + , std::string database = {} + , std::string clientType = {}) : ActiveSessions(activeSessions) , InPoolSessions(inPoolSessions) , FakeSessions(fakeSessions) , Waiters(waiters) + , ExternalRegistry_(std::move(externalRegistry)) + , Database_(std::move(database)) + , ClientType_(std::move(clientType)) { } ::NMonitoring::TIntGauge* ActiveSessions; ::NMonitoring::TIntGauge* InPoolSessions; ::NMonitoring::TRate* FakeSessions; ::NMonitoring::TIntGauge* Waiters; + + // OpenTelemetry connection pool metrics (session == connection in YDB). 
+ // See https://opentelemetry.io/docs/specs/semconv/database/database-metrics/#connection-pool + + void UpdateConnectionCount(std::int64_t value) { + if (!ExternalRegistry_) { + return; + } + ExternalRegistry_->Gauge( + "db.client.connection.count", + ConnectionPoolLabels(), + "The number of connections that are currently in state described by the state attribute.", + "{connection}" + )->Set(static_cast(value)); + } + + void UpdatePendingRequests(std::int64_t value) { + if (!ExternalRegistry_) { + return; + } + ExternalRegistry_->Gauge( + "db.client.connection.pending_requests", + ConnectionPoolLabels(), + "The number of current pending requests for an open connection.", + "{request}" + )->Set(static_cast(value)); + } + + void IncConnectionTimeouts() { + if (!ExternalRegistry_) { + return; + } + ExternalRegistry_->Counter( + "db.client.connection.timeouts", + ConnectionPoolLabels(), + "The number of connection timeouts that have occurred trying to obtain a connection from the pool.", + "{timeout}" + )->Inc(); + } + + void RecordConnectionCreateTime(double seconds) { + if (!ExternalRegistry_) { + return; + } + ExternalRegistry_->Histogram( + "db.client.connection.create_time", + {0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10}, + ConnectionPoolLabels(), + "The time it took to create a new connection.", + "s" + )->Record(seconds); + } + + bool HasExternalRegistry() const { + return static_cast(ExternalRegistry_); + } + + private: + NMetrics::TLabels ConnectionPoolLabels() const { + return { + {"db.system.name", "ydb"}, + {"db.namespace", Database_}, + {"db.client.connection.pool.name", YdbClientApiAttributeValue(ClientType_)}, + {"ydb.client.api", YdbClientApiAttributeValue(ClientType_)}, + }; + } + + std::shared_ptr ExternalRegistry_; + std::string Database_; + std::string ClientType_; }; struct TClientRetryOperationStatCollector { @@ -226,6 +309,104 @@ struct TStatCollector { std::string ClientType_; }; + struct TClientOperationStatCollector { + 
TClientOperationStatCollector() + : MetricRegistry_() + {} + + TClientOperationStatCollector(::NMonitoring::TMetricRegistry* registry, + const std::string& database, + const std::string& clientType, + std::shared_ptr externalRegistry = {}) + : MetricRegistry_(registry) + , ExternalRegistry_(std::move(externalRegistry)) + , Database_(database) + , ClientType_(clientType) + {} + + void IncRequestCount(const std::string& operationName) { + if (auto registry = MetricRegistry_.Get()) { + registry->Rate({ + {"database", Database_}, + {"ydb_client", ClientType_}, + {"operation", operationName}, + {"sensor", "Request/Operations"} + })->Inc(); + } + } + + void IncErrorCount(const std::string& operationName, EStatus status) { + if (status == EStatus::SUCCESS) { + return; + } + if (auto registry = MetricRegistry_.Get()) { + registry->Rate({ + {"database", Database_}, + {"ydb_client", ClientType_}, + {"operation", operationName}, + {"status", TStringBuilder() << status}, + {"sensor", "Request/OperationErrors"} + })->Inc(); + } + if (ExternalRegistry_) { + const std::string clientApi = YdbClientApiAttributeValue(ClientType_); + const std::string statusName = TStringBuilder() << status; + NMetrics::TLabels labels = { + {"db.system.name", "ydb"}, + {"db.namespace", Database_}, + {"db.operation.name", operationName}, + {"ydb.client.api", clientApi}, + {"db.response.status_code", statusName}, + {"error.type", statusName}, + }; + ExternalRegistry_->Counter( + "db.client.operation.failed", + labels, + "Number of database client operations that failed.", + "{operation}" + )->Inc(); + } + } + + void RecordLatency(const std::string& operationName, double durationSeconds, EStatus status) { + if (auto registry = MetricRegistry_.Get()) { + registry->HistogramRate({ + {"database", Database_}, + {"ydb_client", ClientType_}, + {"operation", operationName}, + {"sensor", "Request/OperationLatencyMs"} + }, ::NMonitoring::ExponentialHistogram(20, 2, 1))->Record( + static_cast(durationSeconds * 
1000.0)); + } + if (ExternalRegistry_) { + NMetrics::TLabels labels = { + {"db.system.name", "ydb"}, + {"db.namespace", Database_}, + {"db.operation.name", operationName}, + {"ydb.client.api", YdbClientApiAttributeValue(ClientType_)}, + }; + if (status != EStatus::SUCCESS) { + const std::string statusName = TStringBuilder() << status; + labels["db.response.status_code"] = statusName; + labels["error.type"] = statusName; + } + ExternalRegistry_->Histogram( + "db.client.operation.duration", + {0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10}, + labels, + "Duration of database client operations.", + "s" + )->Record(durationSeconds); + } + } + + private: + TAtomicPointer<::NMonitoring::TMetricRegistry> MetricRegistry_; + std::shared_ptr ExternalRegistry_; + std::string Database_; + std::string ClientType_; + }; + struct TClientStatCollector { TClientStatCollector(::NMonitoring::TRate* cacheMiss = nullptr @@ -233,13 +414,15 @@ struct TStatCollector { , ::NMonitoring::THistogram* paramsSize = nullptr , ::NMonitoring::TRate* sessionRemoved = nullptr , ::NMonitoring::TRate* requestMigrated = nullptr - , TClientRetryOperationStatCollector retryOperationStatCollector = TClientRetryOperationStatCollector()) + , TClientRetryOperationStatCollector retryOperationStatCollector = TClientRetryOperationStatCollector() + , TClientOperationStatCollector operationStatCollector = TClientOperationStatCollector()) : CacheMiss(cacheMiss) , QuerySize(querySize) , ParamsSize(paramsSize) , SessionRemovedDueBalancing(sessionRemoved) , RequestMigrated(requestMigrated) , RetryOperationStatCollector(retryOperationStatCollector) + , OperationStatCollector(operationStatCollector) { } ::NMonitoring::TRate* CacheMiss; @@ -248,11 +431,15 @@ struct TStatCollector { ::NMonitoring::TRate* SessionRemovedDueBalancing; ::NMonitoring::TRate* RequestMigrated; TClientRetryOperationStatCollector RetryOperationStatCollector; + TClientOperationStatCollector OperationStatCollector; }; - TStatCollector(const 
std::string& database, TMetricRegistry* sensorsRegistry) - : Database_(database) + TStatCollector(const std::string& database + , TMetricRegistry* sensorsRegistry + , std::shared_ptr externalMetricRegistry = {} + ) : Database_(database) , DatabaseLabel_({"database", database}) + , ExternalMetricRegistry_(std::move(externalMetricRegistry)) { if (sensorsRegistry) { SetMetricRegistry(sensorsRegistry); @@ -349,10 +536,12 @@ struct TStatCollector { auto waiters = registry->IntGauge({ DatabaseLabel_, {"ydb_client", clientType}, {"sensor", "Sessions/WaitForReturn"} }); - return TSessionPoolStatCollector(activeSessions, inPoolSessions, fakeSessions, waiters); + return TSessionPoolStatCollector(activeSessions, inPoolSessions, fakeSessions, waiters, + ExternalMetricRegistry_, Database_, clientType); } - return TSessionPoolStatCollector(); + return TSessionPoolStatCollector(nullptr, nullptr, nullptr, nullptr, + ExternalMetricRegistry_, Database_, clientType); } TClientStatCollector GetClientStatCollector(const std::string& clientType) { @@ -376,10 +565,13 @@ struct TStatCollector { {"sensor", "Request/ParamsSize"} }, ::NMonitoring::ExponentialHistogram(10, 2, 32)); return TClientStatCollector(cacheMiss, querySize, paramsSize, sessionRemovedDueBalancing, requestMigrated, - TClientRetryOperationStatCollector(MetricRegistryPtr_.Get(), Database_, clientType)); + TClientRetryOperationStatCollector(MetricRegistryPtr_.Get(), Database_, clientType), + TClientOperationStatCollector(MetricRegistryPtr_.Get(), Database_, clientType, ExternalMetricRegistry_)); } - return TClientStatCollector(); + return TClientStatCollector(nullptr, nullptr, nullptr, nullptr, nullptr, + TClientRetryOperationStatCollector(nullptr, Database_, clientType), + TClientOperationStatCollector(nullptr, Database_, clientType, ExternalMetricRegistry_)); } bool IsCollecting() { @@ -397,6 +589,7 @@ struct TStatCollector { private: const std::string Database_; const ::NMonitoring::TLabel DatabaseLabel_; + 
std::shared_ptr ExternalMetricRegistry_; TAtomicPointer MetricRegistryPtr_; TAtomicCounter<::NMonitoring::TRate> DiscoveryDuePessimization_; TAtomicCounter<::NMonitoring::TRate> DiscoveryDueExpiration_; diff --git a/src/client/metrics/CMakeLists.txt b/src/client/metrics/CMakeLists.txt new file mode 100644 index 00000000000..e681a846b26 --- /dev/null +++ b/src/client/metrics/CMakeLists.txt @@ -0,0 +1,7 @@ +_ydb_sdk_add_library(client-metrics) + +target_sources(client-metrics PRIVATE + metrics.cpp +) + +_ydb_sdk_make_client_component(Metrics client-metrics) diff --git a/src/client/metrics/metrics.cpp b/src/client/metrics/metrics.cpp new file mode 100644 index 00000000000..341917291bb --- /dev/null +++ b/src/client/metrics/metrics.cpp @@ -0,0 +1 @@ +#include diff --git a/src/client/query/CMakeLists.txt b/src/client/query/CMakeLists.txt index 6677d402d4d..bc159ea87ab 100644 --- a/src/client/query/CMakeLists.txt +++ b/src/client/query/CMakeLists.txt @@ -11,6 +11,7 @@ target_link_libraries(client-ydb_query PUBLIC client-ydb_driver client-ydb_query-impl client-ydb_result + client-metrics client-types-operation api-protos api-grpc diff --git a/src/client/query/client.cpp b/src/client/query/client.cpp index ccf90f1175c..ec68da477eb 100644 --- a/src/client/query/client.cpp +++ b/src/client/query/client.cpp @@ -14,8 +14,10 @@ #include #include +#include #include #include +#include #include @@ -23,6 +25,7 @@ namespace NYdb::inline V3::NQuery { +using TQueryObservation = NObservability::TRequestObservation; using TRetryContextResultAsync = NRetry::Async::TRetryContext; using TRetryContextAsync = NRetry::Async::TRetryContext; @@ -67,6 +70,10 @@ class TQueryClient::TImpl: public TClientImplCommon, public { SetStatCollector(DbDriverState_->StatCollector.GetClientStatCollector("Query")); SessionPool_.SetStatCollector(DbDriverState_->StatCollector.GetSessionPoolStatCollector("Query")); + + if (auto traceProvider = Connections_->GetTraceProvider()) { + Tracer_ = 
traceProvider->GetTracer("ydb-cpp-sdk-query"); + } } ~TImpl() { @@ -77,6 +84,7 @@ class TQueryClient::TImpl: public TClientImplCommon, public QuerySizeHistogram_.Set(collector.QuerySize); ParamsSizeHistogram_.Set(collector.ParamsSize); RetryOperationStatCollector_ = collector.RetryOperationStatCollector; + OperationStatCollector_ = collector.OperationStatCollector; } TAsyncExecuteQueryIterator StreamExecuteQuery(const std::string& query, const TTxControl& txControl, @@ -94,8 +102,24 @@ class TQueryClient::TImpl: public TClientImplCommon, public { CollectQuerySize(query); CollectParamsSize(params ? ¶ms->GetProtoMap() : nullptr); + + auto obs = MakeObservation("ExecuteQuery"); + std::string sessionEndpoint = session.has_value() ? session->SessionImpl_->GetEndpoint() : std::string{}; + return TExecQueryImpl::ExecuteQuery( - Connections_, DbDriverState_, query, txControl, params, settings, session); + Connections_, DbDriverState_, query, txControl, params, settings, session) + .Apply([obs, sessionEndpoint = std::move(sessionEndpoint)](TAsyncExecuteQueryResult future) { + try { + auto result = future.GetValue(); + const auto& resultEndpoint = result.GetEndpoint(); + obs->SetPeerEndpoint(!resultEndpoint.empty() ? resultEndpoint : sessionEndpoint); + obs->End(result.GetStatus()); + return result; + } catch (...) 
{ + obs->EndWithClientInternalError(); + throw; + } + }); } NThreading::TFuture ExecuteScript(const std::string& script, const std::optional& params, const TExecuteScriptSettings& settings) { @@ -162,20 +186,27 @@ class TQueryClient::TImpl: public TClientImplCommon, public auto promise = NThreading::NewPromise(); - auto responseCb = [promise, session] + auto obs = MakeObservation("Rollback"); + + auto responseCb = [promise, session, obs] (Ydb::Query::RollbackTransactionResponse* response, TPlainStatus status) mutable { try { + obs->SetPeerEndpoint(status.Endpoint); if (response) { NYdb::NIssue::TIssues opIssues; NYdb::NIssue::IssuesFromMessage(response->issues(), opIssues); TStatus rollbackTxStatus(TPlainStatus{static_cast(response->status()), std::move(opIssues), status.Endpoint, std::move(status.Metadata)}); + obs->End(rollbackTxStatus.GetStatus()); + promise.SetValue(std::move(rollbackTxStatus)); } else { + obs->End(status.Status); promise.SetValue(TStatus(std::move(status))); } } catch (...) 
{ + obs->EndWithClientInternalError(); promise.SetException(std::current_exception()); } }; @@ -203,21 +234,28 @@ class TQueryClient::TImpl: public TClientImplCommon, public auto promise = NThreading::NewPromise(); - auto responseCb = [promise, session] + auto obs = MakeObservation("Commit"); + + auto responseCb = [promise, session, obs] (Ydb::Query::CommitTransactionResponse* response, TPlainStatus status) mutable { try { + obs->SetPeerEndpoint(status.Endpoint); if (response) { NYdb::NIssue::TIssues opIssues; NYdb::NIssue::IssuesFromMessage(response->issues(), opIssues); TStatus commitTxStatus(TPlainStatus{static_cast(response->status()), std::move(opIssues), status.Endpoint, std::move(status.Metadata)}); + obs->End(commitTxStatus.GetStatus()); + TCommitTransactionResult commitTxResult(std::move(commitTxStatus)); promise.SetValue(std::move(commitTxResult)); } else { + obs->End(status.Status); promise.SetValue(TCommitTransactionResult(TStatus(std::move(status)))); } } catch (...) { + obs->EndWithClientInternalError(); promise.SetException(std::current_exception()); } }; @@ -395,8 +433,12 @@ class TQueryClient::TImpl: public TClientImplCommon, public auto promise = NThreading::NewPromise(); auto self = shared_from_this(); + const auto createStartTime = std::chrono::steady_clock::now(); - auto extractor = [promise, self] (Ydb::Query::CreateSessionResponse* resp, TPlainStatus status) mutable { + auto extractor = [promise, self, createStartTime] (Ydb::Query::CreateSessionResponse* resp, TPlainStatus status) mutable { + const double elapsedSec = + std::chrono::duration(std::chrono::steady_clock::now() - createStartTime).count(); + self->SessionPool_.RecordConnectionCreateTime(elapsedSec); if (resp) { if (resp->status() != Ydb::StatusIds::SUCCESS) { NYdb::NIssue::TIssues opIssues; @@ -425,10 +467,12 @@ class TQueryClient::TImpl: public TClientImplCommon, public TAsyncCreateSessionResult GetSession(const TCreateSessionSettings& settings) { class TQueryClientGetSessionCtx 
: public NSessionPool::IGetSessionCtx { public: - TQueryClientGetSessionCtx(std::shared_ptr client, const TCreateSessionSettings& settings) + TQueryClientGetSessionCtx(std::shared_ptr client, const TCreateSessionSettings& settings, + std::shared_ptr observation) : Promise(NThreading::NewPromise()) , Client(client) , RpcSettings(TRpcRequestSettings::Make(settings)) + , Observation(std::move(observation)) {} TAsyncCreateSessionResult GetFuture() { @@ -437,6 +481,9 @@ class TQueryClient::TImpl: public TClientImplCommon, public void ReplyError(TStatus status) override { TSession session; + if (Observation) { + Observation->End(status.GetStatus()); + } ScheduleReply(TCreateSessionResult(std::move(status), std::move(session))); } @@ -449,14 +496,23 @@ class TQueryClient::TImpl: public TClientImplCommon, public ) ); + if (Observation) { + Observation->SetPeerEndpoint(session->GetEndpoint()); + Observation->End(EStatus::SUCCESS); + } ScheduleReply(std::move(val)); } void ReplyNewSession() override { Client->CreateAttachedSession(RpcSettings).Subscribe( - [promise{std::move(Promise)}](TAsyncCreateSessionResult future) mutable + [promise{std::move(Promise)}, obs = Observation](TAsyncCreateSessionResult future) mutable { - promise.SetValue(future.ExtractValue()); + auto val = future.ExtractValue(); + if (obs) { + obs->SetPeerEndpoint(val.GetEndpoint()); + obs->End(val.GetStatus()); + } + promise.SetValue(std::move(val)); }); } @@ -481,9 +537,11 @@ class TQueryClient::TImpl: public TClientImplCommon, public NThreading::TPromise Promise; std::shared_ptr Client; const TRpcRequestSettings RpcSettings; + std::shared_ptr Observation; }; - auto ctx = std::make_unique(shared_from_this(), settings); + auto obs = MakeObservation("CreateSession"); + auto ctx = std::make_unique(shared_from_this(), settings, obs); auto future = ctx->GetFuture(); SessionPool_.GetSession(std::move(ctx)); @@ -527,6 +585,24 @@ class TQueryClient::TImpl: public TClientImplCommon, public ), 
NSessionPool::PERIODIC_ACTION_INTERVAL); } + std::shared_ptr CreateRetryRootSpan() { + return NObservability::TRequestSpan::CreateForClientRetry( + "Query", + Tracer_, + DbDriverState_ + ); + } + + std::shared_ptr CreateRetryAttemptSpan(std::uint32_t attempt, std::int64_t backoffMs) { + return NObservability::TRequestSpan::CreateForRetryAttempt( + "Query", + Tracer_, + DbDriverState_, + attempt, + backoffMs + ); + } + void CollectRetryStatAsync(EStatus status) { RetryOperationStatCollector_.IncAsyncRetryOperation(status); } @@ -552,6 +628,18 @@ class TQueryClient::TImpl: public TClientImplCommon, public } private: + std::shared_ptr MakeObservation(const std::string& operationName) { + return std::make_shared( + "Query", + &OperationStatCollector_, + Tracer_, + operationName, + DbDriverState_ + ); + } + + std::shared_ptr Tracer_; + NSdkStats::TStatCollector::TClientOperationStatCollector OperationStatCollector_; NSdkStats::TStatCollector::TClientRetryOperationStatCollector RetryOperationStatCollector_; NSdkStats::TAtomicHistogram<::NMonitoring::THistogram> QuerySizeHistogram_; NSdkStats::TAtomicHistogram<::NMonitoring::THistogram> ParamsSizeHistogram_; @@ -642,13 +730,15 @@ TAsyncStatus TQueryClient::RetryQuery(TQueryWithoutSessionFunc&& queryFunc, TRet } TStatus TQueryClient::RetryQuerySync(const TQuerySyncFunc& queryFunc, TRetryOperationSettings settings) { - NRetry::Sync::TRetryWithSession ctx(*this, queryFunc, settings); - return ctx.Execute(); + return NRetry::Sync::RunSyncRetryWithParentSpan( + Impl_, + NRetry::Sync::TRetryWithSession(*this, queryFunc, settings)); } TStatus TQueryClient::RetryQuerySync(const TQueryWithoutSessionSyncFunc& queryFunc, TRetryOperationSettings settings) { - NRetry::Sync::TRetryWithoutSession ctx(*this, queryFunc, settings); - return ctx.Execute(); + return NRetry::Sync::RunSyncRetryWithParentSpan( + Impl_, + NRetry::Sync::TRetryWithoutSession(*this, queryFunc, settings)); } TAsyncExecuteQueryResult TQueryClient::RetryQuery(const 
std::string& query, const TTxControl& txControl, diff --git a/src/client/query/impl/CMakeLists.txt b/src/client/query/impl/CMakeLists.txt index 76b112b2254..d33258b7afd 100644 --- a/src/client/query/impl/CMakeLists.txt +++ b/src/client/query/impl/CMakeLists.txt @@ -9,6 +9,10 @@ target_link_libraries(client-ydb_query-impl PUBLIC client-ydb_result ) +target_link_libraries(client-ydb_query-impl PUBLIC + impl-observability +) + target_sources(client-ydb_query-impl PRIVATE exec_query.cpp client_session.cpp diff --git a/src/client/table/impl/CMakeLists.txt b/src/client/table/impl/CMakeLists.txt index 8f53d386fc6..8ecfe4ead87 100644 --- a/src/client/table/impl/CMakeLists.txt +++ b/src/client/table/impl/CMakeLists.txt @@ -10,6 +10,8 @@ target_link_libraries(client-ydb_table-impl client-impl-ydb_endpoints impl-session client-ydb_table-query_stats + client-metrics + impl-observability PRIVATE OpenSSL::SSL ) diff --git a/src/client/table/impl/table_client.cpp b/src/client/table/impl/table_client.cpp index 4df9e91e24e..1d78b141537 100644 --- a/src/client/table/impl/table_client.cpp +++ b/src/client/table/impl/table_client.cpp @@ -22,14 +22,39 @@ TTableClient::TImpl::TImpl(std::shared_ptr&& connections, , Settings_(settings) , SessionPool_(Settings_.SessionPoolSettings_.MaxActiveSessions_) { + auto clientCollector = DbDriverState_->StatCollector.GetClientStatCollector("Table"); + OperationStatCollector_ = clientCollector.OperationStatCollector; + + if (auto traceProvider = Connections_->GetTraceProvider()) { + Tracer_ = traceProvider->GetTracer("ydb-cpp-sdk-table"); + } + if (!DbDriverState_->StatCollector.IsCollecting()) { return; } - SetStatCollector(DbDriverState_->StatCollector.GetClientStatCollector("Table")); + SetStatCollector(clientCollector); SessionPool_.SetStatCollector(DbDriverState_->StatCollector.GetSessionPoolStatCollector("Table")); } +std::shared_ptr TTableClient::TImpl::CreateRetryRootSpan() { + return NObservability::TRequestSpan::CreateForClientRetry( + 
"Table", + Tracer_, + DbDriverState_ + ); +} + +std::shared_ptr TTableClient::TImpl::CreateRetryAttemptSpan(std::uint32_t attempt, std::int64_t backoffMs) { + return NObservability::TRequestSpan::CreateForRetryAttempt( + "Table", + Tracer_, + DbDriverState_, + attempt, + backoffMs + ); +} + TTableClient::TImpl::~TImpl() { if (Connections_->GetDrainOnDtors()) { Drain().Wait(); @@ -378,8 +403,10 @@ TAsyncCreateSessionResult TTableClient::TImpl::CreateSession(const TCreateSessio auto createSessionPromise = NewPromise(); auto self = shared_from_this(); + auto obs = MakeObservation("CreateSession"); + const auto createStartTime = std::chrono::steady_clock::now(); - auto createSessionExtractor = [createSessionPromise, self, standalone] + auto createSessionExtractor = [createSessionPromise, self, standalone, obs, createStartTime] (google::protobuf::Any* any, TPlainStatus status) mutable { Ydb::Table::CreateSessionResult result; if (any) { @@ -395,7 +422,12 @@ TAsyncCreateSessionResult TTableClient::TImpl::CreateSession(const TCreateSessio // We do not use SessionStatusInterception for CreateSession request session.SessionImpl_->MarkBroken(); } + obs->SetPeerEndpoint(status.Endpoint); + const double elapsedSec = + std::chrono::duration(std::chrono::steady_clock::now() - createStartTime).count(); + self->SessionPool_.RecordConnectionCreateTime(elapsedSec); TCreateSessionResult val(TStatus(std::move(status)), std::move(session)); + obs->End(val.GetStatus()); createSessionPromise.SetValue(std::move(val)); }; @@ -759,11 +791,20 @@ TAsyncStatus TTableClient::TImpl::ExecuteSchemeQuery(const TSession& session, co request.set_session_id(TStringType{session.GetId()}); request.set_yql_text(TStringType{query}); - return RunSimple( + auto obs = MakeObservation("ExecuteSchemeQuery"); + + auto future = RunSimple( std::move(request), &Ydb::Table::V1::TableService::Stub::AsyncExecuteSchemeQuery, rpcSettings ); + + return future.Apply([obs](NThreading::TFuture f) mutable { + auto status = 
f.ExtractValue(); + obs->SetPeerEndpoint(status.GetEndpoint()); + obs->End(status.GetStatus()); + return status; + }); } TAsyncBeginTransactionResult TTableClient::TImpl::BeginTransaction(const TSession& session, const TTxSettings& txSettings, @@ -776,9 +817,11 @@ TAsyncBeginTransactionResult TTableClient::TImpl::BeginTransaction(const TSessio request.set_session_id(TStringType{session.GetId()}); SetTxSettings(txSettings, request.mutable_tx_settings()); + auto obs = MakeObservation("BeginTransaction"); + auto promise = NewPromise(); - auto extractor = [promise, session] + auto extractor = [promise, session, obs] (google::protobuf::Any* any, TPlainStatus status) mutable { std::string txId; if (any) { @@ -787,8 +830,10 @@ TAsyncBeginTransactionResult TTableClient::TImpl::BeginTransaction(const TSessio txId = result.tx_meta().id(); } + obs->SetPeerEndpoint(status.Endpoint); TBeginTransactionResult beginTxResult(TStatus(std::move(status)), TTransaction(session, txId)); + obs->End(beginTxResult.GetStatus()); promise.SetValue(std::move(beginTxResult)); }; @@ -815,9 +860,11 @@ TAsyncCommitTransactionResult TTableClient::TImpl::CommitTransaction(const TSess request.set_tx_id(TStringType{txId}); request.set_collect_stats(GetStatsCollectionMode(settings.CollectQueryStats_)); + auto obs = MakeObservation("Commit"); + auto promise = NewPromise(); - auto extractor = [promise] + auto extractor = [promise, obs] (google::protobuf::Any* any, TPlainStatus status) mutable { std::optional queryStats; if (any) { @@ -829,7 +876,9 @@ TAsyncCommitTransactionResult TTableClient::TImpl::CommitTransaction(const TSess } } + obs->SetPeerEndpoint(status.Endpoint); TCommitTransactionResult commitTxResult(TStatus(std::move(status)), queryStats); + obs->End(commitTxResult.GetStatus()); promise.SetValue(std::move(commitTxResult)); }; @@ -855,11 +904,20 @@ TAsyncStatus TTableClient::TImpl::RollbackTransaction(const TSession& session, c request.set_session_id(TStringType{session.GetId()}); 
request.set_tx_id(TStringType{txId}); - return RunSimple( + auto obs = MakeObservation("Rollback"); + + auto future = RunSimple( std::move(request), &Ydb::Table::V1::TableService::Stub::AsyncRollbackTransaction, rpcSettings ); + + return future.Apply([obs](TAsyncStatus fut) { + auto status = fut.GetValue(); + obs->SetPeerEndpoint(status.GetEndpoint()); + obs->End(status.GetStatus()); + return status; + }); } TAsyncExplainDataQueryResult TTableClient::TImpl::ExplainDataQuery(const TSession& session, const std::string& query, @@ -1100,6 +1158,7 @@ void TTableClient::TImpl::SetStatCollector(const NSdkStats::TStatCollector::TCli ParamsSizeHistogram.Set(collector.ParamsSize); RetryOperationStatCollector = collector.RetryOperationStatCollector; SessionRemovedDueBalancing.Set(collector.SessionRemovedDueBalancing); + OperationStatCollector_ = collector.OperationStatCollector; } TAsyncBulkUpsertResult TTableClient::TImpl::BulkUpsert(const std::string& table, TValue&& rows, const TBulkUpsertSettings& settings) { @@ -1128,10 +1187,14 @@ TAsyncBulkUpsertResult TTableClient::TImpl::BulkUpsert(const std::string& table, *mutable_rows->mutable_type() = rows.GetType().GetProto(); } + auto obs = MakeObservation("BulkUpsert"); + auto promise = NewPromise(); - auto extractor = [promise](google::protobuf::Any* any, TPlainStatus status) mutable { + auto extractor = [promise, obs](google::protobuf::Any* any, TPlainStatus status) mutable { Y_UNUSED(any); + obs->SetPeerEndpoint(status.Endpoint); TBulkUpsertResult val(TStatus(std::move(status))); + obs->End(val.GetStatus()); promise.SetValue(std::move(val)); }; @@ -1174,12 +1237,16 @@ TAsyncBulkUpsertResult TTableClient::TImpl::BulkUpsert(const std::string& table, } request.set_data(TStringType{data}); + auto obs = MakeObservation("BulkUpsert"); + auto promise = NewPromise(); - auto extractor = [promise] + auto extractor = [promise, obs] (google::protobuf::Any* any, TPlainStatus status) mutable { Y_UNUSED(any); + 
obs->SetPeerEndpoint(status.Endpoint); TBulkUpsertResult val(TStatus(std::move(status))); + obs->End(val.GetStatus()); promise.SetValue(std::move(val)); }; diff --git a/src/client/table/impl/table_client.h b/src/client/table/impl/table_client.h index 8fe71287f36..38343f7e8cd 100644 --- a/src/client/table/impl/table_client.h +++ b/src/client/table/impl/table_client.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #undef INCLUDE_YDB_INTERNAL_H @@ -153,6 +154,9 @@ class TTableClient::TImpl: public TClientImplCommon, public void CollectRetryStatAsync(EStatus status); void CollectRetryStatSync(EStatus status); + std::shared_ptr CreateRetryRootSpan(); + std::shared_ptr CreateRetryAttemptSpan(std::uint32_t attempt, std::int64_t backoffMs); + public: TClientSettings Settings_; @@ -237,6 +241,8 @@ class TTableClient::TImpl: public TClientImplCommon, public auto promise = NewPromise(); bool keepInCache = settings.KeepInQueryCache_ && settings.KeepInQueryCache_.value(); + auto obs = MakeObservation("ExecuteDataQuery"); + // We don't want to delay call of TSession dtor, so we can't capture it by copy // otherwise we break session pool and other clients logic. 
// Same problem with TDataQuery and TTransaction @@ -246,7 +252,7 @@ class TTableClient::TImpl: public TClientImplCommon, public // - capture pointer // - call free just before SetValue call auto sessionPtr = new TSession(session); - auto extractor = [promise, sessionPtr, query, fromCache, keepInCache] + auto extractor = [promise, sessionPtr, query, fromCache, keepInCache, obs] (google::protobuf::Any* any, TPlainStatus status) mutable { std::vector res; std::optional tx; @@ -282,9 +288,12 @@ class TTableClient::TImpl: public TClientImplCommon, public sessionPtr->SessionImpl_->AddQueryToCache(*dataQuery); } + obs->SetPeerEndpoint(status.Endpoint); TDataQueryResult dataQueryResult(TStatus(std::move(status)), std::move(res), tx, dataQuery, fromCache, queryStats); + obs->End(dataQueryResult.GetStatus()); + delete sessionPtr; tx.reset(); dataQuery.reset(); @@ -327,9 +336,21 @@ class TTableClient::TImpl: public TClientImplCommon, public NSdkStats::TAtomicCounter<::NMonitoring::TRate> SessionRemovedDueBalancing; private: + std::shared_ptr Tracer_; + NSdkStats::TStatCollector::TClientOperationStatCollector OperationStatCollector_; NSessionPool::TSessionPool SessionPool_; TRequestMigrator RequestMigrator_; static const TKeepAliveSettings KeepAliveSettings; + + std::shared_ptr MakeObservation(const std::string& operationName) { + return std::make_shared( + "Table", + &OperationStatCollector_, + Tracer_, + operationName, + DbDriverState_ + ); + } }; } diff --git a/src/client/table/table.cpp b/src/client/table/table.cpp index 8ed110c0147..c000ceb480f 100644 --- a/src/client/table/table.cpp +++ b/src/client/table/table.cpp @@ -1518,13 +1518,17 @@ TAsyncStatus TTableClient::RetryOperation(TOperationWithoutSessionFunc&& operati } TStatus TTableClient::RetryOperationSync(const TOperationWithoutSessionSyncFunc& operation, const TRetryOperationSettings& settings) { - NRetry::Sync::TRetryWithoutSession ctx(*this, operation, settings); - return ctx.Execute(); + return 
NRetry::Sync::RunSyncRetryWithParentSpan( + Impl_, + NRetry::Sync::TRetryWithoutSession(*this, operation, settings) + ); } TStatus TTableClient::RetryOperationSync(const TOperationSyncFunc& operation, const TRetryOperationSettings& settings) { - NRetry::Sync::TRetryWithSession ctx(*this, operation, settings); - return ctx.Execute(); + return NRetry::Sync::RunSyncRetryWithParentSpan( + Impl_, + NRetry::Sync::TRetryWithSession(*this, operation, settings) + ); } NThreading::TFuture TTableClient::Stop() { diff --git a/src/client/trace/CMakeLists.txt b/src/client/trace/CMakeLists.txt new file mode 100644 index 00000000000..86a8f8d4208 --- /dev/null +++ b/src/client/trace/CMakeLists.txt @@ -0,0 +1,7 @@ +_ydb_sdk_add_library(client-trace) + +target_sources(client-trace PRIVATE + trace.cpp +) + +_ydb_sdk_make_client_component(Trace client-trace) diff --git a/src/client/trace/trace.cpp b/src/client/trace/trace.cpp new file mode 100644 index 00000000000..6bf5bc664f0 --- /dev/null +++ b/src/client/trace/trace.cpp @@ -0,0 +1 @@ +#include diff --git a/tests/common/fake_metric_registry.h b/tests/common/fake_metric_registry.h new file mode 100644 index 00000000000..032234f080f --- /dev/null +++ b/tests/common/fake_metric_registry.h @@ -0,0 +1,135 @@ +#pragma once + +#include + +#include +#include +#include +#include + +namespace NYdb::NTests { + +class TFakeCounter : public NMetrics::ICounter { +public: + void Inc() override { + Count_.fetch_add(1, std::memory_order_relaxed); + } + + int64_t Get() const { + return Count_.load(std::memory_order_relaxed); + } + +private: + std::atomic Count_{0}; +}; + +class TFakeHistogram : public NMetrics::IHistogram { +public: + void Record(double value) override { + std::lock_guard lock(Mutex_); + Values_.push_back(value); + } + + std::vector GetValues() const { + std::lock_guard lock(Mutex_); + return Values_; + } + + size_t Count() const { + std::lock_guard lock(Mutex_); + return Values_.size(); + } + +private: + mutable std::mutex Mutex_; + 
std::vector Values_; +}; + +class TFakeGauge : public NMetrics::IGauge { +public: + void Add(double delta) override { Value_ += delta; } + void Set(double value) override { Value_ = value; } + double Get() const { return Value_; } + +private: + double Value_ = 0.0; +}; + +struct TMetricKey { + std::string Name; + NMetrics::TLabels Labels; + + bool operator==(const TMetricKey& other) const = default; + bool operator<(const TMetricKey& other) const { + if (Name != other.Name) return Name < other.Name; + return Labels < other.Labels; + } +}; + +class TFakeMetricRegistry : public NMetrics::IMetricRegistry { +public: + std::shared_ptr Counter(const std::string& name + , const NMetrics::TLabels& labels + , const std::string& /*description*/ + , const std::string& /*unit*/ + ) override { + std::lock_guard lock(Mutex_); + auto key = TMetricKey{name, labels}; + auto it = Counters_.find(key); + if (it != Counters_.end()) { + return it->second; + } + auto counter = std::make_shared(); + Counters_[key] = counter; + return counter; + } + + std::shared_ptr Gauge(const std::string& name + , const NMetrics::TLabels& labels + , const std::string& /*description*/ + , const std::string& /*unit*/ + ) override { + std::lock_guard lock(Mutex_); + auto key = TMetricKey{name, labels}; + auto gauge = std::make_shared(); + Gauges_[key] = gauge; + return gauge; + } + + std::shared_ptr Histogram(const std::string& name + , const std::vector& /*buckets*/ + , const NMetrics::TLabels& labels + , const std::string& /*description*/ + , const std::string& /*unit*/ + ) override { + std::lock_guard lock(Mutex_); + auto key = TMetricKey{name, labels}; + auto it = Histograms_.find(key); + if (it != Histograms_.end()) { + return it->second; + } + auto histogram = std::make_shared(); + Histograms_[key] = histogram; + return histogram; + } + + std::shared_ptr GetCounter(const std::string& name, const NMetrics::TLabels& labels = {}) const { + std::lock_guard lock(Mutex_); + auto it = 
Counters_.find(TMetricKey{name, labels}); + return it != Counters_.end() ? it->second : nullptr; + } + + std::shared_ptr GetHistogram(const std::string& name, const NMetrics::TLabels& labels = {}) const { + std::lock_guard lock(Mutex_); + auto it = Histograms_.find(TMetricKey{name, labels}); + return it != Histograms_.end() ? it->second : nullptr; + } + +private: + mutable std::mutex Mutex_; + std::map> Counters_; + std::map> Gauges_; + std::map> Histograms_; +}; + +} // namespace NYdb::NTests diff --git a/tests/common/fake_trace_provider.h b/tests/common/fake_trace_provider.h new file mode 100644 index 00000000000..90fd7de8e49 --- /dev/null +++ b/tests/common/fake_trace_provider.h @@ -0,0 +1,156 @@ +#pragma once + +#include + +#include +#include + +namespace NYdb::NTests { + +struct TFakeEvent { + std::string Name; + std::map Attributes; +}; + +class TFakeScope : public NTrace::IScope { +}; + +class TFakeSpan : public NTrace::ISpan { +public: + void End() override { + std::lock_guard lock(Mutex_); + Ended_ = true; + } + + void SetAttribute(const std::string& key, const std::string& value) override { + std::lock_guard lock(Mutex_); + StringAttributes_[key] = value; + } + + void SetAttribute(const std::string& key, int64_t value) override { + std::lock_guard lock(Mutex_); + IntAttributes_[key] = value; + } + + void AddEvent(const std::string& name, const std::map& attributes) override { + std::lock_guard lock(Mutex_); + Events_.push_back({name, attributes}); + } + + std::unique_ptr Activate() override { + std::lock_guard lock(Mutex_); + Activated_ = true; + return std::make_unique(); + } + + bool IsEnded() const { + std::lock_guard lock(Mutex_); + return Ended_; + } + + bool IsActivated() const { + std::lock_guard lock(Mutex_); + return Activated_; + } + + std::string GetStringAttribute(const std::string& key) const { + std::lock_guard lock(Mutex_); + auto it = StringAttributes_.find(key); + return it != StringAttributes_.end() ? 
it->second : ""; + } + + bool HasStringAttribute(const std::string& key) const { + std::lock_guard lock(Mutex_); + return StringAttributes_.contains(key); + } + + int64_t GetIntAttribute(const std::string& key) const { + std::lock_guard lock(Mutex_); + auto it = IntAttributes_.find(key); + return it != IntAttributes_.end() ? it->second : 0; + } + + bool HasIntAttribute(const std::string& key) const { + std::lock_guard lock(Mutex_); + return IntAttributes_.contains(key); + } + + std::vector GetEvents() const { + std::lock_guard lock(Mutex_); + return Events_; + } + +private: + mutable std::mutex Mutex_; + bool Ended_ = false; + bool Activated_ = false; + std::map StringAttributes_; + std::map IntAttributes_; + std::vector Events_; +}; + +class TFakeTracer : public NTrace::ITracer { +public: + std::shared_ptr StartSpan(const std::string& name, NTrace::ESpanKind kind) override { + auto span = std::make_shared(); + std::lock_guard lock(Mutex_); + Spans_.push_back({name, kind, span}); + return span; + } + + struct TSpanRecord { + std::string Name; + NTrace::ESpanKind Kind; + std::shared_ptr Span; + }; + + std::vector GetSpans() const { + std::lock_guard lock(Mutex_); + return Spans_; + } + + std::shared_ptr GetLastSpan() const { + std::lock_guard lock(Mutex_); + return Spans_.empty() ? 
nullptr : Spans_.back().Span; + } + + TSpanRecord GetLastSpanRecord() const { + std::lock_guard lock(Mutex_); + return Spans_.back(); + } + + size_t SpanCount() const { + std::lock_guard lock(Mutex_); + return Spans_.size(); + } + +private: + mutable std::mutex Mutex_; + std::vector Spans_; +}; + +class TFakeTraceProvider : public NTrace::ITraceProvider { +public: + std::shared_ptr GetTracer(const std::string& name) override { + std::lock_guard lock(Mutex_); + auto it = Tracers_.find(name); + if (it != Tracers_.end()) { + return it->second; + } + auto tracer = std::make_shared(); + Tracers_[name] = tracer; + return tracer; + } + + std::shared_ptr GetFakeTracer(const std::string& name) const { + std::lock_guard lock(Mutex_); + auto it = Tracers_.find(name); + return it != Tracers_.end() ? it->second : nullptr; + } + +private: + mutable std::mutex Mutex_; + std::map> Tracers_; +}; + +} // namespace NYdb::NTests diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index d5a1d709245..8aa28839a63 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(auth) add_subdirectory(basic_example) add_subdirectory(bulk_upsert) +add_subdirectory(metrics) add_subdirectory(server_restart) add_subdirectory(sessions) add_subdirectory(sessions_pool) diff --git a/tests/integration/metrics/CMakeLists.txt b/tests/integration/metrics/CMakeLists.txt new file mode 100644 index 00000000000..6c9bb8b3abd --- /dev/null +++ b/tests/integration/metrics/CMakeLists.txt @@ -0,0 +1,12 @@ +add_ydb_test(NAME metrics_it GTEST + INCLUDE_DIRS + ${YDB_SDK_SOURCE_DIR} + SOURCES + main.cpp + LINK_LIBRARIES + yutil + YDB-CPP-SDK::Query + client-metrics + LABELS + integration +) diff --git a/tests/integration/metrics/main.cpp b/tests/integration/metrics/main.cpp new file mode 100644 index 00000000000..7624f8b2b0c --- /dev/null +++ b/tests/integration/metrics/main.cpp @@ -0,0 +1,268 @@ +#include +#include +#include 
+#include + +#include + +#include + +using namespace NYdb; +using namespace NYdb::NQuery; +using namespace NYdb::NTests; + +namespace { + +std::string GetEnvOrEmpty(const char* name) { + const char* value = std::getenv(name); + return value ? std::string(value) : std::string(); +} + +struct TRunArgs { + TDriver Driver; + std::shared_ptr Registry; +}; + +TRunArgs MakeRunArgs() { + std::string endpoint = GetEnvOrEmpty("YDB_ENDPOINT"); + std::string database = GetEnvOrEmpty("YDB_DATABASE"); + + auto registry = std::make_shared(); + + auto driverConfig = TDriverConfig() + .SetEndpoint(endpoint) + .SetDatabase(database) + .SetAuthToken(GetEnvOrEmpty("YDB_TOKEN")) + .SetMetricRegistry(registry); + + TDriver driver(driverConfig); + return {driver, registry}; +} + +std::shared_ptr GetFailedCounter( + const std::shared_ptr& registry, + const std::string& operation, + EStatus status) +{ + const std::string statusName = ToString(status); + return registry->GetCounter("db.client.operation.failed", { + {"db.system.name", "ydb"}, + {"db.namespace", GetEnvOrEmpty("YDB_DATABASE")}, + {"db.operation.name", operation}, + {"ydb.client.api", "Query"}, + {"db.response.status_code", statusName}, + {"error.type", statusName}, + }); +} + +// Label set must match NSdkStats::TClientOperationStatCollector::RecordLatency (stats.h) and +// unit tests in tests/unit/client/observability/metrics_ut.cpp: success has no +// db.response.status_code; errors add status and error.type. 
+std::shared_ptr GetDuration( + const std::shared_ptr& registry, + const std::string& operation, + EStatus status) +{ + NMetrics::TLabels labels = { + {"db.system.name", "ydb"}, + {"db.namespace", GetEnvOrEmpty("YDB_DATABASE")}, + {"db.operation.name", operation}, + {"ydb.client.api", "Query"}, + }; + if (status != EStatus::SUCCESS) { + const std::string statusName = ToString(status); + labels["db.response.status_code"] = statusName; + labels["error.type"] = statusName; + } + return registry->GetHistogram("db.client.operation.duration", labels); +} + +} // namespace + +TEST(QueryMetricsIntegration, ExecuteQuerySuccessRecordsMetrics) { + auto [driver, registry] = MakeRunArgs(); + TQueryClient client(driver); + + auto session = client.GetSession().ExtractValueSync(); + ASSERT_TRUE(session.IsSuccess()) << session.GetIssues().ToString(); + + auto result = session.GetSession().ExecuteQuery( + "SELECT 1;", + TTxControl::BeginTx().CommitTx() + ).ExtractValueSync(); + ASSERT_EQ(result.GetStatus(), EStatus::SUCCESS) << result.GetIssues().ToString(); + + auto duration = GetDuration(registry, "ydb.ExecuteQuery", EStatus::SUCCESS); + ASSERT_NE(duration, nullptr) << "ExecuteQuery duration histogram not created"; + EXPECT_GE(duration->Count(), 1u); + for (double v : duration->GetValues()) { + EXPECT_GE(v, 0.0); + } + + driver.Stop(true); +} + +TEST(QueryMetricsIntegration, ExecuteQueryErrorRecordsErrorMetric) { + auto [driver, registry] = MakeRunArgs(); + TQueryClient client(driver); + + auto session = client.GetSession().ExtractValueSync(); + ASSERT_TRUE(session.IsSuccess()) << session.GetIssues().ToString(); + + auto result = session.GetSession().ExecuteQuery( + "INVALID SQL QUERY !!!", + TTxControl::BeginTx().CommitTx() + ).ExtractValueSync(); + EXPECT_NE(result.GetStatus(), EStatus::SUCCESS); + + auto failed = GetFailedCounter(registry, "ydb.ExecuteQuery", result.GetStatus()); + ASSERT_NE(failed, nullptr); + EXPECT_GE(failed->Get(), 1); + + auto duration = 
GetDuration(registry, "ydb.ExecuteQuery", result.GetStatus()); + ASSERT_NE(duration, nullptr); + EXPECT_GE(duration->Count(), 1u); + + driver.Stop(true); +} + +TEST(QueryMetricsIntegration, CreateSessionRecordsMetrics) { + auto [driver, registry] = MakeRunArgs(); + TQueryClient client(driver); + + auto session = client.GetSession().ExtractValueSync(); + ASSERT_TRUE(session.IsSuccess()) << session.GetIssues().ToString(); + + // Query client uses observation name "CreateSession" -> ydb.CreateSession in metrics. + auto duration = GetDuration(registry, "ydb.CreateSession", EStatus::SUCCESS); + ASSERT_NE(duration, nullptr) << "CreateSession duration histogram not created"; + EXPECT_GE(duration->Count(), 1u); + + driver.Stop(true); +} + +TEST(QueryMetricsIntegration, CommitTransactionRecordsMetrics) { + auto [driver, registry] = MakeRunArgs(); + TQueryClient client(driver); + + auto sessionResult = client.GetSession().ExtractValueSync(); + ASSERT_TRUE(sessionResult.IsSuccess()) << sessionResult.GetIssues().ToString(); + auto session = sessionResult.GetSession(); + + auto beginResult = session.BeginTransaction(TTxSettings::SerializableRW()).ExtractValueSync(); + ASSERT_TRUE(beginResult.IsSuccess()) << beginResult.GetIssues().ToString(); + auto tx = beginResult.GetTransaction(); + + auto execResult = tx.GetSession().ExecuteQuery( + "SELECT 1;", + TTxControl::Tx(tx) + ).ExtractValueSync(); + ASSERT_EQ(execResult.GetStatus(), EStatus::SUCCESS) << execResult.GetIssues().ToString(); + + if (execResult.GetTransaction()) { + auto commitResult = execResult.GetTransaction()->Commit().ExtractValueSync(); + ASSERT_TRUE(commitResult.IsSuccess()) << commitResult.GetIssues().ToString(); + + auto commitDuration = GetDuration(registry, "ydb.Commit", EStatus::SUCCESS); + ASSERT_NE(commitDuration, nullptr); + EXPECT_GE(commitDuration->Count(), 1u); + } + + driver.Stop(true); +} + +TEST(QueryMetricsIntegration, RollbackTransactionRecordsMetrics) { + auto [driver, registry] = MakeRunArgs(); 
+ TQueryClient client(driver); + + auto sessionResult = client.GetSession().ExtractValueSync(); + ASSERT_TRUE(sessionResult.IsSuccess()) << sessionResult.GetIssues().ToString(); + auto session = sessionResult.GetSession(); + + auto beginResult = session.BeginTransaction(TTxSettings::SerializableRW()).ExtractValueSync(); + ASSERT_TRUE(beginResult.IsSuccess()) << beginResult.GetIssues().ToString(); + auto tx = beginResult.GetTransaction(); + + auto rollbackResult = tx.Rollback().ExtractValueSync(); + ASSERT_TRUE(rollbackResult.IsSuccess()) << rollbackResult.GetIssues().ToString(); + + auto rollbackDuration = GetDuration(registry, "ydb.Rollback", EStatus::SUCCESS); + ASSERT_NE(rollbackDuration, nullptr); + EXPECT_GE(rollbackDuration->Count(), 1u); + + driver.Stop(true); +} + +TEST(QueryMetricsIntegration, MultipleQueriesAccumulateMetrics) { + auto [driver, registry] = MakeRunArgs(); + TQueryClient client(driver); + + auto sessionResult = client.GetSession().ExtractValueSync(); + ASSERT_TRUE(sessionResult.IsSuccess()) << sessionResult.GetIssues().ToString(); + auto session = sessionResult.GetSession(); + + const int numQueries = 5; + for (int i = 0; i < numQueries; ++i) { + auto result = session.ExecuteQuery( + "SELECT 1;", + TTxControl::BeginTx().CommitTx() + ).ExtractValueSync(); + ASSERT_EQ(result.GetStatus(), EStatus::SUCCESS) << result.GetIssues().ToString(); + } + + auto duration = GetDuration(registry, "ydb.ExecuteQuery", EStatus::SUCCESS); + ASSERT_NE(duration, nullptr); + EXPECT_EQ(duration->Count(), static_cast(numQueries)); + + driver.Stop(true); +} + +TEST(QueryMetricsIntegration, NoRegistryDoesNotBreakOperations) { + std::string endpoint = GetEnvOrEmpty("YDB_ENDPOINT"); + std::string database = GetEnvOrEmpty("YDB_DATABASE"); + + auto driverConfig = TDriverConfig() + .SetEndpoint(endpoint) + .SetDatabase(database) + .SetAuthToken(GetEnvOrEmpty("YDB_TOKEN")); + + TDriver driver(driverConfig); + TQueryClient client(driver); + + auto session = 
client.GetSession().ExtractValueSync(); + ASSERT_TRUE(session.IsSuccess()) << session.GetIssues().ToString(); + + auto result = session.GetSession().ExecuteQuery( + "SELECT 1;", + TTxControl::BeginTx().CommitTx() + ).ExtractValueSync(); + EXPECT_EQ(result.GetStatus(), EStatus::SUCCESS) << result.GetIssues().ToString(); + + driver.Stop(true); +} + +TEST(QueryMetricsIntegration, DurationValuesAreRealistic) { + auto [driver, registry] = MakeRunArgs(); + TQueryClient client(driver); + + auto sessionResult = client.GetSession().ExtractValueSync(); + ASSERT_TRUE(sessionResult.IsSuccess()) << sessionResult.GetIssues().ToString(); + auto session = sessionResult.GetSession(); + + auto result = session.ExecuteQuery( + "SELECT 1;", + TTxControl::BeginTx().CommitTx() + ).ExtractValueSync(); + ASSERT_EQ(result.GetStatus(), EStatus::SUCCESS) << result.GetIssues().ToString(); + + auto duration = GetDuration(registry, "ydb.ExecuteQuery", EStatus::SUCCESS); + ASSERT_NE(duration, nullptr); + ASSERT_GE(duration->Count(), 1u); + + for (double v : duration->GetValues()) { + EXPECT_GE(v, 0.0) << "Duration must be non-negative"; + EXPECT_LT(v, 30.0) << "Duration > 30s is unrealistic for SELECT 1"; + } + + driver.Stop(true); +} diff --git a/tests/unit/client/CMakeLists.txt b/tests/unit/client/CMakeLists.txt index 03b0a17c386..d59fa263748 100644 --- a/tests/unit/client/CMakeLists.txt +++ b/tests/unit/client/CMakeLists.txt @@ -100,3 +100,44 @@ add_ydb_test(NAME client-ydb_value_ut GTEST LABELS unit ) + +add_ydb_test(NAME client-ydb_metrics_ut GTEST + INCLUDE_DIRS + ${YDB_SDK_SOURCE_DIR} + SOURCES + observability/metrics_ut.cpp + LINK_LIBRARIES + yutil + impl-observability + client-ydb_query-impl + client-ydb_table-impl + client-metrics + LABELS + unit +) + +add_ydb_test(NAME client-ydb_query_spans_ut GTEST + INCLUDE_DIRS + ${YDB_SDK_SOURCE_DIR} + SOURCES + query/query_spans_ut.cpp + LINK_LIBRARIES + yutil + client-ydb_query-impl + client-trace + LABELS + unit +) + +add_ydb_test(NAME 
client-ydb_table_spans_ut GTEST + INCLUDE_DIRS + ${YDB_SDK_SOURCE_DIR} + SOURCES + table/table_spans_ut.cpp + LINK_LIBRARIES + yutil + client-ydb_table-impl + client-trace + LABELS + unit +) diff --git a/tests/unit/client/observability/metrics_ut.cpp b/tests/unit/client/observability/metrics_ut.cpp new file mode 100644 index 00000000000..279691a9a40 --- /dev/null +++ b/tests/unit/client/observability/metrics_ut.cpp @@ -0,0 +1,415 @@ +#include +#include +#include +#include + +#include + +using namespace NYdb; +using namespace NYdb::NObservability; +using namespace NYdb::NMetrics; +using namespace NYdb::NTests; +using namespace NYdb::NSdkStats; + +namespace { + constexpr const char kTestDbNamespace[] = "/Root/testdb"; + + std::string YdbOp(const std::string& op) { + return op.rfind("ydb.", 0) == 0 ? op : "ydb." + op; + } +} // namespace + +// --------------------------------------------------------------------------- +// TRequestMetrics (shared logic) +// --------------------------------------------------------------------------- + +class RequestMetricsTest : public ::testing::Test { +protected: + void SetUp() override { + Registry = std::make_shared(); + OpCollector = TStatCollector::TClientOperationStatCollector( + nullptr, kTestDbNamespace, "", Registry); + } + + std::shared_ptr FailedCounter(const std::string& op, EStatus status) { + const std::string statusName = ToString(status); + return Registry->GetCounter("db.client.operation.failed", { + {"db.system.name", "ydb"}, + {"db.namespace", kTestDbNamespace}, + {"db.operation.name", YdbOp(op)}, + {"ydb.client.api", "Unspecified"}, + {"db.response.status_code", statusName}, + {"error.type", statusName}, + }); + } + + std::shared_ptr DurationHistogram(const std::string& op, EStatus status) { + TLabels labels = { + {"db.system.name", "ydb"}, + {"db.namespace", kTestDbNamespace}, + {"db.operation.name", YdbOp(op)}, + {"ydb.client.api", "Unspecified"}, + }; + if (status != EStatus::SUCCESS) { + 
labels["db.response.status_code"] = ToString(status); + labels["error.type"] = ToString(status); + } + return Registry->GetHistogram("db.client.operation.duration", labels); + } + + TStatCollector::TClientOperationStatCollector OpCollector; + std::shared_ptr Registry; +}; + +TEST_F(RequestMetricsTest, SuccessDoesNotIncrementFailedCounter) { + { + TRequestMetrics metrics(&OpCollector, "DoSomething", TLog()); + metrics.End(EStatus::SUCCESS); + } + + auto failed = FailedCounter("DoSomething", EStatus::UNAVAILABLE); + // Counter might not exist at all if it was never incremented; either state is fine. + if (failed) { + EXPECT_EQ(failed->Get(), 0); + } +} + +TEST_F(RequestMetricsTest, FailureIncrementsFailedCounter) { + { + TRequestMetrics metrics(&OpCollector, "DoSomething", TLog()); + metrics.End(EStatus::UNAVAILABLE); + } + + auto failed = FailedCounter("DoSomething", EStatus::UNAVAILABLE); + ASSERT_NE(failed, nullptr); + EXPECT_EQ(failed->Get(), 1); +} + +TEST_F(RequestMetricsTest, DurationRecordedOnEnd) { + { + TRequestMetrics metrics(&OpCollector, "DoSomething", TLog()); + metrics.End(EStatus::SUCCESS); + } + + auto hist = DurationHistogram("DoSomething", EStatus::SUCCESS); + ASSERT_NE(hist, nullptr); + EXPECT_EQ(hist->Count(), 1u); + EXPECT_GE(hist->GetValues()[0], 0.0); +} + +TEST_F(RequestMetricsTest, DurationIsInSeconds) { + { + TRequestMetrics metrics(&OpCollector, "DoSomething", TLog()); + metrics.End(EStatus::SUCCESS); + } + + auto hist = DurationHistogram("DoSomething", EStatus::SUCCESS); + ASSERT_NE(hist, nullptr); + EXPECT_LT(hist->GetValues()[0], 1.0); +} + +TEST_F(RequestMetricsTest, DoubleEndIsIdempotent) { + TRequestMetrics metrics(&OpCollector, "DoSomething", TLog()); + metrics.End(EStatus::SUCCESS); + metrics.End(EStatus::INTERNAL_ERROR); + + auto failed = FailedCounter("DoSomething", EStatus::INTERNAL_ERROR); + // The second End is a no-op, so no failed counter should have been produced. 
+ if (failed) { + EXPECT_EQ(failed->Get(), 0); + } + + auto hist = DurationHistogram("DoSomething", EStatus::SUCCESS); + ASSERT_NE(hist, nullptr); + EXPECT_EQ(hist->Count(), 1u); +} + +TEST_F(RequestMetricsTest, DestructorCallsEndWithClientInternalError) { + { + TRequestMetrics metrics(&OpCollector, "DoSomething", TLog()); + } + + auto failed = FailedCounter("DoSomething", EStatus::CLIENT_INTERNAL_ERROR); + ASSERT_NE(failed, nullptr); + EXPECT_EQ(failed->Get(), 1); + + auto hist = DurationHistogram("DoSomething", EStatus::CLIENT_INTERNAL_ERROR); + ASSERT_NE(hist, nullptr); + EXPECT_EQ(hist->Count(), 1u); +} + +TEST_F(RequestMetricsTest, NullRegistryDoesNotCrash) { + EXPECT_NO_THROW({ + TStatCollector::TClientOperationStatCollector nullCollector; + TRequestMetrics metrics(&nullCollector, "DoSomething", TLog()); + metrics.End(EStatus::SUCCESS); + }); +} + +TEST_F(RequestMetricsTest, DifferentOperationsHaveSeparateMetrics) { + { + TRequestMetrics m1(&OpCollector, "OpA", TLog()); + m1.End(EStatus::SUCCESS); + } + { + TRequestMetrics m2(&OpCollector, "OpB", TLog()); + m2.End(EStatus::OVERLOADED); + } + + auto failedA = FailedCounter("OpA", EStatus::SUCCESS); + if (failedA) { + EXPECT_EQ(failedA->Get(), 0); + } + auto failedB = FailedCounter("OpB", EStatus::OVERLOADED); + ASSERT_NE(failedB, nullptr); + EXPECT_EQ(failedB->Get(), 1); + EXPECT_EQ(DurationHistogram("OpA", EStatus::SUCCESS)->Count(), 1u); + EXPECT_EQ(DurationHistogram("OpB", EStatus::OVERLOADED)->Count(), 1u); +} + +TEST_F(RequestMetricsTest, MultipleRequestsAccumulate) { + for (int i = 0; i < 5; ++i) { + TRequestMetrics metrics(&OpCollector, "Op", TLog()); + metrics.End(i % 2 == 0 ? 
EStatus::SUCCESS : EStatus::TIMEOUT); + } + + auto failed = FailedCounter("Op", EStatus::TIMEOUT); + ASSERT_NE(failed, nullptr); + EXPECT_EQ(failed->Get(), 2); + EXPECT_EQ(DurationHistogram("Op", EStatus::SUCCESS)->Count(), 3u); + EXPECT_EQ(DurationHistogram("Op", EStatus::TIMEOUT)->Count(), 2u); +} + +TEST_F(RequestMetricsTest, AllErrorStatusesIncrementFailedCounter) { + std::vector errorStatuses = { + EStatus::BAD_REQUEST, + EStatus::UNAUTHORIZED, + EStatus::INTERNAL_ERROR, + EStatus::UNAVAILABLE, + EStatus::OVERLOADED, + EStatus::TIMEOUT, + EStatus::NOT_FOUND, + EStatus::CLIENT_INTERNAL_ERROR, + }; + + for (auto status : errorStatuses) { + TRequestMetrics metrics(&OpCollector, "Op", TLog()); + metrics.End(status); + } + + // Each status contributes its own dedicated counter series. + for (auto status : errorStatuses) { + auto failed = FailedCounter("Op", status); + ASSERT_NE(failed, nullptr); + EXPECT_EQ(failed->Get(), 1) << "status " << ToString(status); + } +} + +TEST_F(RequestMetricsTest, DeprecatedRequestAndErrorCountersAreNotEmitted) { + { + TRequestMetrics metrics(&OpCollector, "Op", TLog()); + metrics.End(EStatus::UNAVAILABLE); + } + + // Deprecated external counters must not be produced anymore. 
+ TLabels baseLabels = { + {"db.system.name", "ydb"}, + {"db.namespace", kTestDbNamespace}, + {"db.operation.name", "ydb.Op"}, + {"ydb.client.api", "Unspecified"}, + }; + EXPECT_EQ(Registry->GetCounter("db.client.operation.requests", baseLabels), nullptr); + EXPECT_EQ(Registry->GetCounter("db.client.operation.errors", baseLabels), nullptr); +} + +TEST(RequestMetricsDbNamespaceTest, DifferentNamespacesAreSeparateMetricSeries) { + auto registry = std::make_shared(); + TStatCollector::TClientOperationStatCollector collectorA(nullptr, "/db/alpha", "", registry); + TStatCollector::TClientOperationStatCollector collectorB(nullptr, "/db/beta", "", registry); + + { + TRequestMetrics m(&collectorA, "GetSession", TLog()); + m.End(EStatus::SUCCESS); + } + { + TRequestMetrics m(&collectorB, "GetSession", TLog()); + m.End(EStatus::SUCCESS); + } + + auto labelsAlpha = NMetrics::TLabels{ + {"db.system.name", "ydb"}, + {"db.namespace", "/db/alpha"}, + {"db.operation.name", "ydb.GetSession"}, + {"ydb.client.api", "Unspecified"}, + }; + auto labelsBeta = NMetrics::TLabels{ + {"db.system.name", "ydb"}, + {"db.namespace", "/db/beta"}, + {"db.operation.name", "ydb.GetSession"}, + {"ydb.client.api", "Unspecified"}, + }; + + auto durAlpha = registry->GetHistogram("db.client.operation.duration", labelsAlpha); + auto durBeta = registry->GetHistogram("db.client.operation.duration", labelsBeta); + ASSERT_NE(durAlpha, nullptr); + ASSERT_NE(durBeta, nullptr); + EXPECT_EQ(durAlpha->Count(), 1u); + EXPECT_EQ(durBeta->Count(), 1u); +} + +TEST(RequestMetricsClientAliasesTest, QueryOperationsUseOtelStandardMetrics) { + auto registry = std::make_shared(); + TStatCollector::TClientOperationStatCollector collector(nullptr, "", "Query", registry); + + NObservability::TRequestMetrics metrics(&collector, "ydb.ExecuteQuery", TLog()); + metrics.End(EStatus::SUCCESS); + + EXPECT_NE( + registry->GetHistogram( + "db.client.operation.duration", + { + {"db.system.name", "ydb"}, + {"db.namespace", ""}, + 
{"db.operation.name", "ydb.ExecuteQuery"}, + {"ydb.client.api", "Query"}, + } + ), + nullptr + ); +} + +TEST(RequestMetricsClientAliasesTest, TableOperationsUseOtelStandardMetrics) { + auto registry = std::make_shared(); + TStatCollector::TClientOperationStatCollector collector(nullptr, "", "Table", registry); + + NObservability::TRequestMetrics metrics(&collector, "ExecuteDataQuery", TLog()); + metrics.End(EStatus::SUCCESS); + + EXPECT_NE( + registry->GetHistogram( + "db.client.operation.duration", + { + {"db.system.name", "ydb"}, + {"db.namespace", ""}, + {"db.operation.name", "ydb.ExecuteDataQuery"}, + {"ydb.client.api", "Table"}, + } + ), + nullptr + ); +} + +// --------------------------------------------------------------------------- +// Session pool / connection metrics +// --------------------------------------------------------------------------- + +namespace { + NMetrics::TLabels PoolLabels(const std::string& database, const std::string& clientType) { + return { + {"db.system.name", "ydb"}, + {"db.namespace", database}, + {"db.client.connection.pool.name", clientType.empty() ? std::string("Unspecified") : clientType}, + {"ydb.client.api", clientType.empty() ? 
std::string("Unspecified") : clientType}, + }; + } +} // namespace + +class ConnectionPoolMetricsTest : public ::testing::Test { +protected: + void SetUp() override { + Registry = std::make_shared(); + Collector = TStatCollector::TSessionPoolStatCollector( + /*activeSessions=*/nullptr, + /*inPoolSessions=*/nullptr, + /*fakeSessions=*/nullptr, + /*waiters=*/nullptr, + Registry, + kTestDbNamespace, + "Query"); + } + + std::shared_ptr Registry; + TStatCollector::TSessionPoolStatCollector Collector; +}; + +TEST_F(ConnectionPoolMetricsTest, CreateTimeRecorded) { + Collector.RecordConnectionCreateTime(0.002); + Collector.RecordConnectionCreateTime(0.100); + + auto hist = Registry->GetHistogram( + "db.client.connection.create_time", + PoolLabels(kTestDbNamespace, "Query")); + ASSERT_NE(hist, nullptr); + EXPECT_EQ(hist->Count(), 2u); + EXPECT_DOUBLE_EQ(hist->GetValues()[0], 0.002); + EXPECT_DOUBLE_EQ(hist->GetValues()[1], 0.100); +} + +TEST_F(ConnectionPoolMetricsTest, TimeoutsIncrement) { + Collector.IncConnectionTimeouts(); + Collector.IncConnectionTimeouts(); + Collector.IncConnectionTimeouts(); + + auto counter = Registry->GetCounter( + "db.client.connection.timeouts", + PoolLabels(kTestDbNamespace, "Query")); + ASSERT_NE(counter, nullptr); + EXPECT_EQ(counter->Get(), 3); +} + +TEST_F(ConnectionPoolMetricsTest, ConnectionCountGauge) { + Collector.UpdateConnectionCount(5); + Collector.UpdateConnectionCount(10); + Collector.UpdateConnectionCount(2); + + // Gauge uses `Set` semantics; no dedicated accessor in the fake, but + // the registration alone should not crash and the series must exist. + auto hist = Registry->GetHistogram( + "db.client.connection.create_time", + PoolLabels(kTestDbNamespace, "Query")); + EXPECT_EQ(hist, nullptr); // was never recorded +} + +TEST_F(ConnectionPoolMetricsTest, PendingRequestsGauge) { + Collector.UpdatePendingRequests(0); + Collector.UpdatePendingRequests(7); + + // No crash. 
Emission of a gauge is exercised; fake registry does not + // expose gauge values but setup must complete cleanly. + SUCCEED(); +} + +TEST(ConnectionPoolMetricsNoRegistryTest, NullRegistryIsSafe) { + TStatCollector::TSessionPoolStatCollector collector; // default ctor => no registry + EXPECT_FALSE(collector.HasExternalRegistry()); + EXPECT_NO_THROW({ + collector.RecordConnectionCreateTime(1.0); + collector.IncConnectionTimeouts(); + collector.UpdateConnectionCount(3); + collector.UpdatePendingRequests(1); + }); +} + +TEST(ConnectionPoolMetricsPoolNameTest, DifferentPoolsHaveSeparateMetrics) { + auto registry = std::make_shared(); + TStatCollector::TSessionPoolStatCollector queryPool( + nullptr, nullptr, nullptr, nullptr, registry, kTestDbNamespace, "Query"); + TStatCollector::TSessionPoolStatCollector tablePool( + nullptr, nullptr, nullptr, nullptr, registry, kTestDbNamespace, "Table"); + + queryPool.IncConnectionTimeouts(); + tablePool.IncConnectionTimeouts(); + tablePool.IncConnectionTimeouts(); + + auto queryCounter = registry->GetCounter( + "db.client.connection.timeouts", + PoolLabels(kTestDbNamespace, "Query")); + auto tableCounter = registry->GetCounter( + "db.client.connection.timeouts", + PoolLabels(kTestDbNamespace, "Table")); + + ASSERT_NE(queryCounter, nullptr); + ASSERT_NE(tableCounter, nullptr); + EXPECT_EQ(queryCounter->Get(), 1); + EXPECT_EQ(tableCounter->Get(), 2); +} diff --git a/tests/unit/client/query/query_spans_ut.cpp b/tests/unit/client/query/query_spans_ut.cpp new file mode 100644 index 00000000000..c8de94d2e5e --- /dev/null +++ b/tests/unit/client/query/query_spans_ut.cpp @@ -0,0 +1,373 @@ +#include +#include + +#include + +#include + +using namespace NYdb; +using namespace NYdb::NTests; + +namespace { + +constexpr const char kTestDbNamespace[] = "/Root/testdb"; + +NYdb::NObservability::TRequestSpan MakeRequestSpan( + std::shared_ptr tracer, + const std::string& operationName, + const std::string& endpoint +) { + return 
NYdb::NObservability::TRequestSpan( + "Query", + std::move(tracer), + operationName, + endpoint, + kTestDbNamespace + ); +} + +} // namespace + +class QuerySpanTest : public ::testing::Test { +protected: + void SetUp() override { + Tracer = std::make_shared(); + } + + std::shared_ptr Tracer; +}; + +TEST_F(QuerySpanTest, SpanNameFormat) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "localhost:2135"); + span.End(EStatus::SUCCESS); + + ASSERT_EQ(Tracer->SpanCount(), 1u); + EXPECT_EQ(Tracer->GetLastSpanRecord().Name, "ydb.ExecuteQuery"); +} + +TEST_F(QuerySpanTest, SpanKindIsClient) { + auto span = MakeRequestSpan(Tracer, "CreateSession", "localhost:2135"); + span.End(EStatus::SUCCESS); + + ASSERT_EQ(Tracer->SpanCount(), 1u); + EXPECT_EQ(Tracer->GetLastSpanRecord().Kind, NTrace::ESpanKind::CLIENT); +} + +TEST_F(QuerySpanTest, DbSystemAttribute) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "localhost:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.system.name"), "ydb"); +} + +TEST_F(QuerySpanTest, DbNamespaceAndClientApi) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "localhost:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.namespace"), kTestDbNamespace); + EXPECT_EQ(fakeSpan->GetStringAttribute("ydb.client.api"), "Query"); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.operation.name"), "ydb.ExecuteQuery"); +} + +TEST_F(QuerySpanTest, ServerAddressAndPort) { + auto span = MakeRequestSpan(Tracer, "Commit", "ydb.server:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "ydb.server"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2135); +} + +TEST_F(QuerySpanTest, ServerAddressCustomPort) { + auto 
span = MakeRequestSpan(Tracer, "Rollback", "myhost:9090"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "myhost"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 9090); +} + +TEST_F(QuerySpanTest, ServerAddressNoPortDefaultsTo2135) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "myhost"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "myhost"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2135); +} + +TEST_F(QuerySpanTest, IPv6EndpointParsing) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "[::1]:2136"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "::1"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2136); +} + +TEST_F(QuerySpanTest, IPv6EndpointNoPort) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "[fe80::1]"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "fe80::1"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2135); +} + +TEST_F(QuerySpanTest, PeerEndpointAttributes) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "discovery.ydb:2135"); + span.SetPeerEndpoint("10.0.0.1:2136"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("network.peer.address"), "10.0.0.1"); + EXPECT_EQ(fakeSpan->GetIntAttribute("network.peer.port"), 2136); +} + +TEST_F(QuerySpanTest, SuccessStatusDoesNotSetErrorAttrs) { + auto span = MakeRequestSpan(Tracer, "Commit", "localhost:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = 
Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_FALSE(fakeSpan->HasStringAttribute("db.response.status_code")); + EXPECT_FALSE(fakeSpan->HasStringAttribute("error.type")); +} + +TEST_F(QuerySpanTest, ErrorStatusSetsErrorType) { + auto span = MakeRequestSpan(Tracer, "Rollback", "localhost:2135"); + span.End(EStatus::UNAVAILABLE); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.response.status_code"), "UNAVAILABLE"); + EXPECT_TRUE(fakeSpan->HasStringAttribute("error.type")); + EXPECT_FALSE(fakeSpan->GetStringAttribute("error.type").empty()); +} + +TEST_F(QuerySpanTest, SpanIsEndedAfterEnd) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "localhost:2135"); + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + + EXPECT_FALSE(fakeSpan->IsEnded()); + span.End(EStatus::SUCCESS); + EXPECT_TRUE(fakeSpan->IsEnded()); +} + +TEST_F(QuerySpanTest, NullTracerDoesNotCrash) { + EXPECT_NO_THROW({ + NYdb::NObservability::TRequestSpan span( + "Query", + nullptr, + "ExecuteQuery", + "localhost:2135", + kTestDbNamespace + ); + span.SetPeerEndpoint("10.0.0.1:2136"); + span.AddEvent("retry", {{"attempt", "1"}}); + span.End(EStatus::SUCCESS); + }); +} + +TEST_F(QuerySpanTest, DestructorEndsSpan) { + auto fakeSpan = [&]() -> std::shared_ptr { + auto span = MakeRequestSpan(Tracer, "CreateSession", "localhost:2135"); + return Tracer->GetLastSpan(); + }(); + + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->IsEnded()); +} + +TEST_F(QuerySpanTest, ExplicitEndThenDestructorDoesNotDoubleEnd) { + auto fakeSpan = [&]() -> std::shared_ptr { + auto span = MakeRequestSpan(Tracer, "Commit", "localhost:2135"); + span.End(EStatus::SUCCESS); + return Tracer->GetLastSpan(); + }(); + + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->IsEnded()); +} + +TEST_F(QuerySpanTest, AddEventForwarded) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "localhost:2135"); + 
span.AddEvent("retry", {{"ydb.attempt", "2"}, {"error.type", "UNAVAILABLE"}}); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + auto events = fakeSpan->GetEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].Name, "retry"); + EXPECT_EQ(events[0].Attributes.at("ydb.attempt"), "2"); + EXPECT_EQ(events[0].Attributes.at("error.type"), "UNAVAILABLE"); +} + +TEST_F(QuerySpanTest, EmptyPeerEndpointIgnored) { + auto span = MakeRequestSpan(Tracer, "CreateSession", "localhost:2135"); + span.SetPeerEndpoint(""); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_FALSE(fakeSpan->HasStringAttribute("network.peer.address")); + EXPECT_FALSE(fakeSpan->HasIntAttribute("network.peer.port")); +} + +TEST_F(QuerySpanTest, AllFourOperationNames) { + const std::vector operations = {"CreateSession", "ExecuteQuery", "Commit", "Rollback"}; + + for (const auto& op : operations) { + auto span = MakeRequestSpan(Tracer, op, "localhost:2135"); + span.End(EStatus::SUCCESS); + } + + auto spans = Tracer->GetSpans(); + ASSERT_EQ(spans.size(), 4u); + EXPECT_EQ(spans[0].Name, "ydb.CreateSession"); + EXPECT_EQ(spans[1].Name, "ydb.ExecuteQuery"); + EXPECT_EQ(spans[2].Name, "ydb.Commit"); + EXPECT_EQ(spans[3].Name, "ydb.Rollback"); + + for (const auto& record : spans) { + EXPECT_EQ(record.Kind, NTrace::ESpanKind::CLIENT); + } +} + +TEST_F(QuerySpanTest, MultipleErrorStatuses) { + std::vector errorStatuses = { + EStatus::BAD_REQUEST, + EStatus::UNAUTHORIZED, + EStatus::INTERNAL_ERROR, + EStatus::UNAVAILABLE, + EStatus::OVERLOADED, + EStatus::TIMEOUT, + EStatus::NOT_FOUND, + EStatus::CLIENT_INTERNAL_ERROR, + }; + + for (auto status : errorStatuses) { + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", "localhost:2135"); + span.End(status); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->HasStringAttribute("error.type")) + << 
"error.type missing for status " << static_cast(status); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.response.status_code"), + fakeSpan->GetStringAttribute("error.type")); + } +} + +TEST_F(QuerySpanTest, EmptyEndpointDoesNotCrash) { + EXPECT_NO_THROW({ + auto span = MakeRequestSpan(Tracer, "ExecuteQuery", ""); + span.End(EStatus::SUCCESS); + }); +} + +TEST_F(QuerySpanTest, ActivateReturnsScope) { + auto span = MakeRequestSpan(Tracer, "RetryQuery", "localhost:2135"); + auto scope = span.Activate(); + EXPECT_NE(scope, nullptr); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->IsActivated()); + + span.End(EStatus::SUCCESS); +} + +TEST_F(QuerySpanTest, ActivateNullTracerReturnsNull) { + NYdb::NObservability::TRequestSpan span( + "Query", + nullptr, + "RetryQuery", + "localhost:2135", + kTestDbNamespace + ); + auto scope = span.Activate(); + EXPECT_EQ(scope, nullptr); +} + +TEST_F(QuerySpanTest, InternalSpanKindIsPropagated) { + NYdb::NObservability::TRequestSpan span( + "Query", + Tracer, + "ydb.RunWithRetry", + "localhost:2135", + kTestDbNamespace, + TLog(), + NTrace::ESpanKind::INTERNAL + ); + span.End(EStatus::SUCCESS); + + ASSERT_EQ(Tracer->SpanCount(), 1u); + EXPECT_EQ(Tracer->GetLastSpanRecord().Name, "ydb.RunWithRetry"); + EXPECT_EQ(Tracer->GetLastSpanRecord().Kind, NTrace::ESpanKind::INTERNAL); +} + +TEST_F(QuerySpanTest, ErrorStatusAddsExceptionEvent) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteQuery", "localhost:2135"); + span.End(EStatus::UNAVAILABLE); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + auto events = fakeSpan->GetEvents(); + ASSERT_FALSE(events.empty()); + + bool found = false; + for (const auto& event : events) { + if (event.Name == "exception") { + found = true; + EXPECT_EQ(event.Attributes.at("exception.type"), "UNAVAILABLE"); + EXPECT_EQ(event.Attributes.at("exception.message"), "UNAVAILABLE"); + } + } + EXPECT_TRUE(found) << "expected an 'exception' 
event on a failed span"; +} + +TEST_F(QuerySpanTest, SuccessStatusNoExceptionEvent) { + auto span = MakeRequestSpan(Tracer, "ydb.Commit", "localhost:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + for (const auto& event : fakeSpan->GetEvents()) { + EXPECT_NE(event.Name, "exception"); + } +} + +TEST_F(QuerySpanTest, RecordExceptionEmitsEvent) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteQuery", "localhost:2135"); + span.RecordException("TimeoutException", "query timed out"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + auto events = fakeSpan->GetEvents(); + ASSERT_FALSE(events.empty()); + bool found = false; + for (const auto& event : events) { + if (event.Name == "exception" + && event.Attributes.count("exception.type") + && event.Attributes.at("exception.type") == "TimeoutException") + { + found = true; + EXPECT_EQ(event.Attributes.at("exception.message"), "query timed out"); + } + } + EXPECT_TRUE(found); +} diff --git a/tests/unit/client/table/table_spans_ut.cpp b/tests/unit/client/table/table_spans_ut.cpp new file mode 100644 index 00000000000..7af5700d90c --- /dev/null +++ b/tests/unit/client/table/table_spans_ut.cpp @@ -0,0 +1,380 @@ +#include +#include + +#include + +#include + +using namespace NYdb; +using namespace NYdb::NTests; + +namespace { + +constexpr const char kTestDbNamespace[] = "/Root/testdb"; + +NYdb::NObservability::TRequestSpan MakeRequestSpan( + std::shared_ptr tracer, + const std::string& operationName, + const std::string& endpoint +) { + return NYdb::NObservability::TRequestSpan( + "Table", + std::move(tracer), + operationName, + endpoint, + kTestDbNamespace + ); +} + +} // namespace + +class TableSpanTest : public ::testing::Test { +protected: + void SetUp() override { + Tracer = std::make_shared(); + } + + std::shared_ptr Tracer; +}; + +TEST_F(TableSpanTest, SpanNameFormat) { + auto span = 
MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + span.End(EStatus::SUCCESS); + + ASSERT_EQ(Tracer->SpanCount(), 1u); + EXPECT_EQ(Tracer->GetLastSpanRecord().Name, "ydb.ExecuteDataQuery"); +} + +TEST_F(TableSpanTest, SpanKindIsClient) { + auto span = MakeRequestSpan(Tracer, "ydb.CreateSession", "localhost:2135"); + span.End(EStatus::SUCCESS); + + ASSERT_EQ(Tracer->SpanCount(), 1u); + EXPECT_EQ(Tracer->GetLastSpanRecord().Kind, NTrace::ESpanKind::CLIENT); +} + +TEST_F(TableSpanTest, DbSystemAttribute) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.system.name"), "ydb"); +} + +TEST_F(TableSpanTest, DbNamespaceAndClientApi) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.namespace"), kTestDbNamespace); + EXPECT_EQ(fakeSpan->GetStringAttribute("ydb.client.api"), "Table"); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.operation.name"), "ydb.ExecuteDataQuery"); +} + +TEST_F(TableSpanTest, ServerAddressAndPort) { + auto span = MakeRequestSpan(Tracer, "ydb.Commit", "ydb.server:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "ydb.server"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2135); +} + +TEST_F(TableSpanTest, ServerAddressCustomPort) { + auto span = MakeRequestSpan(Tracer, "ydb.Rollback", "myhost:9090"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "myhost"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 9090); +} + 
+TEST_F(TableSpanTest, ServerAddressNoPortDefaultsTo2135) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "myhost"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "myhost"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2135); +} + +TEST_F(TableSpanTest, IPv6EndpointParsing) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "[::1]:2136"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "::1"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2136); +} + +TEST_F(TableSpanTest, IPv6EndpointNoPort) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "[fe80::1]"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("server.address"), "fe80::1"); + EXPECT_EQ(fakeSpan->GetIntAttribute("server.port"), 2135); +} + +TEST_F(TableSpanTest, PeerEndpointAttributes) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "discovery.ydb:2135"); + span.SetPeerEndpoint("10.0.0.1:2136"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("network.peer.address"), "10.0.0.1"); + EXPECT_EQ(fakeSpan->GetIntAttribute("network.peer.port"), 2136); +} + +TEST_F(TableSpanTest, SuccessStatusDoesNotSetErrorAttrs) { + auto span = MakeRequestSpan(Tracer, "ydb.Commit", "localhost:2135"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_FALSE(fakeSpan->HasStringAttribute("db.response.status_code")); + EXPECT_FALSE(fakeSpan->HasStringAttribute("error.type")); +} + +TEST_F(TableSpanTest, ErrorStatusSetsErrorType) { + auto span = MakeRequestSpan(Tracer, 
"ydb.Rollback", "localhost:2135"); + span.End(EStatus::UNAVAILABLE); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_EQ(fakeSpan->GetStringAttribute("db.response.status_code"), "UNAVAILABLE"); + EXPECT_TRUE(fakeSpan->HasStringAttribute("error.type")); + EXPECT_FALSE(fakeSpan->GetStringAttribute("error.type").empty()); +} + +TEST_F(TableSpanTest, SpanIsEndedAfterEnd) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + + EXPECT_FALSE(fakeSpan->IsEnded()); + span.End(EStatus::SUCCESS); + EXPECT_TRUE(fakeSpan->IsEnded()); +} + +TEST_F(TableSpanTest, NullTracerDoesNotCrash) { + EXPECT_NO_THROW({ + NYdb::NObservability::TRequestSpan span( + "Table", + nullptr, + "ydb.ExecuteDataQuery", + "localhost:2135", + kTestDbNamespace + ); + span.SetPeerEndpoint("10.0.0.1:2136"); + span.AddEvent("retry", {{"attempt", "1"}}); + span.End(EStatus::SUCCESS); + }); +} + +TEST_F(TableSpanTest, DestructorEndsSpan) { + auto fakeSpan = [&]() -> std::shared_ptr { + auto span = MakeRequestSpan(Tracer, "ydb.CreateSession", "localhost:2135"); + return Tracer->GetLastSpan(); + }(); + + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->IsEnded()); +} + +TEST_F(TableSpanTest, ExplicitEndThenDestructorDoesNotDoubleEnd) { + auto fakeSpan = [&]() -> std::shared_ptr { + auto span = MakeRequestSpan(Tracer, "ydb.Commit", "localhost:2135"); + span.End(EStatus::SUCCESS); + return Tracer->GetLastSpan(); + }(); + + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->IsEnded()); +} + +TEST_F(TableSpanTest, AddEventForwarded) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + span.AddEvent("retry", {{"ydb.attempt", "2"}, {"error.type", "UNAVAILABLE"}}); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + auto events = fakeSpan->GetEvents(); + ASSERT_EQ(events.size(), 1u); + 
EXPECT_EQ(events[0].Name, "retry"); + EXPECT_EQ(events[0].Attributes.at("ydb.attempt"), "2"); + EXPECT_EQ(events[0].Attributes.at("error.type"), "UNAVAILABLE"); +} + +TEST_F(TableSpanTest, EmptyPeerEndpointIgnored) { + auto span = MakeRequestSpan(Tracer, "ydb.CreateSession", "localhost:2135"); + span.SetPeerEndpoint(""); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_FALSE(fakeSpan->HasStringAttribute("network.peer.address")); + EXPECT_FALSE(fakeSpan->HasIntAttribute("network.peer.port")); +} + +TEST_F(TableSpanTest, RepresentativeOperationNames) { + const std::vector operations = { + "ydb.CreateSession", + "ydb.ExecuteDataQuery", + "ydb.Commit", + "ydb.Rollback", + }; + + for (const auto& op : operations) { + auto span = MakeRequestSpan(Tracer, op, "localhost:2135"); + span.End(EStatus::SUCCESS); + } + + auto spans = Tracer->GetSpans(); + ASSERT_EQ(spans.size(), 4u); + EXPECT_EQ(spans[0].Name, "ydb.CreateSession"); + EXPECT_EQ(spans[1].Name, "ydb.ExecuteDataQuery"); + EXPECT_EQ(spans[2].Name, "ydb.Commit"); + EXPECT_EQ(spans[3].Name, "ydb.Rollback"); + + for (const auto& record : spans) { + EXPECT_EQ(record.Kind, NTrace::ESpanKind::CLIENT); + } +} + +TEST_F(TableSpanTest, MultipleErrorStatuses) { + std::vector errorStatuses = { + EStatus::BAD_REQUEST, + EStatus::UNAUTHORIZED, + EStatus::INTERNAL_ERROR, + EStatus::UNAVAILABLE, + EStatus::OVERLOADED, + EStatus::TIMEOUT, + EStatus::NOT_FOUND, + EStatus::CLIENT_INTERNAL_ERROR, + }; + + for (auto status : errorStatuses) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + span.End(status); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->HasStringAttribute("error.type")) + << "error.type missing for status " << static_cast(status); + EXPECT_EQ( + fakeSpan->GetStringAttribute("db.response.status_code"), + fakeSpan->GetStringAttribute("error.type") + ); + } +} + 
+TEST_F(TableSpanTest, EmptyEndpointDoesNotCrash) { + EXPECT_NO_THROW({ + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", ""); + span.End(EStatus::SUCCESS); + }); +} + +TEST_F(TableSpanTest, ActivateReturnsScope) { + auto span = MakeRequestSpan(Tracer, "ydb.RunWithRetry", "localhost:2135"); + auto scope = span.Activate(); + EXPECT_NE(scope, nullptr); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + EXPECT_TRUE(fakeSpan->IsActivated()); + + span.End(EStatus::SUCCESS); +} + +TEST_F(TableSpanTest, ActivateNullTracerReturnsNull) { + NYdb::NObservability::TRequestSpan span( + "Table", + nullptr, + "ydb.RunWithRetry", + "localhost:2135", + kTestDbNamespace + ); + auto scope = span.Activate(); + EXPECT_EQ(scope, nullptr); +} + +TEST_F(TableSpanTest, InternalSpanKindIsPropagated) { + NYdb::NObservability::TRequestSpan span( + "Table", + Tracer, + "ydb.RunWithRetry", + "localhost:2135", + kTestDbNamespace, + TLog(), + NTrace::ESpanKind::INTERNAL + ); + span.End(EStatus::SUCCESS); + + ASSERT_EQ(Tracer->SpanCount(), 1u); + EXPECT_EQ(Tracer->GetLastSpanRecord().Name, "ydb.RunWithRetry"); + EXPECT_EQ(Tracer->GetLastSpanRecord().Kind, NTrace::ESpanKind::INTERNAL); +} + +TEST_F(TableSpanTest, ErrorStatusAddsExceptionEvent) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + span.End(EStatus::UNAVAILABLE); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + auto events = fakeSpan->GetEvents(); + ASSERT_FALSE(events.empty()); + + bool found = false; + for (const auto& event : events) { + if (event.Name == "exception") { + found = true; + EXPECT_EQ(event.Attributes.at("exception.type"), "UNAVAILABLE"); + EXPECT_EQ(event.Attributes.at("exception.message"), "UNAVAILABLE"); + } + } + EXPECT_TRUE(found) << "expected an 'exception' event on a failed span"; +} + +TEST_F(TableSpanTest, SuccessStatusNoExceptionEvent) { + auto span = MakeRequestSpan(Tracer, "ydb.Commit", "localhost:2135"); + 
span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + for (const auto& event : fakeSpan->GetEvents()) { + EXPECT_NE(event.Name, "exception"); + } +} + +TEST_F(TableSpanTest, RecordExceptionEmitsEvent) { + auto span = MakeRequestSpan(Tracer, "ydb.ExecuteDataQuery", "localhost:2135"); + span.RecordException("TimeoutException", "data query timed out"); + span.End(EStatus::SUCCESS); + + auto fakeSpan = Tracer->GetLastSpan(); + ASSERT_NE(fakeSpan, nullptr); + auto events = fakeSpan->GetEvents(); + ASSERT_FALSE(events.empty()); + bool found = false; + for (const auto& event : events) { + if (event.Name == "exception" + && event.Attributes.count("exception.type") + && event.Attributes.at("exception.type") == "TimeoutException") + { + found = true; + EXPECT_EQ(event.Attributes.at("exception.message"), "data query timed out"); + } + } + EXPECT_TRUE(found); +}