diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2caad38..b6939c3 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -34,6 +34,7 @@ make docs-gen # regenerate AI docs from source - Pod builder is a pure function in internal/podbuilder/ (no k8s client) - Pacing logic lives exclusively in internal/pacing/ - Don't manually edit generated files — run make docs-gen +- Documentation must never contain unverified information — verify all examples against a real cluster before merging ## Testing Patterns diff --git a/api/v1alpha1/discoverypolicy_types.go b/api/v1alpha1/discoverypolicy_types.go index 9cdf22e..f90bf90 100644 --- a/api/v1alpha1/discoverypolicy_types.go +++ b/api/v1alpha1/discoverypolicy_types.go @@ -53,6 +53,27 @@ type DiscoverySource struct { SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"` } +// AggregationMethod defines how range query values are aggregated into a score. +// +kubebuilder:validation:Enum=sum;count;avg;max +type AggregationMethod string + +const ( + // AggregationSum adds all data-point values over the lookback window. + // Use when the query returns a gauge/counter and the total magnitude matters + // (e.g., total memory usage across the window). + AggregationSum AggregationMethod = "sum" + // AggregationCount counts the number of data points returned over the lookback window. + // Use when you want to rank by how frequently an image appears + // (e.g., number of sample intervals where the image was running). + AggregationCount AggregationMethod = "count" + // AggregationAvg computes the arithmetic mean of all data-point values. + // Use when you want the average magnitude regardless of how many samples exist. + AggregationAvg AggregationMethod = "avg" + // AggregationMax takes the highest single data-point value. + // Use when peak usage is more relevant than cumulative usage. 
+ AggregationMax AggregationMethod = "max" +) + // PrometheusSource defines Prometheus query configuration for image discovery. type PrometheusSource struct { // Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). @@ -66,13 +87,20 @@ type PrometheusSource struct { // +kubebuilder:validation:MinLength=1 Query string `json:"query"` // Lookback is the time window for aggregation. When set, the operator uses query_range - // (start=now-lookback, end=now) and sums all returned values per image to produce a score. + // (start=now-lookback, end=now) and aggregates all returned values per image to produce a score. + // The aggregation function is controlled by the aggregationMethod field. // When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. // Example: "168h" (7 days), "24h", "72h" // +optional Lookback *metav1.Duration `json:"lookback,omitempty"` + // AggregationMethod controls how data points from a range query are combined into a single score. + // Only used when lookback is set. Ignored for instant queries. + // Default: "sum". Options: "sum", "count", "avg", "max" + // +kubebuilder:default="sum" + // +optional + AggregationMethod AggregationMethod `json:"aggregationMethod,omitempty"` // Step is the resolution step for range queries (only used when lookback is set). - // Smaller steps = more data points = more accurate sums but higher Prometheus load. + // Smaller steps = more data points = more accurate aggregation but higher Prometheus load. // Default: "5m". 
Example: "1m", "15m" // +kubebuilder:default="5m" // +optional diff --git a/config/crd/bases/drop.corewire.io_discoverypolicies.yaml b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml index d85dab4..10bb7a3 100644 --- a/config/crd/bases/drop.corewire.io_discoverypolicies.yaml +++ b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml @@ -86,6 +86,18 @@ spec: prometheus: description: Prometheus contains the configuration when type=prometheus. properties: + aggregationMethod: + default: sum + description: |- + AggregationMethod controls how data points from a range query are combined into a single score. + Only used when lookback is set. Ignored for instant queries. + Default: "sum". Options: "sum", "count", "avg", "max" + enum: + - sum + - count + - avg + - max + type: string endpoint: description: |- Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). @@ -95,7 +107,8 @@ spec: lookback: description: |- Lookback is the time window for aggregation. When set, the operator uses query_range - (start=now-lookback, end=now) and sums all returned values per image to produce a score. + (start=now-lookback, end=now) and aggregates all returned values per image to produce a score. + The aggregation function is controlled by the aggregationMethod field. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" type: string @@ -111,7 +124,7 @@ spec: default: 5m description: |- Step is the resolution step for range queries (only used when lookback is set). - Smaller steps = more data points = more accurate sums but higher Prometheus load. + Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: "5m". 
Example: "1m", "15m" type: string required: diff --git a/docs/content/docs/discovery.md b/docs/content/docs/discovery.md index 28e3aa7..7bef5f7 100644 --- a/docs/content/docs/discovery.md +++ b/docs/content/docs/discovery.md @@ -66,7 +66,14 @@ count(container_memory_working_set_bytes{ Hand-maintained image lists do not keep up in environments where automation (for example Renovate) ships new image versions every day. A practical pattern is to rank images by observed CI usage over a rolling window. -The `lookback` field tells Drop to use Prometheus `query_range` API over that time window and sum all returned values per image to produce a total usage score: +The `lookback` field tells Drop to use Prometheus `query_range` API over that time window. The `aggregationMethod` field controls how the returned data points are combined into a single score per image: + +| Method | Behavior | Use when | +|--------|----------|----------| +| `sum` (default) | Adds all data-point values over the window | Total cumulative usage matters (e.g. total memory consumed) | +| `count` | Counts the number of data points returned | You want to rank by how frequently an image appears | +| `avg` | Arithmetic mean of all data-point values | Average magnitude matters regardless of sample count | +| `max` | Highest single data-point value | Peak usage is more relevant than cumulative | ```yaml apiVersion: drop.corewire.io/v1alpha1 @@ -82,6 +89,7 @@ spec: endpoint: https://mimir.example.com lookback: 168h # 7 days step: 5m + aggregationMethod: sum # default — rank by total usage over 7 days query: | count( container_memory_working_set_bytes{ @@ -95,7 +103,8 @@ Use this when you want DiscoveryPolicy to continuously follow what your GitLab r #### Field-by-field explanation -- `lookback: 168h` — Drop uses `query_range` with start=now-7d, end=now, and sums all returned values per image to rank by total usage over the window. 
+- `lookback: 168h` — Drop uses `query_range` with start=now-7d, end=now, and aggregates all returned values per image using the chosen `aggregationMethod` (default: `sum`). +- `aggregationMethod: sum` — sums all data-point values to rank by total usage. Use `count` to rank by number of appearances, `avg` for average magnitude, or `max` for peak value. - `step: 5m` — resolution step for the range query (controls how many data points Prometheus returns). - `count(...) by (image)` — counts the number of running containers per image to rank by popularity. - `container_memory_working_set_bytes{...}` — source metric used to observe running containers. @@ -108,7 +117,12 @@ Use this when you want DiscoveryPolicy to continuously follow what your GitLab r For each unique `image` label, Drop uses the Prometheus query result value as the score. -When `lookback` is not set (the default), Drop sends an instant query (`/api/v1/query`) and uses the returned value directly. When `lookback` is set (e.g. `lookback: 168h`), Drop uses a range query (`/api/v1/query_range`) over that window and **sums all returned values** to produce the score. This means images that appear more frequently over the window get a higher score. +When `lookback` is not set (the default), Drop sends an instant query (`/api/v1/query`) and uses the returned value directly. When `lookback` is set (e.g. 
`lookback: 168h`), Drop uses a range query (`/api/v1/query_range`) over that window and aggregates data points using the `aggregationMethod`: + +- `sum` (default): adds all data-point values — images with higher cumulative usage score higher +- `count`: counts the number of data points — images that appear more frequently score higher +- `avg`: averages data-point values — images with higher average value score higher +- `max`: takes the peak value — images with the highest single observation score higher The example above uses `lookback: 168h` so Drop handles the 7-day windowing via the API — no need to embed `[7d]` in PromQL. @@ -156,6 +170,7 @@ spec: - type: prometheus prometheus: endpoint: https://mimir.example.com + aggregationMethod: count # rank by number of appearances query: | count(container_memory_working_set_bytes{ container!="", container!="POD", diff --git a/docs/content/docs/reference/_generated_crds.md b/docs/content/docs/reference/_generated_crds.md index 3779600..8b76b32 100644 --- a/docs/content/docs/reference/_generated_crds.md +++ b/docs/content/docs/reference/_generated_crds.md @@ -207,8 +207,9 @@ PrometheusSource defines Prometheus query configuration for image discovery. |-------|------|----------|---------|-------------| | `endpoint` | `string` | Yes | — | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" | | `query` | `string` | Yes | — | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) | -| `lookback` | `*metav1.Duration` | No | — | Lookback is the time window for aggregation. 
When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | -| `step` | `string` | No | 5m | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" | +| `lookback` | `*metav1.Duration` | No | — | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and aggregates all returned values per image to produce a score. The aggregation function is controlled by the aggregationMethod field. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | +| `aggregationMethod` | `AggregationMethod` | No | sum | AggregationMethod controls how data points from a range query are combined into a single score. Only used when lookback is set. Ignored for instant queries. Default: "sum". Options: "sum", "count", "avg", "max" | +| `step` | `string` | No | 5m | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: "5m". 
Example: "1m", "15m" | ### RegistrySource diff --git a/docs/go.mod b/docs/go.mod index cc0eced..a8b9b26 100644 --- a/docs/go.mod +++ b/docs/go.mod @@ -1,5 +1,3 @@ module github.com/corewire/drop/docs go 1.26.0 - -require github.com/imfing/hextra v0.12.3 // indirect diff --git a/docs/go.sum b/docs/go.sum index afa8680..e69de29 100644 --- a/docs/go.sum +++ b/docs/go.sum @@ -1,2 +0,0 @@ -github.com/imfing/hextra v0.12.3 h1:DZHY2rUWYteyzjlHi9r4n7Bb5e2Q+6LXe4C1Dqn0ZjM= -github.com/imfing/hextra v0.12.3/go.mod h1:vi+yhpq8YPp/aghvJlNKVnJKcPJ/VyAEcfC1BSV9ARo= diff --git a/docs/static/llms-full.txt b/docs/static/llms-full.txt index 50e6d8c..d83aa4c 100644 --- a/docs/static/llms-full.txt +++ b/docs/static/llms-full.txt @@ -181,8 +181,9 @@ PrometheusSource defines Prometheus query configuration for image discovery. |-------|------|------|----------|---------|-------------| | Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" | | Query | `query` | `string` | ✓ | | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) | -| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | -| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). 
Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" | +| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and aggregates all returned values per image to produce a score. The aggregation function is controlled by the aggregationMethod field. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | +| AggregationMethod | `aggregationMethod` | `AggregationMethod` | — | `sum` | AggregationMethod controls how data points from a range query are combined into a single score. Only used when lookback is set. Ignored for instant queries. Default: "sum". Options: "sum", "count", "avg", "max" | +| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: "5m". 
Example: "1m", "15m" | ### RegistrySource @@ -332,6 +333,7 @@ spec: query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 10 --- diff --git a/go.sum b/go.sum index 06ca73e..760283c 100644 --- a/go.sum +++ b/go.sum @@ -66,8 +66,6 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 h1:EwtI+Al+DeppwYX2oXJCETMO23COyaKGP6fHVpkpWpg= github.com/google/pprof v0.0.0-20260402051712-545e8a4df936/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -107,14 +105,8 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.27.4 h1:fcEcQW/A++6aZAZQNUmNjvA9PSOzefMJBerHJ4t8v8Y= -github.com/onsi/ginkgo/v2 v2.27.4/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/ginkgo/v2 v2.29.0 h1:rfh+ZFjgJhYWRoIqVf3Uwx/W20yLrcrE2h2GmYVRaag= github.com/onsi/ginkgo/v2 
v2.29.0/go.mod h1:+aXOY+vzZ5mu2iI2HpTZUPmM//oQfsNFX6gU9kNcA44= -github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q= -github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= -github.com/onsi/gomega v1.40.0 h1:Vtol0e1MghCD2ZVIilPDIg44XSL9l2QAn8ZNaljWcJc= -github.com/onsi/gomega v1.40.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A= github.com/onsi/gomega v1.41.0 h1:OwKp4pXNgVxf6sCplzYo794OFNuoL2q2SBMU5NSWOjA= github.com/onsi/gomega v1.41.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -192,36 +184,22 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU= -golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= -golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= -golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= -golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sync v0.20.0 
h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= -golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= -golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= -golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= -golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= -golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= -golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= diff --git a/hack/dev-samples.yaml b/hack/dev-samples.yaml index 61b9d21..b10e5ba 100644 --- a/hack/dev-samples.yaml +++ b/hack/dev-samples.yaml @@ -83,6 +83,7 @@ spec: query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 
10 --- diff --git a/hack/e2e-infra/prometheus-config.yaml b/hack/e2e-infra/prometheus-config.yaml index f731502..e483195 100644 --- a/hack/e2e-infra/prometheus-config.yaml +++ b/hack/e2e-infra/prometheus-config.yaml @@ -64,3 +64,43 @@ data: namespace: "production" pod: "myapp-xyz" expr: "209715200" + # Metrics for aggregation method e2e tests. + # Two images with multiple pods each so count() and sum() produce + # different rankings: + # alpine → 3 pods, values 100+200+300 → sum=600, count=3, avg=200, max=300 + # busybox → 1 pod, value 500 → sum=500, count=1, avg=500, max=500 + # With count(), alpine ranks higher (3 > 1). + # With sum(), alpine still ranks higher (600 > 500). + # With avg(), busybox ranks higher (500 > 200). + # With max(), busybox ranks higher (500 > 300). + - name: seed_aggregation_metrics + interval: 10s + rules: + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/alpine:3.19" + container: "worker" + namespace: "aggregation-test" + pod: "worker-aaa" + expr: "100" + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/alpine:3.19" + container: "worker" + namespace: "aggregation-test" + pod: "worker-bbb" + expr: "200" + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/alpine:3.19" + container: "worker" + namespace: "aggregation-test" + pod: "worker-ccc" + expr: "300" + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/busybox:1.36" + container: "init" + namespace: "aggregation-test" + pod: "init-ddd" + expr: "500" diff --git a/hack/gen-ai-docs/config.go b/hack/gen-ai-docs/config.go index 325bb60..645104f 100644 --- a/hack/gen-ai-docs/config.go +++ b/hack/gen-ai-docs/config.go @@ -33,6 +33,7 @@ func conventions() []Convention { {Rule: "Pod builder is a pure function in internal/podbuilder/ (no k8s client)", Scope: []string{"code"}}, {Rule: "Pacing logic lives exclusively in internal/pacing/", Scope: []string{"code"}}, {Rule: 
"Don't manually edit generated files — run make docs-gen", Scope: []string{"code"}}, + {Rule: "Documentation must never contain unverified information — verify all examples against a real cluster before merging", Scope: []string{"code"}}, } } diff --git a/internal/controller/discoverypolicy_controller.go b/internal/controller/discoverypolicy_controller.go index 016b34b..04aef91 100644 --- a/internal/controller/discoverypolicy_controller.go +++ b/internal/controller/discoverypolicy_controller.go @@ -246,7 +246,7 @@ func (r *DiscoveryPolicyReconciler) buildSource(ctx context.Context, src dropv1a if src.Prometheus.Lookback != nil { lookback = src.Prometheus.Lookback.Duration } - return discovery.NewPrometheusSource(src.Prometheus.Endpoint, src.Prometheus.Query, lookback, src.Prometheus.Step, httpClient), nil + return discovery.NewPrometheusSource(src.Prometheus.Endpoint, src.Prometheus.Query, lookback, string(src.Prometheus.AggregationMethod), src.Prometheus.Step, httpClient), nil case "registry": if src.Registry == nil { return nil, fmt.Errorf("registry config is required when type=registry") diff --git a/internal/discovery/prometheus.go b/internal/discovery/prometheus.go index c3b4a31..d3966a8 100644 --- a/internal/discovery/prometheus.go +++ b/internal/discovery/prometheus.go @@ -13,27 +13,32 @@ import ( // PrometheusSource queries Prometheus for image references. type PrometheusSource struct { - Endpoint string - Query string - Lookback time.Duration // 0 = instant query; >0 = query_range - Step string // resolution step for range queries (default "5m") - HTTPClient *http.Client + Endpoint string + Query string + Lookback time.Duration // 0 = instant query; >0 = query_range + AggregationMethod string // sum, count, avg, max (default: sum) + Step string // resolution step for range queries (default "5m") + HTTPClient *http.Client } // NewPrometheusSource creates a new Prometheus discovery source. 
-func NewPrometheusSource(endpoint, query string, lookback time.Duration, step string, httpClient *http.Client) *PrometheusSource { +func NewPrometheusSource(endpoint, query string, lookback time.Duration, aggregationMethod, step string, httpClient *http.Client) *PrometheusSource { if httpClient == nil { httpClient = &http.Client{Timeout: 30 * time.Second} } if step == "" { step = "5m" } + if aggregationMethod == "" { + aggregationMethod = "sum" + } return &PrometheusSource{ - Endpoint: endpoint, - Query: query, - Lookback: lookback, - Step: step, - HTTPClient: httpClient, + Endpoint: endpoint, + Query: query, + Lookback: lookback, + AggregationMethod: aggregationMethod, + Step: step, + HTTPClient: httpClient, } } @@ -109,8 +114,8 @@ func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) { var score int64 if p.Lookback > 0 { - // Range query: sum all values to get total usage score - score = sumRangeValues(r.Values) + // Range query: aggregate values according to configured method + score = aggregateRangeValues(r.Values, p.AggregationMethod) } else { // Instant query: use single value score = extractScore(r.Value) @@ -146,9 +151,13 @@ func extractScore(value []interface{}) int64 { return int64(score) } -// sumRangeValues sums all values from a query_range result to produce a total usage score. -func sumRangeValues(values [][]interface{}) int64 { +// aggregateRangeValues aggregates all values from a query_range result using the specified method. 
+func aggregateRangeValues(values [][]interface{}, method string) int64 { var total float64 + var max float64 + var count int64 + maxSet := false + for _, pair := range values { if len(pair) < 2 { continue @@ -158,9 +167,28 @@ func sumRangeValues(values [][]interface{}) int64 { continue } var v float64 - if _, err := fmt.Sscanf(strVal, "%f", &v); err == nil { - total += v + if _, err := fmt.Sscanf(strVal, "%f", &v); err != nil { + continue + } + total += v + count++ + if !maxSet || v > max { + max = v + maxSet = true + } + } + + switch method { + case "count": + return count + case "avg": + if count == 0 { + return 0 } + return int64(total / float64(count)) + case "max": + return int64(max) + default: // "sum" + return int64(total) } - return int64(total) } diff --git a/internal/discovery/prometheus_test.go b/internal/discovery/prometheus_test.go index 2110a02..ced5865 100644 --- a/internal/discovery/prometheus_test.go +++ b/internal/discovery/prometheus_test.go @@ -103,7 +103,7 @@ func TestPrometheusSource_Fetch(t *testing.T) { })) defer server.Close() - source := NewPrometheusSource(server.URL, "test_query", 0, "", server.Client()) + source := NewPrometheusSource(server.URL, "test_query", 0, "", "", server.Client()) results, err := source.Fetch(context.Background()) if tt.wantErr { diff --git a/knowledge.yaml b/knowledge.yaml index 0eaa619..3ead568 100644 --- a/knowledge.yaml +++ b/knowledge.yaml @@ -449,13 +449,19 @@ helperTypes: json: lookback type: '*metav1.Duration' required: false - doc: 'Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h"' + doc: 'Lookback is the time window for aggregation. 
When set, the operator uses query_range (start=now-lookback, end=now) and aggregates all returned values per image to produce a score. The aggregation function is controlled by the aggregationMethod field. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h"' + - name: AggregationMethod + json: aggregationMethod + type: AggregationMethod + required: false + default: sum + doc: 'AggregationMethod controls how data points from a range query are combined into a single score. Only used when lookback is set. Ignored for instant queries. Default: "sum". Options: "sum", "count", "avg", "max"' - name: Step json: step type: string required: false default: 5m - doc: 'Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m"' + doc: 'Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: "5m". Example: "1m", "15m"' - name: RegistrySource doc: RegistrySource defines OCI registry tag listing configuration for image discovery. 
fields: @@ -537,6 +543,9 @@ conventions: - rule: Don't manually edit generated files — run make docs-gen scope: - code + - rule: Documentation must never contain unverified information — verify all examples against a real cluster before merging + scope: + - code errors: - reason: Cached controller: CachedImage @@ -764,6 +773,7 @@ samples: | query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 10 --- diff --git a/llms-full.txt b/llms-full.txt index 50e6d8c..d83aa4c 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -181,8 +181,9 @@ PrometheusSource defines Prometheus query configuration for image discovery. |-------|------|------|----------|---------|-------------| | Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" | | Query | `query` | `string` | ✓ | | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) | -| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | -| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. 
Default: "5m". Example: "1m", "15m" | +| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and aggregates all returned values per image to produce a score. The aggregation function is controlled by the aggregationMethod field. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | +| AggregationMethod | `aggregationMethod` | `AggregationMethod` | — | `sum` | AggregationMethod controls how data points from a range query are combined into a single score. Only used when lookback is set. Ignored for instant queries. Default: "sum". Options: "sum", "count", "avg", "max" | +| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: "5m". Example: "1m", "15m" | ### RegistrySource @@ -332,6 +333,7 @@ spec: query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 10 --- diff --git a/test/e2e/discovery-aggregation/01-discoverypolicies.yaml b/test/e2e/discovery-aggregation/01-discoverypolicies.yaml new file mode 100644 index 0000000..e6a8719 --- /dev/null +++ b/test/e2e/discovery-aggregation/01-discoverypolicies.yaml @@ -0,0 +1,67 @@ +# Four DiscoveryPolicies, each using a different aggregationMethod. +# All query the same seed metrics (container_cpu_usage_seconds_total in namespace aggregation-test). +# Seed data: alpine has 3 pods (values 100, 200, 300), busybox has 1 pod (value 500). 
+--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-count +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + lookback: 1h + step: 5m + aggregationMethod: count + syncInterval: 30s + maxImages: 10 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-avg +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + lookback: 1h + step: 5m + aggregationMethod: avg + syncInterval: 30s + maxImages: 10 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-max +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + lookback: 1h + step: 5m + aggregationMethod: max + syncInterval: 30s + maxImages: 10 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-sum +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + lookback: 1h + step: 5m + aggregationMethod: sum + syncInterval: 30s + maxImages: 10 diff --git a/test/e2e/discovery-aggregation/02-assert-count.yaml b/test/e2e/discovery-aggregation/02-assert-count.yaml new file mode 100644 index 0000000..ee5e76b --- /dev/null +++ b/test/e2e/discovery-aggregation/02-assert-count.yaml @@ -0,0 +1,12 @@ +# Assert count aggregation: policy is Ready, both images discovered. +# count() by (image) returns alpine=3, busybox=1 at each step. 
+# aggregationMethod=count counts the number of non-zero data points (steps) per image. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-count +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/03-assert-avg.yaml b/test/e2e/discovery-aggregation/03-assert-avg.yaml new file mode 100644 index 0000000..ae09c4b --- /dev/null +++ b/test/e2e/discovery-aggregation/03-assert-avg.yaml @@ -0,0 +1,12 @@ +# Assert avg aggregation: policy is Ready, both images discovered. +# sum() by (image) returns alpine=600, busybox=500 at each step. +# aggregationMethod=avg averages the data-point values over the lookback window. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-avg +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/04-assert-max.yaml b/test/e2e/discovery-aggregation/04-assert-max.yaml new file mode 100644 index 0000000..2d240ef --- /dev/null +++ b/test/e2e/discovery-aggregation/04-assert-max.yaml @@ -0,0 +1,12 @@ +# Assert max aggregation: policy is Ready, both images discovered. +# sum() by (image) returns alpine=600, busybox=500 at each step. +# aggregationMethod=max takes the highest single data-point value. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-max +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/05-assert-sum.yaml b/test/e2e/discovery-aggregation/05-assert-sum.yaml new file mode 100644 index 0000000..af43f08 --- /dev/null +++ b/test/e2e/discovery-aggregation/05-assert-sum.yaml @@ -0,0 +1,12 @@ +# Assert sum (default) aggregation: policy is Ready, both images discovered. +# sum() by (image) returns alpine=600, busybox=500 at each step. 
+# aggregationMethod=sum adds all data-point values over the lookback window. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-sum +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/chainsaw-test.yaml b/test/e2e/discovery-aggregation/chainsaw-test.yaml new file mode 100644 index 0000000..279937d --- /dev/null +++ b/test/e2e/discovery-aggregation/chainsaw-test.yaml @@ -0,0 +1,83 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: discovery-aggregation-methods +spec: + description: | + Verify that DiscoveryPolicy aggregationMethod field works correctly against a + real Prometheus endpoint. Seeds use container_cpu_usage_seconds_total with two + images (alpine: 3 pods with values 100/200/300, busybox: 1 pod with value 500). 
+ + Expected rankings per method (scores aggregate the per-step query results): + count → tie when both images have samples at every step (equal number of data points) + avg → alpine first (600 > 500) + max → alpine first (600 > 500) + sum → alpine first (600 > 500 at each step) [default] + steps: + - name: Create DiscoveryPolicies with different aggregation methods + try: + - apply: + file: 01-discoverypolicies.yaml + - name: Assert count aggregation discovers images (equal data-point counts) + try: + - assert: + timeout: 90s + file: 02-assert-count.yaml + - name: Assert avg aggregation discovers images (alpine ranked first) + try: + - assert: + timeout: 90s + file: 03-assert-avg.yaml + - name: Assert max aggregation discovers images (alpine ranked first) + try: + - assert: + timeout: 90s + file: 04-assert-max.yaml + - name: Assert sum aggregation discovers images (alpine ranked first, default) + try: + - assert: + timeout: 90s + file: 05-assert-sum.yaml + - name: Verify aggregation scores are populated + try: + - script: + timeout: 30s + content: | + # Verify aggregation outputs are populated. + # Score relationships can vary with the number of data points and values + # returned by Prometheus in the lookback window. 
+ SUM_SCORE=$(kubectl get discoverypolicy e2e-agg-sum -o jsonpath='{.status.discoveredImages[0].score}') + AVG_SCORE=$(kubectl get discoverypolicy e2e-agg-avg -o jsonpath='{.status.discoveredImages[0].score}') + COUNT_SCORE=$(kubectl get discoverypolicy e2e-agg-count -o jsonpath='{.status.discoveredImages[0].score}') + MAX_SCORE=$(kubectl get discoverypolicy e2e-agg-max -o jsonpath='{.status.discoveredImages[0].score}') + + echo "Scores — sum:$SUM_SCORE avg:$AVG_SCORE count:$COUNT_SCORE max:$MAX_SCORE" + + if [ -z "$SUM_SCORE" ] || [ -z "$AVG_SCORE" ] || [ -z "$COUNT_SCORE" ] || [ -z "$MAX_SCORE" ]; then + echo "FAIL: expected non-empty scores for all aggregation methods" + exit 1 + fi + echo "OK: aggregation methods produced non-empty scores" + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-count + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-avg + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-max + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-sum