From 451c8043f88cf47de3d98a8dc1d33c0876aafd4b Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 5 Dec 2025 23:01:57 +0000 Subject: [PATCH 1/2] Add running requests scorer and tests --- cmd/epp/runner/runner.go | 1 + .../inferencepool/templates/epp-config.yaml | 4 + pkg/epp/metrics/metrics.go | 1 + .../framework/plugins/scorer/running.go | 104 ++++++++++++++++++ .../framework/plugins/scorer/running_test.go | 85 ++++++++++++++ 5 files changed, 195 insertions(+) create mode 100644 pkg/epp/scheduling/framework/plugins/scorer/running.go create mode 100644 pkg/epp/scheduling/framework/plugins/scorer/running_test.go diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go index f02a36557..da98396c7 100644 --- a/cmd/epp/runner/runner.go +++ b/cmd/epp/runner/runner.go @@ -432,6 +432,7 @@ func (r *Runner) registerInTreePlugins() { plugins.Register(profile.SingleProfileHandlerType, profile.SingleProfileHandlerFactory) plugins.Register(scorer.KvCacheUtilizationScorerType, scorer.KvCacheUtilizationScorerFactory) plugins.Register(scorer.QueueScorerType, scorer.QueueScorerFactory) + plugins.Register(scorer.RunningQueueSizeScorerType, scorer.RunningQueueSizeScorerFactory) plugins.Register(scorer.LoraAffinityScorerType, scorer.LoraAffinityScorerFactory) // Latency predictor plugins plugins.Register(slo_aware_router.SLOAwareRouterPluginType, slo_aware_router.SLOAwareRouterFactory) diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index e082db793..9682de468 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -9,6 +9,7 @@ data: kind: EndpointPickerConfig plugins: - type: queue-scorer + - type: running-queue-size-scorer - type: kv-cache-utilization-scorer - type: prefix-cache-scorer {{- if .Values.inferenceExtension.latencyPredictor.enabled }} @@ -45,6 +46,7 @@ data: - pluginRef: predicted-latency-scorer weight: 0 - pluginRef: queue-scorer + - pluginRef: running-queue-size-scorer - pluginRef: kv-cache-utilization-scorer - name: predicted-latency-routing plugins: @@ -54,6 +56,8 @@ data: plugins: - pluginRef: queue-scorer weight: 2 + - pluginRef: running-queue-size-scorer + weight: 2 - pluginRef: kv-cache-utilization-scorer weight: 2 - pluginRef: prefix-cache-scorer diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go index 44c3be87d..0b41be647 100644 --- a/pkg/epp/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -37,6 +37,7 @@ const ( KVCacheUsagePercentKey = "KVCacheUsagePercent" WaitingQueueSizeKey = "WaitingQueueSize" + RunningQueueSizeKey = "RunningQueueSize" MaxActiveModelsKey = "MaxActiveModels" ActiveModelsKey = "ActiveModels" WaitingModelsKey = "WaitingModels" diff --git a/pkg/epp/scheduling/framework/plugins/scorer/running.go b/pkg/epp/scheduling/framework/plugins/scorer/running.go new file mode 100644 index 000000000..5b586f8e8 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/scorer/running.go @@ -0,0 +1,104 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "context" + "encoding/json" + "math" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +const ( + RunningQueueSizeScorerType = "running-queue-size-scorer" +) + +// compile-time type assertion +var _ framework.Scorer = &RunningQueueSizeScorer{} + +// RunningQueueSizeScorerFactory defines the factory function for RunningQueueSizeScorer. +func RunningQueueSizeScorerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return NewRunningQueueSizeScorer().WithName(name), nil +} + +// NewRunningQueueSizeScorer initializes a new RunningQueueSizeScorer and returns its pointer. +func NewRunningQueueSizeScorer() *RunningQueueSizeScorer { + return &RunningQueueSizeScorer{ + typedName: plugins.TypedName{Type: RunningQueueSizeScorerType, Name: RunningQueueSizeScorerType}, + } +} + +// RunningQueueSizeScorer scores list of candidate pods based on the pod's running request size. +// the less running request size the pod has, the higher score it will get (since it's more available to serve new request). +type RunningQueueSizeScorer struct { + typedName plugins.TypedName +} + +// TypedName returns the type and name tuple of this plugin instance. +func (s *RunningQueueSizeScorer) TypedName() plugins.TypedName { + return s.typedName +} + +// Consumes returns the list of data that is consumed by the plugin. +func (s *RunningQueueSizeScorer) Consumes() map[string]any { + return map[string]any{ + metrics.RunningQueueSizeKey: int(0), + } +} + +// WithName sets the name of the scorer. +func (s *RunningQueueSizeScorer) WithName(name string) *RunningQueueSizeScorer { + s.typedName.Name = name + return s +} + +// Score returns the scoring result for the given list of pods based on context. +func (s *RunningQueueSizeScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { + minQueueSize := math.MaxInt + maxQueueSize := math.MinInt + + // Iterate through the remaining pods to find min and max + for _, pod := range pods { + queueSize := pod.GetMetrics().RunningQueueSize + if queueSize < minQueueSize { + minQueueSize = queueSize + } + if queueSize > maxQueueSize { + maxQueueSize = queueSize + } + } + + // podScoreFunc calculates the score based on the queue size of each pod. Longer queue gets a lower score. + podScoreFunc := func(pod types.Pod) float64 { + if maxQueueSize == minQueueSize { + // If all pods have the same queue size, return a neutral score + return 1.0 + } + return float64(maxQueueSize-pod.GetMetrics().RunningQueueSize) / float64(maxQueueSize-minQueueSize) + } + + // Create a map to hold the scores for each pod + scores := make(map[types.Pod]float64, len(pods)) + for _, pod := range pods { + scores[pod] = podScoreFunc(pod) + } + return scores +} diff --git a/pkg/epp/scheduling/framework/plugins/scorer/running_test.go b/pkg/epp/scheduling/framework/plugins/scorer/running_test.go new file mode 100644 index 000000000..653291631 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/scorer/running_test.go @@ -0,0 +1,85 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +func TestRunningQueueSizeScorer(t *testing.T) { + tests := []struct { + name string + pods []types.Pod + expectedScoresPod map[int]float64 // Map of pod index to expected score + }{ + { + name: "Different running queue sizes", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{RunningQueueSize: 10}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{RunningQueueSize: 5}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{RunningQueueSize: 0}}, + }, + expectedScoresPod: map[int]float64{ + 0: 0.0, // Longest queue (10) gets lowest score + 1: 0.5, // Medium queue (5) gets medium score + 2: 1.0, // Shortest queue (0) gets highest score + }, + }, + { + name: "Same running queue sizes", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{RunningQueueSize: 5}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{RunningQueueSize: 5}}, + }, + expectedScoresPod: map[int]float64{ + 0: 1.0, // When all pods have the same queue size, they get the same neutral score + 1: 1.0, + }, + }, + { + name: "Zero running queue sizes", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{RunningQueueSize: 0}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{RunningQueueSize: 0}}, + }, + expectedScoresPod: map[int]float64{ + 0: 1.0, + 1: 1.0, + }, + }, + } + + scorer := &RunningQueueSizeScorer{} + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + scores := scorer.Score(context.Background(), types.NewCycleState(), &types.LLMRequest{}, test.pods) + + for i, pod := range test.pods { + expectedScore := test.expectedScoresPod[i] + assert.InDelta(t, expectedScore, scores[pod], 0.0001, "Pod %d should have score %f", i, expectedScore) + } + }) + } +} From fa6ad7ccfb8b796537d7d47b0c650c7298b344ba Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Sat, 6 Dec 2025 00:59:26 +0000 Subject: [PATCH 2/2] Remove running request scorer from epp config --- config/charts/inferencepool/templates/epp-config.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index 9682de468..e082db793 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -9,7 +9,6 @@ data: kind: EndpointPickerConfig plugins: - type: queue-scorer - - type: running-queue-size-scorer - type: kv-cache-utilization-scorer - type: prefix-cache-scorer {{- if .Values.inferenceExtension.latencyPredictor.enabled }} @@ -46,7 +45,6 @@ data: - pluginRef: predicted-latency-scorer weight: 0 - pluginRef: queue-scorer - - pluginRef: running-queue-size-scorer - pluginRef: kv-cache-utilization-scorer - name: predicted-latency-routing plugins: @@ -56,8 +54,6 @@ data: plugins: - pluginRef: queue-scorer weight: 2 - - pluginRef: running-queue-size-scorer - weight: 2 - pluginRef: kv-cache-utilization-scorer weight: 2 - pluginRef: prefix-cache-scorer