Skip to content

Commit 1478448

Browse files
committed
feat(metrics): add scheduler attempt counter and outcome helper
Signed-off-by: CYJiang <googs1025@gmail.com>
1 parent 25cfb90 commit 1478448

File tree

4 files changed

+88
-3
lines changed

4 files changed

+88
-3
lines changed

pkg/epp/metrics/metrics.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,16 @@ var (
299299
[]string{},
300300
)
301301

302+
// SchedulerAttemptsTotal counts the total number of scheduling attempts, labeled by status.
303+
SchedulerAttemptsTotal = prometheus.NewCounterVec(
304+
prometheus.CounterOpts{
305+
Subsystem: InferenceExtension,
306+
Name: "scheduler_attempts_total",
307+
Help: metricsutil.HelpMsgWithStability("Total number of scheduling attempts.", compbasemetrics.ALPHA),
308+
},
309+
[]string{"status"}, // "success", "failure"
310+
)
311+
302312
PluginProcessingLatencies = prometheus.NewHistogramVec(
303313
prometheus.HistogramOpts{
304314
Subsystem: InferenceExtension,
@@ -409,6 +419,7 @@ func Register(customCollectors ...prometheus.Collector) {
409419
metrics.Registry.MustRegister(inferencePoolAvgQueueSize)
410420
metrics.Registry.MustRegister(inferencePoolReadyPods)
411421
metrics.Registry.MustRegister(SchedulerE2ELatency)
422+
metrics.Registry.MustRegister(SchedulerAttemptsTotal)
412423
metrics.Registry.MustRegister(PluginProcessingLatencies)
413424
metrics.Registry.MustRegister(InferenceExtensionInfo)
414425
metrics.Registry.MustRegister(PrefixCacheSize)
@@ -453,6 +464,7 @@ func Reset() {
453464
inferencePoolAvgQueueSize.Reset()
454465
inferencePoolReadyPods.Reset()
455466
SchedulerE2ELatency.Reset()
467+
SchedulerAttemptsTotal.Reset()
456468
PluginProcessingLatencies.Reset()
457469
InferenceExtensionInfo.Reset()
458470
PrefixCacheSize.Reset()
@@ -462,7 +474,7 @@ func Reset() {
462474
flowControlQueueSize.Reset()
463475
}
464476

465-
// RecordRequstCounter records the number of requests.
477+
// RecordRequestCounter records the number of requests.
466478
func RecordRequestCounter(modelName, targetModelName string) {
467479
requestCounter.WithLabelValues(modelName, targetModelName).Inc()
468480
}
@@ -684,6 +696,28 @@ func RecordSchedulerE2ELatency(duration time.Duration) {
684696
SchedulerE2ELatency.WithLabelValues().Observe(duration.Seconds())
685697
}
686698

699+
// RecordSchedulerAttempt records a scheduling attempt with status.
700+
func RecordSchedulerAttempt(status string) {
701+
SchedulerAttemptsTotal.WithLabelValues(status).Inc()
702+
}
703+
704+
// Status label values passed to RecordSchedulerAttempt.
const (
	// SchedulerStatusSuccess is the status value for an attempt that completed without error.
	SchedulerStatusSuccess = "success"
	// SchedulerStatusFailure is the status value for an attempt that ended with an error.
	SchedulerStatusFailure = "failure"
)
708+
709+
// RecordSchedulingOutcome records metrics at the end of a scheduling attempt,
710+
// including latency, attempt status.
711+
func RecordSchedulingOutcome(duration time.Duration, err error) {
712+
RecordSchedulerE2ELatency(duration)
713+
714+
if err != nil {
715+
RecordSchedulerAttempt(SchedulerStatusFailure)
716+
} else {
717+
RecordSchedulerAttempt(SchedulerStatusSuccess)
718+
}
719+
}
720+
687721
// RecordPluginProcessingLatency records the processing latency for a plugin.
688722
func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration) {
689723
PluginProcessingLatencies.WithLabelValues(extensionPoint, pluginType, pluginName).Observe(duration.Seconds())

pkg/epp/metrics/metrics_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,50 @@ func TestSchedulerE2ELatency(t *testing.T) {
684684
}
685685
}
686686

687+
func TestSchedulerAttemptsTotal(t *testing.T) {
688+
689+
scenarios := []struct {
690+
name string
691+
successCount int
692+
failureCount int
693+
}{
694+
{
695+
name: "mixed success and failure attempts",
696+
successCount: 10,
697+
failureCount: 5,
698+
},
699+
}
700+
701+
for _, scenario := range scenarios {
702+
t.Run(scenario.name, func(t *testing.T) {
703+
Reset()
704+
for i := 0; i < scenario.successCount; i++ {
705+
RecordSchedulerAttempt(SchedulerStatusSuccess)
706+
}
707+
for i := 0; i < scenario.failureCount; i++ {
708+
RecordSchedulerAttempt(SchedulerStatusFailure)
709+
}
710+
711+
wantMetrics, err := os.Open("testdata/scheduler_attempts_total_metrics")
712+
defer func() {
713+
if err = wantMetrics.Close(); err != nil {
714+
t.Error(err)
715+
}
716+
}()
717+
if err != nil {
718+
t.Fatal(err)
719+
}
720+
if err := testutil.GatherAndCompare(
721+
metrics.Registry,
722+
wantMetrics,
723+
"inference_extension_scheduler_attempts_total",
724+
); err != nil {
725+
t.Errorf("metric comparison failed: %v", err)
726+
}
727+
})
728+
}
729+
}
730+
687731
func TestPrefixCacheMetrics(t *testing.T) {
688732
Reset()
689733
const (
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
2+
# TYPE inference_extension_scheduler_attempts_total counter
3+
inference_extension_scheduler_attempts_total{status="failure"} 5
4+
inference_extension_scheduler_attempts_total{status="success"} 10

pkg/epp/scheduling/scheduler.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,11 @@ func (s *Scheduler) Schedule(ctx context.Context, request *types.LLMRequest, can
4848
loggerVerbose := log.FromContext(ctx).V(logutil.VERBOSE)
4949

5050
scheduleStart := time.Now()
51+
var result *types.SchedulingResult
52+
var err error
53+
5154
defer func() {
52-
metrics.RecordSchedulerE2ELatency(time.Since(scheduleStart))
55+
metrics.RecordSchedulingOutcome(time.Since(scheduleStart), err)
5356
}()
5457

5558
profileRunResults := map[string]*types.ProfileRunResult{}
@@ -85,7 +88,7 @@ func (s *Scheduler) Schedule(ctx context.Context, request *types.LLMRequest, can
8588

8689
loggerVerbose.Info("Running profile handler, ProcessResults", "plugin", s.profileHandler.TypedName())
8790
before := time.Now()
88-
result, err := s.profileHandler.ProcessResults(ctx, cycleState, request, profileRunResults)
91+
result, err = s.profileHandler.ProcessResults(ctx, cycleState, request, profileRunResults)
8992
metrics.RecordPluginProcessingLatency(framework.ProcessProfilesResultsExtensionPoint, s.profileHandler.TypedName().Type, s.profileHandler.TypedName().Name, time.Since(before))
9093
loggerVerbose.Info("Completed running profile handler ProcessResults successfully", "plugin", s.profileHandler.TypedName())
9194

0 commit comments

Comments
 (0)