Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ const (
ClusterPolicyCRDName = "ClusterPolicy"
// DefaultDCGMJobMappingDir is the default directory for DCGM Exporter HPC job mapping files
DefaultDCGMJobMappingDir = "/var/lib/dcgm-exporter/job-mapping"
// DefaultDCGMPodResourcesSocket is the default kubelet pod-resources socket path
DefaultDCGMPodResourcesSocket = "/var/lib/kubelet/pod-resources/kubelet.sock"
)

// ClusterPolicySpec defines the desired state of ClusterPolicy
Expand Down Expand Up @@ -969,6 +971,38 @@ type DCGMExporterSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"`

// Optional: Per-pod GPU utilization metrics for CUDA time-slicing workloads.
// When enabled, dcgm-exporter emits dcgm_fi_dev_sm_util_per_pod gauges that
// attribute SM utilization to individual pods sharing a GPU via time-slicing.
// Requires dcgm-exporter v3.4.0+ built with --enable-per-pod-gpu-util support.
// See: https://github.com/NVIDIA/dcgm-exporter/issues/587
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Per-Pod GPU Utilization Metrics"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
PerPodGPUUtil *DCGMExporterPerPodGPUUtilConfig `json:"perPodGPUUtil,omitempty"`
}

// DCGMExporterPerPodGPUUtilConfig configures per-pod GPU SM utilization metrics.
// This feature is useful when CUDA time-slicing is active and multiple pods share
// one physical GPU — standard per-device metrics lose per-workload attribution.
type DCGMExporterPerPodGPUUtilConfig struct {
	// Enable per-pod GPU utilization collection via NVML process utilization API.
	// When nil or unset, the feature is disabled.
	// Requires hostPID: true (automatically set when enabled).
	// +kubebuilder:validation:Optional
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable Per-Pod GPU Utilization"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
	Enabled *bool `json:"enabled,omitempty"`

	// PodResourcesSocketPath is the path to the kubelet pod-resources gRPC socket.
	// When empty, DefaultDCGMPodResourcesSocket
	// (/var/lib/kubelet/pod-resources/kubelet.sock) is used.
	// +kubebuilder:validation:Optional
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Pod Resources Socket Path"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
	PodResourcesSocketPath string `json:"podResourcesSocketPath,omitempty"`
}

// DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter
Expand Down Expand Up @@ -2101,6 +2135,24 @@ func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string {
return e.HPCJobMapping.Directory
}

// IsPerPodGPUUtilEnabled reports whether per-pod GPU utilization metrics are
// enabled. The feature is considered off unless both the perPodGPUUtil config
// block and its Enabled flag are explicitly set. It attributes SM utilization
// to individual pods when CUDA time-slicing is active.
func (e *DCGMExporterSpec) IsPerPodGPUUtilEnabled() bool {
	cfg := e.PerPodGPUUtil
	if cfg != nil && cfg.Enabled != nil {
		return *cfg.Enabled
	}
	return false
}

// GetPerPodGPUUtilSocketPath returns the kubelet pod-resources socket path used
// for per-pod GPU utilization. A user-configured PodResourcesSocketPath takes
// precedence; otherwise DefaultDCGMPodResourcesSocket is returned.
func (e *DCGMExporterSpec) GetPerPodGPUUtilSocketPath() string {
	if cfg := e.PerPodGPUUtil; cfg != nil && cfg.PodResourcesSocketPath != "" {
		return cfg.PodResourcesSocketPath
	}
	return DefaultDCGMPodResourcesSocket
}

// IsEnabled returns true if gpu-feature-discovery is enabled(default) through gpu-operator
func (g *GPUFeatureDiscoverySpec) IsEnabled() bool {
if g.Enabled == nil {
Expand Down
25 changes: 25 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1785,6 +1785,39 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, jobMappingVol)
}

// configure per-pod GPU utilization metrics when enabled (for CUDA time-slicing workloads)
// See: https://github.com/NVIDIA/dcgm-exporter/issues/587
if config.DCGMExporter.IsPerPodGPUUtilEnabled() {
// enable the feature flag in dcgm-exporter
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL", "true")

// resolve pod→GPU mapping via kubelet pod-resources gRPC API
socketPath := config.DCGMExporter.GetPerPodGPUUtilSocketPath()
socketDir := socketPath[:strings.LastIndex(socketPath, "/")]

podResourcesVolMount := corev1.VolumeMount{
Name: "pod-resources",
ReadOnly: true,
MountPath: socketDir,
}
obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(
obj.Spec.Template.Spec.Containers[0].VolumeMounts, podResourcesVolMount)

podResourcesVol := corev1.Volume{
Name: "pod-resources",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: socketDir,
Type: ptr.To(corev1.HostPathDirectory),
},
},
}
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, podResourcesVol)

// per-pod attribution requires resolving PIDs via /proc/<pid>/cgroup
obj.Spec.Template.Spec.HostPID = true
}

// mount configmap for custom metrics if provided by user
if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}
Expand Down
142 changes: 142 additions & 0 deletions docs/dcgm-exporter-per-pod-gpu-metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Per-Pod GPU Utilization with DCGM Exporter (Time-Slicing)

## Overview

When GPU time-slicing is enabled via `ClusterPolicy`, multiple pods share a
single physical GPU device. Standard DCGM metrics report aggregate utilization
for the whole device — `dcgm_fi_dev_gpu_util` cannot distinguish how much of
the GPU the proxy, embedding, or inference pods are each using.

GPU Operator v24.x+ integrates with dcgm-exporter's per-pod GPU utilization
feature to restore workload-level attribution without requiring MIG.

## Prerequisite: dcgm-exporter v3.4.0+

This feature requires dcgm-exporter v3.4.0 or later, which adds the
`--enable-per-pod-gpu-util` flag and `dcgm_fi_dev_sm_util_per_pod` metric.

See: [NVIDIA/dcgm-exporter#587](https://github.com/NVIDIA/dcgm-exporter/issues/587)

## Enabling Time-Slicing + Per-Pod Metrics

A complete `ClusterPolicy` for a T4 cluster running three shared workloads:

```yaml
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
name: gpu-cluster-policy
spec:
# 1. Configure time-slicing: 3 virtual slices per physical GPU
devicePlugin:
config:
name: time-slicing-config
default: any

# 2. Enable per-pod GPU utilization metrics in dcgm-exporter
dcgmExporter:
perPodGPUUtil:
enabled: true
# Optional: custom path (default: /var/lib/kubelet/pod-resources/kubelet.sock)
# podResourcesSocketPath: /var/lib/kubelet/pod-resources/kubelet.sock
```

The time-slicing ConfigMap referenced above must be deployed separately:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: time-slicing-config
namespace: gpu-operator
data:
any: |-
version: v1
flags:
migStrategy: none
sharing:
timeSlicing:
replicas: 3
renameByDefault: false
resources:
- name: nvidia.com/gpu
replicas: 3
```

## What GPU Operator does automatically

When `dcgmExporter.perPodGPUUtil.enabled: true` is set, GPU Operator:

1. Sets `DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL=true` in the dcgm-exporter
DaemonSet environment.
2. Mounts the directory containing the pod-resources socket (by default
   `/var/lib/kubelet/pod-resources/`) as a read-only `hostPath` volume so
   dcgm-exporter can reach the kubelet pod-resources gRPC socket.
3. Sets `hostPID: true` on the DaemonSet so dcgm-exporter can read
`/proc/<pid>/cgroup` to resolve NVML PIDs to containers.

## Emitted metric

```
# HELP dcgm_fi_dev_sm_util_per_pod SM utilization attributed to a pod (time-slicing)
# TYPE dcgm_fi_dev_sm_util_per_pod gauge
dcgm_fi_dev_sm_util_per_pod{
gpu="0",
uuid="GPU-abc123",
pod="synapse-proxy-7f9d4b-xkz2p",
namespace="synapse-staging",
container="proxy"
} 42
dcgm_fi_dev_sm_util_per_pod{...,pod="synapse-jina-...",container="jina"} 18
dcgm_fi_dev_sm_util_per_pod{...,pod="synapse-vllm-...",container="vllm"} 35
```

## Example Prometheus alert

```yaml
groups:
- name: per-pod-gpu
rules:
- alert: PodGPUHighUtilization
expr: dcgm_fi_dev_sm_util_per_pod > 80
for: 5m
labels:
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.pod }} using >80% GPU SM"
```

## Cost model (example: g4dn.xlarge T4)

| Setup | Nodes | Cost/day |
|-------|-------|----------|
| 3 workloads, no time-slicing | 3 × g4dn.xlarge | ~$38/day |
| 3 workloads, time-slicing (3 replicas) | 1 × g4dn.xlarge | ~$13/day |
| **Savings** | | **~$25/day (~$9,000/year)** |

Time-slicing is appropriate for inference + embedding workloads that do not
fully saturate the GPU. For compute-bound training workloads, MIG or dedicated
GPUs remain the right choice.

## Security considerations

Enabling `perPodGPUUtil` grants dcgm-exporter:
- Read access to `/var/lib/kubelet/pod-resources/` (lists all GPU-using pods)
- Host PID namespace access (to read `/proc/<pid>/cgroup`)

These are the same permissions used by other node-level monitoring agents
(e.g., node-exporter, cAdvisor). Review your security policy before enabling
in sensitive environments.

## Compatibility

| GPU Operator | dcgm-exporter | Feature available |
|-------------|---------------|-------------------|
| < v24.x | any | No |
| ≥ v24.x | < v3.4.0 | Field accepted but no-op |
| ≥ v24.x | ≥ v3.4.0 | Yes |

## Related

- dcgm-exporter feature: [docs/per-pod-gpu-metrics.md](https://github.com/NVIDIA/dcgm-exporter/blob/main/docs/per-pod-gpu-metrics.md)
- Time-slicing setup: [GPU Sharing with Time-Slicing](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html)
- Issue: [NVIDIA/dcgm-exporter#587](https://github.com/NVIDIA/dcgm-exporter/issues/587)