-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgpu.sh
More file actions
83 lines (73 loc) · 3.25 KB
/
gpu.sh
File metadata and controls
83 lines (73 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# gpu - concise NVIDIA GPU summary with processes, thermal, and module info
# Usage: gpu
#
# Outputs (stdout): driver and CUDA versions, one section per GPU (memory,
#   utilization, thermal, power, processes), the GPU count, and the names of
#   loaded kernel modules whose lsmod line mentions "nvidia".
# Returns: 1 (with a message on stderr) when nvidia-smi is not available;
#   otherwise the status of the final printf.
gpu() {
  if ! command -v nvidia-smi &>/dev/null; then
    echo "gpu: nvidia-smi not found" >&2
    return 1
  fi

  # Every name assigned below is local so that calling gpu from an
  # interactive shell does not leave idx, name, fan, ... behind.
  local driver cuda modules pmon_raw procs thermal v val query
  local idx name mem_total mem_used mem_free util_gpu util_mem
  local temp_gpu temp_mem fan pstate pwr_draw pwr_limit
  local count=0

  # Driver and CUDA version
  driver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader \
    | head -n 1 | tr -d ' ')
  cuda=$(nvidia-smi 2>/dev/null \
    | grep -o "CUDA Version: [0-9.]*" | awk '{print $3}')
  printf "%-10s %s\n" "driver:" "$driver"
  printf "%-10s %s\n" "cuda:" "${cuda:-n/a}"
  echo

  # Sample the process monitor once; each GPU section filters this snapshot
  # instead of spawning nvidia-smi again. Best effort: a failure simply
  # yields "processes: none" for every GPU.
  pmon_raw=$(nvidia-smi pmon -s m -c 1 2>/dev/null) || true

  # Field list kept in a variable so the --query-gpu argument stays a single
  # word regardless of how this function is indented.
  query="index,name,memory.total,memory.used,memory.free"
  query+=",utilization.gpu,utilization.memory"
  query+=",temperature.gpu,temperature.memory"
  query+=",fan.speed,pstate,power.draw,power.limit"

  # Per-GPU info
  while IFS=, read -r idx name mem_total mem_used mem_free util_gpu util_mem \
    temp_gpu temp_mem fan pstate pwr_draw pwr_limit; do
    # Trim leading and trailing whitespace from every CSV field.
    for v in idx name mem_total mem_used mem_free util_gpu util_mem \
      temp_gpu temp_mem fan pstate pwr_draw pwr_limit; do
      val=${!v}
      val=${val#"${val%%[![:space:]]*}"}
      val=${val%"${val##*[![:space:]]}"}
      printf -v "$v" '%s' "$val"
    done

    printf " [%s] %s %s\n" "$idx" "$name" "$pstate"
    printf " %-10s %s MiB total / %s MiB used / %s MiB free\n" \
      "memory:" "$mem_total" "$mem_used" "$mem_free"
    printf " %-10s %s%% gpu / %s%% mem\n" "util:" "$util_gpu" "$util_mem"

    # Thermal: memory temperature and fan speed are shown only when reported;
    # a fan reported as [N/A] is labelled "(passive)".
    thermal="${temp_gpu}°C gpu"
    if [[ -n "$temp_mem" && "$temp_mem" != "[N/A]" ]]; then
      thermal+=" / ${temp_mem}°C mem"
    fi
    if [[ "$fan" == "[N/A]" ]]; then
      thermal+=" (passive)"
    elif [[ -n "$fan" ]]; then
      thermal+=" / fan ${fan}%"
    fi
    printf " %-10s %s\n" "thermal:" "$thermal"
    printf " %-10s %s W / %s W limit\n" "power:" "$pwr_draw" "$pwr_limit"

    # Processes on this GPU: keep pmon rows whose first column is this GPU
    # index and whose type column is not "-" (the placeholder row for an idle
    # GPU). Header lines are recognised by their leading "#".
    procs=$(awk -v gpu="$idx" '
      !/^#/ && NF && $1 == gpu && $3 !~ /-/ {
        name = $NF
        if (length(name) > 30) name = substr(name, 1, 29) ">"
        printf " PID %-8s %-30s %s MiB\n", $2, name, $4
      }' <<<"$pmon_raw")
    if [[ -n "$procs" ]]; then
      echo " processes:"
      echo "$procs"
    else
      echo " processes: none"
    fi
    echo

    # Plain assignment: (( count++ )) has exit status 1 when count is 0,
    # which would abort the function under `set -e`.
    count=$((count + 1))
  done < <(nvidia-smi --query-gpu="$query" \
    --format=csv,noheader,nounits 2>/dev/null)

  printf "%-10s %d\n" "total:" "$count"
  echo

  # Kernel modules
  modules=$(lsmod 2>/dev/null | awk 'NR>1 && /nvidia/ {printf "%s ", $1}')
  printf "%-10s %s\n" "modules:" "${modules:-none}"
}