-
Notifications
You must be signed in to change notification settings - Fork 29
[Optimization 3/n] Add GPU Spec Module #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
07a3268
3c4b124
11f4e79
251f419
b789660
4d35d57
d871678
db0c754
cd29759
bbfa6cd
543453a
706c9cc
4febdd6
d92a7b7
2994315
1378fc3
45fec80
b640cde
e952123
e7ba29a
72ac4d1
e2c599e
8ab907c
e350802
8541299
313a84f
9e608ac
f3220e1
4443f33
b12b138
f34ddc8
d47113c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| Diagnose Prompt Module for Hardware Bottleneck Analysis. | ||
| """ | ||
|
|
||
| from .gpu_specs import get_gpu_specs | ||
|
|
||
|
|
||
| __all__ = ["get_gpu_specs"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| GPU Specifications Database for Bottleneck Analysis | ||
|
|
||
| This module provides GPU hardware specifications needed for performance analysis | ||
| and bottleneck identification. It includes peak compute performance, memory bandwidth, | ||
| cache sizes, and SM counts for common NVIDIA GPUs. | ||
|
|
||
| """ | ||
|
|
||
| import logging | ||
| from typing import Any | ||
|
|
||
| from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import ( | ||
| GPU_SPECS_DATABASE, | ||
| ) | ||
|
|
||
| __all__ = ["GPU_SPECS_DATABASE", "get_gpu_specs"] | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def get_gpu_specs(gpu_name: str) -> dict[str, Any] | None:
    """
    Look up GPU hardware specifications for bottleneck analysis.

    Provides the figures needed for performance analysis: peak compute
    performance, memory bandwidth, cache sizes, and SM counts.

    Args:
        gpu_name: GPU name. Must exactly match a key in GPU_SPECS_DATABASE.

    Returns:
        Dictionary with GPU specifications, or None if GPU is not in the database.
        When successful, contains:
            - name: GPU name
            - architecture: GPU architecture (e.g., "Ampere", "Hopper")
            - peak_fp32_tflops: Peak FP32 compute performance in TFLOPS
            - peak_fp16_tflops: Peak FP16 compute performance in TFLOPS
            - peak_bf16_tflops: Peak BF16 compute performance in TFLOPS (0 if not supported)
            - peak_memory_bw_gbps: Peak memory bandwidth in GB/s
            - sm_count: Number of streaming multiprocessors
            - max_threads_per_sm: Maximum threads per SM
            - l1_cache_kb: L1 cache size in KB per SM
            - l2_cache_mb: Total L2 cache size in MB
            - memory_gb: Total GPU memory in GB
            - memory_type: Memory type (e.g., "HBM2e", "GDDR6X")

    Examples:
        >>> specs = get_gpu_specs("NVIDIA A100 SXM4 80GB")
        >>> if specs:
        ...     print(f"SM Count: {specs['sm_count']}")
    """
    # Single dict lookup instead of a membership test followed by indexing.
    entry = GPU_SPECS_DATABASE.get(gpu_name)
    if entry is not None:
        # Return a shallow copy so callers cannot mutate the shared database
        # entry (values are flat scalars/strings, so shallow is sufficient).
        return entry.copy()

    logger.warning(
        "Unknown GPU: '%s'. Disable Optimization. Available GPUs: %s",
        gpu_name,
        ", ".join(GPU_SPECS_DATABASE.keys()),
    )
    return None
|
|
||
|
|
||
if __name__ == "__main__":
    # Simple self-demo: list the known GPUs, then show one example lookup.
    separator = "=" * 60
    print("GPU Specifications Module")
    print(separator)

    # Show all available GPUs
    print("Available GPU specifications in database:")
    for known_gpu in sorted(GPU_SPECS_DATABASE):
        print(f" - {known_gpu}")

    # Example usage
    print(f"\n{separator}")
    example_gpu = "NVIDIA A100 SXM4 80GB"
    example_specs = get_gpu_specs(example_gpu)
    if example_specs:
        print(f"\nExample specs for {example_gpu}:")
        print(f" - Peak Memory Bandwidth: {example_specs['peak_memory_bw_gbps']} GB/s")
        print(f" - Peak FP32 Performance: {example_specs['peak_fp32_tflops']} TFLOPS")
        print(f" - SM Count: {example_specs['sm_count']}")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,182 @@ | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| GPU Specifications Database - Updated with Specific SKUs | ||
|
|
||
| This module contains the GPU hardware specifications database used for | ||
| performance analysis and bottleneck identification. Updated to include | ||
| specific SKU variants for multi-SKU GPUs like A100 and H100. | ||
|
|
||
| Sources: | ||
| - NVIDIA official specifications and datasheets | ||
| - TechPowerUp GPU Database | ||
| - Manufacturer datasheets | ||
|
|
||
| Last Updated: January 2026 | ||
| """ | ||
|
|
||
# Each entry maps an exact GPU product name to its hardware specification.
# Units: TFLOPS for compute peaks, GB/s for memory bandwidth, KB/MB/GB for
# caches and memory, watts for TDP. FP16/BF16 peaks are quoted without
# sparsity, per the sources listed in the module docstring.
GPU_SPECS_DATABASE: dict[str, dict[str, object]] = {
    # NVIDIA A100 SKUs - SXM4 Variants
    "NVIDIA A100 SXM4 40GB": {
        "name": "NVIDIA A100 SXM4 40GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 1555,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 40,
        "memory_type": "HBM2e",
        "form_factor": "SXM4",
        "tdp_w": 400,
    },
    "NVIDIA A100 SXM4 80GB": {
        "name": "NVIDIA A100 SXM4 80GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 2039,  # 80GB SXM4 has faster HBM2e than 40GB (1555)
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 80,
        "memory_type": "HBM2e",
        "form_factor": "SXM4",
        "tdp_w": 400,
    },
    # NVIDIA A100 SKUs - PCIe Variants
    "NVIDIA A100 PCIe 40GB": {
        "name": "NVIDIA A100 PCIe 40GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 1555,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 40,
        "memory_type": "HBM2e",
        "form_factor": "PCIe",
        "tdp_w": 250,
    },
    "NVIDIA A100 PCIe 80GB": {
        "name": "NVIDIA A100 PCIe 80GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 1935,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 80,
        "memory_type": "HBM2e",
        "form_factor": "PCIe",
        "tdp_w": 300,
    },
    # NVIDIA H100 SKUs - SXM5 Variant
    "NVIDIA H100 SXM5 80GB": {
        "name": "NVIDIA H100 SXM5 80GB",
        "architecture": "Hopper",
        "peak_fp32_tflops": 67.0,
        "peak_fp16_tflops": 1979.0,  # Without sparsity
        "peak_bf16_tflops": 1979.0,  # Without sparsity
        "peak_memory_bw_gbps": 3350,
        "sm_count": 132,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 80,
        "memory_type": "HBM3",
        "form_factor": "SXM5",
        "tdp_w": 700,
    },
    # NVIDIA H100 SKUs - PCIe Variant
    # NOTE: PCIe H100 has fewer SMs and lower clocks than SXM5, hence the
    # lower compute/bandwidth peaks.
    "NVIDIA H100 PCIe 80GB": {
        "name": "NVIDIA H100 PCIe 80GB",
        "architecture": "Hopper",
        "peak_fp32_tflops": 51.0,
        "peak_fp16_tflops": 1513.0,  # Without sparsity
        "peak_bf16_tflops": 1513.0,  # Without sparsity
        "peak_memory_bw_gbps": 2000,
        "sm_count": 114,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 80,
        "memory_type": "HBM2e",
        "form_factor": "PCIe",
        "tdp_w": 350,
    },
    # NVIDIA H100 SKUs - NVL Variant (for LLM inference)
    "NVIDIA H100 NVL 94GB": {
        "name": "NVIDIA H100 NVL 94GB",
        "architecture": "Hopper",
        "peak_fp32_tflops": 60.0,
        "peak_fp16_tflops": 1671.0,  # Without sparsity
        "peak_bf16_tflops": 1671.0,  # Without sparsity
        "peak_memory_bw_gbps": 3900,
        "sm_count": 132,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 94,
        "memory_type": "HBM3",
        "form_factor": "PCIe",
        "tdp_w": 400,
    },
    # NVIDIA RTX 4090
    # NOTE: consumer parts list the same peak for fp32/fp16/bf16 here
    # (non-tensor-core shader throughput) — verify against datasheet if
    # tensor-core peaks are intended.
    "NVIDIA RTX 4090": {
        "name": "NVIDIA RTX 4090",
        "architecture": "Ada Lovelace",
        "peak_fp32_tflops": 82.58,
        "peak_fp16_tflops": 82.58,
        "peak_bf16_tflops": 82.58,
        "peak_memory_bw_gbps": 1008,
        "sm_count": 128,
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 72,
        "memory_gb": 24,
        "memory_type": "GDDR6X",
        "form_factor": "PCIe",
        "tdp_w": 450,
    },
    # NVIDIA RTX 5080
    "NVIDIA RTX 5080": {
        "name": "NVIDIA RTX 5080",
        "architecture": "Blackwell",
        "peak_fp32_tflops": 56.28,
        "peak_fp16_tflops": 56.28,
        "peak_bf16_tflops": 56.28,
        "peak_memory_bw_gbps": 960,
        "sm_count": 84,
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 64,
        "memory_gb": 16,
        "memory_type": "GDDR7",
        "form_factor": "PCIe",
        "tdp_w": 360,
    },
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. GPU specs table: “A100/H100” are multi-SKU; peak BW/TFLOPs + memory size can be wrong under fuzzy match/fallback (e.g., A100 80GB → 40GB). Can we avoid silent A100 fallback and add a match_type/matched_name field (or split common SKUs) |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Examples/docs are now inconsistent with the API: get_gpu_specs() now requires gpu_name: str, but:
PR description example still shows get_gpu_specs() with no args
gpu_specs.py docstring example uses "NVIDIA A100" (no longer a key)
main uses example_gpu = "NVIDIA A100" (also not a key)