diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39e299cb0..3f905d3c8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7714,3 +7714,96 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb300-dynamo-sglang: + # _arm64 variant: GH runner pod doing `enroot import` is amd64, but + # gb300-cw compute nodes are aarch64 (Grace). Without the explicit + # arm64 tag the registry serves the amd64 manifest, which fails to + # exec on the compute side. + image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + # Five disagg topologies from NVIDIA/srt-slurm PR #85 branch + # recipes/dsv4-agg-disagg, overlaid with cw-specific fields by + # launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave (2x 18-node + # racks); recipes set their own sbatch_directives.segment for rack + # pinning. All use NIXL KV transfer. 
+ seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP=4 MXFP4 — low-latency baseline (2 nodes) + - conc-list: [4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-tp4-mxfp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # 1P1D DEP4 mega_moe — TEP disagg (2 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-dep4-mega-moe.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + # 1P2D asymmetric DEP4->DEP8 mega_moe — best per-GPU efficiency (3 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p2d-dep4-to-dep8-mega-moe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 2P2D symmetric DEP8 mega_moe — largest throughput (4 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-dep8-mega-moe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 2P2D TP=8 MXFP4 — TP-only 4-node baseline (4 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-tp8-mxfp4.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ 
b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml new file mode 100644 index 000000000..72baef909 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml @@ -0,0 +1,128 @@ +name: "dsv4-pro-gb300-disagg-1p1d-dep4-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" 
+ NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml new 
file mode 100644 index 000000000..a0b60a00b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml @@ -0,0 +1,86 @@ +name: "dsv4-pro-gb300-disagg-1p1d-tp4-mxfp4-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" + use_chat_template: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml new file mode 100644 index 000000000..569373509 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml @@ -0,0 +1,127 @@ +name: "dsv4-pro-gb300-disagg-1p2d-dep4-to-dep8-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "3" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + 
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 2048 + cuda-graph-max-bs: 2048 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml new file mode 100644 index 000000000..8d82d58cb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml @@ -0,0 +1,126 @@ +name: 
"dsv4-pro-gb300-disagg-2p2d-dep8-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "4" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + 
trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 2048 + cuda-graph-max-bs: 2048 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml new file mode 100644 index 000000000..1b697d826 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml @@ -0,0 +1,98 @@ +name: "dsv4-pro-gb300-disagg-2p2d-tp8-mxfp4-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "4" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" 
+ +resources: + gpu_type: "gb300" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 8192 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 8192 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a29c278f2..0b3b0b04f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1877,7 +1877,7 @@ - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 - + - config-keys: - dsv4-fp4-b200-sglang description: @@ 
-1918,3 +1918,12 @@ - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 (gb300-cw compute nodes are aarch64)" + - "Topologies: five disagg layouts (1P1D TP4, 1P1D DEP4, 1P2D DEP4->DEP8, 2P2D DEP8, 2P2D TP8); MXFP4 / mega-MoE kernels, NIXL KV transfer" + - "Recipes ported from NVIDIA/srt-slurm PR #85 branch recipes/dsv4-agg-disagg (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1169 diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh new file mode 100755 index 000000000..348c436ef --- /dev/null +++ b/runners/gb300-cw-sglang-container-deps.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Custom container-deps installer for gb300-cw + sglang. pip-installs +# dynamo from a wheel + source archive that launch_gb300-cw.sh pre-built +# on /mnt/vast BEFORE submitting sbatch. +# +# Why the prebuild design (mirrors the vllm sibling at +# gb300-cw-vllm-container-deps.sh from PR #1150): +# srt-slurm's per-rank install path runs `maturin build` inside every +# container srtctl srun's. The lmsysorg/sglang:deepseek-v4-grace- +# blackwell_arm64 image lacks rust pre-installed, so the per-rank +# build path can't run; pinning a published dev wheel (1.2.0.dev*) +# trips API drift against the bundled sglang 0.5.9 (compat shim +# warning + disagg startup warmup hang — see runs ending 2026-04-27). +# Building dynamo ONCE from hash 6a159fed (the same commit the gb200 +# vllm recipe pins, known to be sglang-API-stable) on a single-node +# srun in launch_gb300-cw.sh sidesteps both: every rank pip-installs +# from the cache here (~30 s, no contention). 
+# +# Used in tandem with `dynamo.install: false` in the gb300-cw sglang +# recipes so srt-slurm's hardcoded install path is skipped and this +# script is the sole installer. + +set -e + +DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" +CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH" +DONE_MARKER="$CACHE_DIR/.done" + +if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2 + echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" >&2 + exit 1 +fi + +echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR" +pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall + +rm -rf /tmp/dynamo_build +mkdir -p /tmp/dynamo_build/dynamo +tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo +cd /tmp/dynamo_build/dynamo +pip install --break-system-packages -e . + +echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" + +# --- NIXL DSv4 state-buffer patch: sglang PR #23773 -------------------------- +# The disagg recipes use NIXL KV transfer. Without this patch, NIXL +# silently drops auxiliary state buffers (SWA / NSA / Mamba), causing +# decode-side accuracy to collapse on DSv4-Pro. The patch mirrors what +# the Mooncake backend already does. See NVIDIA/srt-slurm PR #85 README. 
+SGLANG_DIR="${SGLANG_DIR:-/sgl-workspace/sglang}" +SGLANG_REMOTE="https://github.com/sgl-project/sglang.git" +SGLANG_PR_NUMBER="23773" +SGLANG_PR_REF="refs/pull/${SGLANG_PR_NUMBER}/head" +SGLANG_LOCAL_BRANCH="nixl-dsv4-pr-${SGLANG_PR_NUMBER}" + +echo "=== Installing SGLang NIXL DSV4 fix from PR #${SGLANG_PR_NUMBER} ===" + +if [ -d "$SGLANG_DIR/.git" ]; then + cd "$SGLANG_DIR" + git config --global --add safe.directory "$SGLANG_DIR" 2>/dev/null || true + if git remote get-url origin >/dev/null 2>&1; then + git remote set-url origin "$SGLANG_REMOTE" + else + git remote add origin "$SGLANG_REMOTE" + fi + git fetch --depth 1 origin "$SGLANG_PR_REF" + git checkout -f -B "$SGLANG_LOCAL_BRANCH" FETCH_HEAD + echo "Checked out SGLang PR #${SGLANG_PR_NUMBER} at $(git rev-parse HEAD)" +else + echo "WARNING: $SGLANG_DIR/.git not found; skipping NIXL patch (container may already include fix)" +fi + +# --- API-drift patch: dynamo 1.1.0 vs sglang 0.5.9 -------------------------- +# ai-dynamo at hash 6a159fed (1.1.0-equivalent) calls +# `engine.async_generate(return_routed_experts=...)`, but the sglang 0.5.9 +# bundled in lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 has an +# Engine.async_generate signature that doesn't accept that kwarg, so every +# request 500s with: +# TypeError: Engine.async_generate() got an unexpected keyword argument +# 'return_routed_experts' +# (See run 24973148979 → mooncake unblocked the disagg warmup; this is the +# next failure layer.) Strip the kwarg from every call site in the +# extracted dynamo source. `pip install -e .` above is editable, so the +# patch propagates immediately at next `python3 -m dynamo.sglang ...`. +DYNAMO_SRC=/tmp/dynamo_build/dynamo +patch_targets=$(grep -rl 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || true) +if [ -n "$patch_targets" ]; then + # Match WHOLE LINES that are just a kwarg pass: + # return_routed_experts=,? 
+ # The value is constrained to a simple identifier ([A-Za-z_][\w.]*), + # which deliberately excludes function calls (no `(` allowed). This + # leaves the multi-line assignment statement at decode_handler.py:275 + # intact: + # return_routed_experts = getattr( + # self.config.server_args, "enable_return_routed_experts", False + # ) + # That assignment is dead code after we strip the kwarg passes, but + # leaving it costs nothing and avoids the syntax-error trap from the + # earlier (over-greedy) version of this patch. + for f in $patch_targets; do + echo "[dynamo-patch] stripping return_routed_experts kwarg lines in $f" + python3 - "$f" <<'PYEOF' +import re, sys +path = sys.argv[1] +with open(path) as fh: + src = fh.read() +# Whole-line kwarg pass: indented `return_routed_experts=,?` then EOL. +# `[A-Za-z_][\w.]*` matches identifiers, attribute access, True/False/None — but NOT calls. +new = re.sub( + r'^[ \t]+return_routed_experts\s*=\s*[A-Za-z_][\w.]*\s*,?[ \t]*\n', + '', + src, + flags=re.MULTILINE, +) +if new != src: + with open(path, 'w') as fh: + fh.write(new) + print(f'[dynamo-patch] patched: {path}') +else: + print(f'[dynamo-patch] no kwarg-pass lines matched in: {path}') +PYEOF + done + # Sanity: any remaining occurrence is fine if it's the assignment; + # log it so the next person knows what's left. + echo "[dynamo-patch] residual occurrences (expected: only the dead assignment in decode_handler.py):" + grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || echo " (none)" +else + echo "[dynamo-patch] no occurrences of return_routed_experts found in $DYNAMO_SRC (already patched or moved upstream)" +fi diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100755 index 000000000..edbb55375 --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,360 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw +# (CoreWeave) cluster. 
Adapted from the dynamo-vllm sibling launcher in +# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). Like +# that script, this one prebuilds the dynamo wheel once per hash and +# overlays gb300-cw-sglang-container-deps.sh as the per-rank installer, +# because the recipes set `dynamo.install: false` (see prebuild below). + +set -x + +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local + # NVMe on cw. SRT_SLURM_MODEL_PREFIX matches the model.path alias in + # benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/. + export MODEL_PATH="/mnt/vast/models/dsv4/" + export SRT_SLURM_MODEL_PREFIX="dsv4-pro" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; account `cw-sup` is +# what `sacctmgr show assoc user=$USER` returns there. `benchmark` +# (inherited from gb200-nv) does not exist on cw. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env +# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so) +# to mount into the container. cw doesn't set them by default — without +# them the container has no libcuda and CUDA init fails. SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / +# pyxis rejects '+' in image paths with "Invalid image format", and the +# old /mnt/vast/squash dir contains '+'-separated files from prior runs. 
+SQUASH_DIR="/mnt/vast/squash_dupe" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting +# the main sbatch. The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 +# image lacks a working ai-dynamo (install: false → ModuleNotFoundError), +# and pinning a published dev wheel (1.2.0.dev*) trips API drift against +# the bundled sglang 0.5.9 (compat shim warns then disagg startup warmup +# hangs — see runs ending 2026-04-27). Building from hash 6a159fed (the +# same commit the gb200 vllm sibling pins, known sglang-API-stable) on +# a single dedicated srun eliminates per-rank coordination on /mnt/vast +# (NFS flock is unreliable). Same pattern as PR #1150's vllm launcher. +DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b" +DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache" +DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH" +DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done" +mkdir -p "$DYNAMO_CACHE_ROOT" + +if [ ! -f "$DYNAMO_DONE_MARKER" ]; then + echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..." + # Build into a unique temp dir, then atomically mv into place. Two + # concurrent runners may both build; the first to finish the rename + # wins, the loser cleans up. Same-directory rename() is atomic on + # NFS (unlike flock). + TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") + # --mem=0: claim full node memory. Default cgroup is much smaller and + # rustc's link phase can OOM otherwise. CARGO_BUILD_JOBS=8 caps + # parallelism so peak rustc memory stays bounded on a 72-core Grace + # node, and `-C debuginfo=0` cuts per-process memory further. 
+ srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ + --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \ + --job-name="${RUNNER_NAME}-prebuild" \ + --container-image="$SQUASH_FILE" \ + --no-container-entrypoint --no-container-mount-home \ + --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ + bash -c " + set -e + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + . \$HOME/.cargo/env + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout $DYNAMO_HASH + cd lib/bindings/python/ + export CARGO_BUILD_JOBS=8 + export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable' + maturin build -o '$TEMP_BUILD' + cd /tmp/dynamo_build/dynamo + tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ + --exclude='lib/bindings/python/target' \ + --exclude='.git' \ + . + touch '$TEMP_BUILD/.done' + " + if [ -f "$TEMP_BUILD/.done" ]; then + # Atomic publish. If another runner already published, mv fails + # and we just discard our copy. + if mv "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then + echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR" + else + echo "[dynamo-prebuild] another runner published first, discarding our copy" + rm -rf "$TEMP_BUILD" + fi + else + echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2 + rm -rf "$TEMP_BUILD" + exit 1 + fi +else + echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR" +fi + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits +# hundreds of jobs. 
+if [[ -z "$CONFIG_FILE" ]]; then
+    echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+    echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+    exit 1
+fi
+
+# Always start from a pristine clone; a checkout left over from a prior
+# run on this runner would mask the recipe overlay below.
+echo "Cloning srt-slurm repository..."
+SRT_REPO_DIR="srt-slurm"
+if [ -d "$SRT_REPO_DIR" ]; then
+    echo "Removing existing $SRT_REPO_DIR..."
+    rm -rf "$SRT_REPO_DIR"
+fi
+
+git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+cd "$SRT_REPO_DIR"
+# NOTE(review): assumes `recipes/dsv4-agg-disagg` is still a live ref on
+# origin (it tracks PR #85) — confirm it hasn't been merged or renamed,
+# since a failed checkout here would leave us on the default branch.
+git checkout recipes/dsv4-agg-disagg
+
+# Overlay our cw-adapted DSv4 SGLang disagg recipes onto the upstream
+# recipes from PR #85. The upstream recipes at
+# recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/ don't carry
+# cw-specific fields (dynamo.install, setup_script, extra_mount,
+# sbatch_directives), so we overlay locally-maintained copies that add
+# those. `cp -rT` replaces the upstream files in place.
+mkdir -p recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp
+cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k" recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp
+
+# Drop our cache-installer setup_script next to upstream's configs.
+# Recipes reference it via `setup_script: gb300-cw-sglang-container-deps.sh`
+# alongside `dynamo.install: false` so srtctl skips its own pip install
+# and this script (force-reinstalling from /mnt/vast/dynamo_cache) is the
+# sole installer per rank.
+cp "$GITHUB_WORKSPACE/runners/gb300-cw-sglang-container-deps.sh" configs/gb300-cw-sglang-container-deps.sh
+chmod +x configs/gb300-cw-sglang-container-deps.sh
+
+echo "Installing srtctl..."
+# CRITICAL — uv install location.
+# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is
+# shared NFS across both. srtctl's slurm template (job_script_minimal.j2)
+# does `if ! 
command -v uv` and skips its own ARM64 install when uv is
+# already on PATH; on compute nodes $HOME/.local/bin is on PATH by
+# default, so a stray x86 binary at $HOME/.local/bin/uv from this
+# runner shadows the template's install and crashes the orchestrator
+# with `cannot execute binary file: Exec format error`. Install to a
+# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86
+# uv left in the shared path by prior runs.
+rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx"
+export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin"
+mkdir -p "$XDG_BIN_HOME"
+curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh
+export PATH="$XDG_BIN_HOME:$PATH"
+
+# Two guards: (1) uv really landed in the pod-local bin dir; (2) nothing
+# leaked onto the shared NFS $HOME path where compute nodes would pick
+# up the wrong-arch binary.
+if [ ! -x "$XDG_BIN_HOME/uv" ]; then
+    echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2
+    exit 1
+fi
+if [ -e "$HOME/.local/bin/uv" ]; then
+    echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2
+    exit 1
+fi
+
+uv venv
+source .venv/bin/activate
+uv pip install -e .
+
+if ! command -v srtctl &> /dev/null; then
+    echo "Error: Failed to install srtctl"
+    exit 1
+fi
+
+echo "Configs available at: $SRT_REPO_DIR/"
+
+SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
+echo "Creating srtslurm.yaml configuration..."
+# NOTE(review): the next line is truncated in this hunk — the heredoc
+# body that populates srtslurm.yaml AND the `srtctl apply ...` command
+# whose combined stdout/stderr is captured into SRTCTL_OUTPUT are
+# missing between `<` and `&1)`. Restore them from the original script
+# before applying this patch; as written this is a syntax error.
+cat > srtslurm.yaml <&1)
+echo "$SRTCTL_OUTPUT"
+
+# Parse the submitted Slurm job id: prefer the decorated "✅ Job NNN"
+# line, fall back to any plain "Job NNN" if the emoji was stripped.
+JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')
+
+set +x
+
+if [ -z "$JOB_ID" ]; then
+    echo "Error: Failed to extract JOB_ID from srtctl output"
+    exit 1
+fi
+
+echo "Extracted JOB_ID: $JOB_ID"
+
+LOGS_DIR="outputs/$JOB_ID/logs"
+LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log"
+
+# Wait for the job to start writing its sweep log; bail out if the job
+# leaves the squeue before the log file ever appears.
+while ! ls "$LOG_FILE" &>/dev/null; do
+    if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
+        echo "ERROR: Job $JOB_ID failed before creating log file"
+        scontrol show job "$JOB_ID"
+        exit 1
+    fi
+    echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..."
+    sleep 5
+done
+
+# Background poller that exits once the job leaves the queue; its PID is
+# handed to `tail --pid` below so the tail terminates with the job.
+(
+    while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do
+        sleep 10
+    done
+) &
+POLL_PID=$!
+
+echo "Tailing LOG_FILE: $LOG_FILE"
+
+# -F follows across rotation/recreation; --pid stops when the poller dies.
+tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+
+wait $POLL_PID
+
+set -x
+
+echo "Job $JOB_ID completed!"
+echo "Collecting results..."
+
+if [ -d "$LOGS_DIR" ]; then
+    echo "Found logs directory: $LOGS_DIR"
+    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
+    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+else
+    echo "Warning: Logs directory not found at $LOGS_DIR"
+fi
+
+if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+    if [ ! -d "$LOGS_DIR" ]; then
+        exit 1
+    fi
+
+    # One subdirectory per benchmarked (isl, osl) sweep configuration.
+    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+
+    if [ -z "$RESULT_SUBDIRS" ]; then
+        echo "Warning: No result subdirectories found in $LOGS_DIR"
+    else
+        for result_subdir in $RESULT_SUBDIRS; do
+            echo "Processing result subdirectory: $result_subdir"
+
+            CONFIG_NAME=$(basename "$result_subdir")
+
+            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+            for result_file in $RESULT_FILES; do
+                if [ -f "$result_file" ]; then
+                    filename=$(basename "$result_file")
+                    # Pull concurrency/gpus/ctx/gen out of the filename,
+                    # results_concurrency_<c>_gpus_<g>_ctx_<i>_gen_<o>.json
+                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                    # NOTE(review): RESULT_FILENAME is defined outside this
+                    # hunk — presumably exported by the workflow env; confirm.
+                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                fi
+            done
+        done
+    fi
+
+    echo "All result files processed"
+else
+    echo "EVAL_ONLY=true: Skipping benchmark result collection"
+fi
+
+if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+    EVAL_DIR="$LOGS_DIR/eval_results"
+    if [ -d "$EVAL_DIR" ]; then
+        echo "Extracting eval results from $EVAL_DIR"
+        # nullglob: an empty dir yields zero iterations, not a literal '*'.
+        shopt -s nullglob
+        for eval_file in "$EVAL_DIR"/*; do
+            [ -f "$eval_file" ] || continue
+            cp "$eval_file" "$GITHUB_WORKSPACE/"
+            echo "Copied eval artifact: $(basename "$eval_file")"
+        done
+        shopt -u nullglob
+    else
+        echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+    fi
+fi