Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 27 additions & 26 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
ARG CUDA_VERSION=12.8.0
ARG CUDA_VERSION=13.0.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG VLLM_VERSION=0.16.0
ARG VLLM_VERSION=0.21.0
ARG NIXL_REF=v1.1.0
ARG FLASH_MLA_REF=47c35a7
ARG DEEPGEMM_REF=891d57b4db1071624b5c8fa0d1e51cb317fa709f
ARG TARGETPLATFORM
ARG ENABLE_DEEPEP=1
ARG ENABLE_NIXL=1
ARG ENABLE_CACHE=1
ARG ENABLE_SM100=0

ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda
Expand Down Expand Up @@ -44,13 +47,18 @@ WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN pip install -U pip
RUN pip install -r /lightllm/requirements.txt --no-cache-dir
RUN pip install --no-cache-dir vllm==${VLLM_VERSION}
RUN git clone https://github.com/deepseek-ai/FlashMLA.git /root/FlashMLA && \
RUN pip install --no-cache-dir \
--extra-index-url https://download.pytorch.org/whl/cu130 \
vllm==${VLLM_VERSION}
RUN pip install -r /lightllm/requirements.txt --no-cache-dir \
--extra-index-url https://download.pytorch.org/whl/cu130
RUN export CPATH=/usr/local/cuda/targets/x86_64-linux/include/cccl:/usr/local/cuda/targets/x86_64-linux/include${CPATH:+:${CPATH}} && \
git clone https://github.com/deepseek-ai/FlashMLA.git /root/FlashMLA && \
cd /root/FlashMLA && \
git checkout ${FLASH_MLA_REF} && \
git submodule update --init --recursive && \
FLASH_MLA_DISABLE_SM100=1 pip install --no-cache-dir .
FLASH_MLA_DISABLE_SM100="$(if [ "${ENABLE_SM100}" = "1" ]; then echo 0; else echo 1; fi)" \
pip install --no-cache-dir .

# Install the libnuma development package. The apt package lists are removed
# in the same RUN so they are not persisted in this image layer.
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*

Expand Down Expand Up @@ -78,27 +86,20 @@ RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
set -e; \
ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \
NVSHMEM_VERSION=3.3.9; \
CUDA_ARCHS=90; \
wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
&& tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \
&& cd nvshmem \
&& rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
&& NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \
&& cmake --build build --target install -j64; \
DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \
cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \
cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
python -m pip install --upgrade --no-deps \
"nvidia-nccl-cu13==2.30.4" \
"nvidia-nvshmem-cu13==3.6.5"; \
cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout b306af06afd412c88e51e71802951606e40b7358; \
ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so.3 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so; \
ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so.2 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so; \
pip install --no-build-isolation .; \
fi

# Build and install DeepGEMM from source at the revision given by DEEPGEMM_REF,
# including its git submodules. --no-build-isolation makes pip build against the
# packages already installed in the image instead of an isolated build environment.
RUN cd /root && git clone https://github.com/deepseek-ai/DeepGEMM.git && \
cd DeepGEMM && git checkout ${DEEPGEMM_REF} && \
git submodule update --init --recursive && \
pip install --no-build-isolation .

RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
Expand Down Expand Up @@ -126,7 +127,7 @@ RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
apt-get update && apt-get install -y pkg-config tmux net-tools && \
cd /usr/local/src; \
pip install --upgrade meson pybind11 patchelf; \
git clone https://github.com/ai-dynamo/nixl.git -b main && \
git clone https://github.com/ai-dynamo/nixl.git -b ${NIXL_REF} && \
cd nixl && \
rm -rf build && \
mkdir build && \
Expand Down
14 changes: 10 additions & 4 deletions docker/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,23 @@ set -euo pipefail
# --no-nixl Disable NIXL (default: enabled)
# --no-cache Disable cache (default: enabled)
# --lite Disable DEEPEP, NIXL and cache in one shot
# --cuda-version <ver> CUDA version (default: 12.8.0)
# --cuda-version <ver> CUDA version (default: 13.0.0)
# --image-prefix <name> Image prefix (default: lightllm)
# --image-tag <tag> Image tag (default: generated from enabled features)
# --enable-sm100 Enable SM100 support (default: disabled)
# -h / --help Show help

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "${ROOT_DIR}"

IMAGE_PREFIX="${IMAGE_PREFIX:-lightllm}"
CUDA_VERSION="${CUDA_VERSION:-12.8.0}"
CUDA_VERSION="${CUDA_VERSION:-13.0.0}"
IMAGE_TAG="${IMAGE_TAG:-}"

ENABLE_DEEPEP="${ENABLE_DEEPEP:-1}"
ENABLE_NIXL="${ENABLE_NIXL:-1}"
ENABLE_CACHE="${ENABLE_CACHE:-1}"
ENABLE_SM100="${ENABLE_SM100:-0}"

print_help() {
sed -n '1,80p' "$0" | sed 's/^# \{0,1\}//'
Expand All @@ -43,6 +45,7 @@ while [[ $# -gt 0 ]]; do
--no-deepep) ENABLE_DEEPEP=0 ;;
--no-nixl) ENABLE_NIXL=0 ;;
--no-cache) ENABLE_CACHE=0 ;;
--enable-sm100) ENABLE_SM100=1 ;;
--lite)
ENABLE_DEEPEP=0
ENABLE_NIXL=0
Expand Down Expand Up @@ -78,13 +81,16 @@ done
# - Other combos: composed from enabled feature names
if [[ -z "${IMAGE_TAG}" ]]; then
tag_parts=()
if [[ "${ENABLE_SM100}" -eq 1 ]]; then
tag_parts+=("sm100")
fi
if [[ "${ENABLE_NIXL}" -eq 1 ]]; then
tag_parts+=("nixl")
fi
if [[ "${ENABLE_DEEPEP}" -eq 1 ]]; then
tag_parts+=("deepep")
fi
if [[ "${ENABLE_NIXL}" -eq 1 && "${ENABLE_DEEPEP}" -eq 1 && "${ENABLE_CACHE}" -eq 1 ]]; then
if [[ "${ENABLE_SM100}" -eq 0 && "${ENABLE_NIXL}" -eq 1 && "${ENABLE_DEEPEP}" -eq 1 && "${ENABLE_CACHE}" -eq 1 ]]; then
IMAGE_TAG="cuda${CUDA_VERSION}"
else
prefix=""
Expand All @@ -100,6 +106,6 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile \
--build-arg ENABLE_DEEPEP="${ENABLE_DEEPEP}" \
--build-arg ENABLE_NIXL="${ENABLE_NIXL}" \
--build-arg ENABLE_CACHE="${ENABLE_CACHE}" \
--build-arg ENABLE_SM100="${ENABLE_SM100}" \
--progress=plain \
-t "${IMAGE_PREFIX}:${IMAGE_TAG}" .

8 changes: 8 additions & 0 deletions docs/CN/source/tutorial/api_server_args.rst
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,14 @@ PD 分离模式参数

示例可以在 test/advanced_config/mixed_quantization/llamacls-mix-down.yaml 中找到。

.. option:: --expert_dtype

EP MoE 专家量化类型,可选值:

* ``fp8``
* ``fp4``,仅支持 SM100 GPU
* ``None`` (默认)

.. option:: --vit_quant_type

ViT 量化方法,可选值:
Expand Down
8 changes: 8 additions & 0 deletions docs/EN/source/tutorial/api_server_args.rst
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,14 @@ Quantization Parameters

Examples can be found in test/advanced_config/mixed_quantization/llamacls-mix-down.yaml.

.. option:: --expert_dtype

Expert quantization dtype for EP MoE, optional values:

* ``fp8``
* ``fp4``: SM100 GPUs only
* ``None`` (default)

.. option:: --vit_quant_type

ViT quantization method, optional values:
Expand Down
3 changes: 2 additions & 1 deletion lightllm/common/basemodel/basemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(self, kvargs):
self.disable_cudagraph = kvargs.get("disable_cudagraph", False)
self.quant_type = kvargs.get("quant_type", "none")
self.quant_cfg_path = kvargs.get("quant_cfg", None)
self.expert_dtype = kvargs.get("expert_dtype", None)
self.mem_fraction = kvargs.get("mem_fraction", 0.9)
self.tp_world_size_ = get_dp_world_size()
self.enable_tpsp_mix_mode = get_env_start_args().enable_tpsp_mix_mode
Expand Down Expand Up @@ -156,7 +157,7 @@ def _verify_params(self):
return

def _init_quant(self):
self.quant_cfg = Quantcfg(self.config, self.quant_type, self.quant_cfg_path)
self.quant_cfg = Quantcfg(self.config, self.quant_type, self.quant_cfg_path, self.expert_dtype)
logger.info(f"Initial quantization. " f"The default quantization method is {self.quant_cfg.quant_type}")

def _init_weights(self, start_layer_index=0):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class BufNode:
inner_tensor: torch.Tensor
shape_key: Tuple[int, torch.dtype]
storage_weak_ptr: int
free_use_count_bias: int = 0
shape_to_tensor: Dict[Union[torch.Size, Iterable[int]], torch.Tensor] = field(default_factory=dict)

def __del__(self):
Expand Down Expand Up @@ -99,7 +100,8 @@ def alloc_tensor(
# 回收可能消亡的 tensor
for ptr in self.changed_ptr:
t_buf_node = self.ptr_to_bufnode[ptr]
if self.use_count(ptr) == 1 + len(t_buf_node.shape_to_tensor):
free_use_count = t_buf_node.free_use_count_bias + 1 + len(t_buf_node.shape_to_tensor)
if self.use_count(ptr) <= free_use_count:
self.free_shape_dtype_to_bufs[t_buf_node.shape_key].append(t_buf_node)
self.changed_ptr.clear()

Expand Down Expand Up @@ -131,6 +133,7 @@ def alloc_tensor(
self.ptr_to_bufnode[storage_weak_ptr] = buf_node
if shape not in buf_node.shape_to_tensor:
buf_node.shape_to_tensor[shape] = buf_node.inner_tensor.view(shape)
buf_node.free_use_count_bias = self.use_count(storage_weak_ptr) - (1 + len(buf_node.shape_to_tensor))
mark_tensor = buf_node.shape_to_tensor[shape]
ans = mark_tensor.data # 返回一个新的引用, 否则引用计数会无法判断
ans.storage_weak_ptr = buf_node.storage_weak_ptr
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
from lightllm.distributed import dist_group_manager
from lightllm.common.triton_utils.autotuner import Autotuner
from lightllm.common.quantization.quantize_method import WeightPack
from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
from lightllm.utils.envs_utils import (
get_deepep_num_max_dispatch_tokens_per_rank_prefill,
get_deepep_num_max_dispatch_tokens_per_rank_decode,
)
from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe_ep import (
fused_experts_impl,
fused_experts,
get_ep_num_sms,
masked_group_gemm,
_deepgemm_grouped_fp8_nt_contiguous,
deepgemm_grouped_fp8_nt_contiguous,
quantize_fused_experts_input,
)
from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import (
per_token_group_quant_fp8,
Expand Down Expand Up @@ -72,23 +77,15 @@ def _fused_experts(
router_logits: Optional[torch.Tensor] = None,
is_prefill: Optional[bool] = None,
):
w13_weight, w13_scale = w13.weight, w13.weight_scale
w2_weight, w2_scale = w2.weight, w2.weight_scale
use_fp8_w8a8 = self.quant_method.method_name != "none"
output = fused_experts_impl(
output = fused_experts(
hidden_states=input_tensor,
w1=w13_weight,
w2=w2_weight,
w13=w13,
w2=w2,
topk_weights=topk_weights,
topk_idx=topk_ids.to(torch.long),
num_experts=self.total_expert_num_contain_redundancy, # number of all experts contain redundancy
buffer=dist_group_manager.ep_buffer,
quant_method=self.quant_method,
is_prefill=is_prefill,
use_fp8_w8a8=use_fp8_w8a8,
use_fp8_all2all=use_fp8_w8a8,
use_int8_w8a16=False, # default to False
w1_scale=w13_scale,
w2_scale=w2_scale,
previous_event=None, # for overlap
)
return output
Expand Down Expand Up @@ -118,13 +115,13 @@ def low_latency_dispatch(
)

topk_idx = topk_idx.to(torch.long)
num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank()
num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank_decode()
use_fp8_w8a8 = self.quant_method.method_name != "none"
recv_x, masked_m, handle, event, hook = dist_group_manager.ep_buffer.low_latency_dispatch(
hidden_states,
topk_idx,
num_max_dispatch_tokens_per_rank,
self.total_expert_num_contain_redundancy,
recv_x, masked_m, handle, event, hook = dist_group_manager.ep_low_latency_buffer.low_latency_dispatch(
topk_idx=topk_idx,
x=hidden_states,
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank,
num_experts=self.total_expert_num_contain_redundancy,
use_fp8=use_fp8_w8a8,
async_finish=False,
return_recv_hook=True,
Expand Down Expand Up @@ -155,13 +152,8 @@ def select_experts_and_quant_input(
num_expert_group=n_group,
scoring_func=scoring_func,
)
w13_weight, w13_scale = w13.weight, w13.weight_scale
block_size_k = 0
if w13_weight.ndim == 3:
block_size_k = w13_weight.shape[2] // w13_scale.shape[2]
assert block_size_k == 128, "block_size_k must be 128"
qinput_tensor, input_scale = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w13_weight.dtype)
return topk_weights, topk_idx.to(torch.long), (qinput_tensor, input_scale)
qinput_tensor = quantize_fused_experts_input(hidden_states, w13, self.quant_method)
return topk_weights, topk_idx.to(torch.long), qinput_tensor

def dispatch(
self,
Expand All @@ -171,38 +163,26 @@ def dispatch(
overlap_event: Optional[Any] = None,
):
buffer = dist_group_manager.ep_buffer
# get_dispatch_layout
(
num_tokens_per_rank,
num_tokens_per_rdma_rank,
num_tokens_per_expert,
is_token_in_rank,
previous_event,
) = buffer.get_dispatch_layout(
topk_idx,
self.total_expert_num_contain_redundancy,
previous_event=overlap_event,
async_finish=True,
allocate_on_comm_stream=True,
)
recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = buffer.dispatch(
num_max_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank_prefill()
recv_x, recv_topk_idx, recv_topk_weights, handle, event = buffer.dispatch(
qinput_tensor,
topk_idx=topk_idx,
topk_weights=topk_weights,
num_tokens_per_rank=num_tokens_per_rank,
num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
is_token_in_rank=is_token_in_rank,
num_tokens_per_expert=num_tokens_per_expert,
previous_event=previous_event,
async_finish=True,
allocate_on_comm_stream=True,
num_experts=self.total_expert_num_contain_redundancy,
num_max_tokens_per_rank=num_max_tokens_per_rank,
expert_alignment=128,
num_sms=get_ep_num_sms(),
previous_event=overlap_event,
async_with_compute_stream=True,
allocate_on_comm_stream=True,
do_cpu_sync=True,
do_handle_copy=False,
)

def hook():
event.current_stream_wait()

return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, hook
return recv_x, recv_topk_idx, recv_topk_weights, handle.num_recv_tokens_per_expert_list, handle, hook

def masked_group_gemm(
self,
Expand Down Expand Up @@ -281,7 +261,7 @@ def prefilled_group_gemm(
# groupgemm (contiguous layout)
gemm_out_a = torch.empty((all_tokens, N), device=device, dtype=hidden_dtype)

_deepgemm_grouped_fp8_nt_contiguous(input_tensor, (w13_weight, w13_scale), gemm_out_a, m_indices)
deepgemm_grouped_fp8_nt_contiguous(input_tensor, (w13_weight, w13_scale), gemm_out_a, m_indices)

# silu_and_mul_fwd + quant
# TODO fused kernel
Expand All @@ -295,7 +275,7 @@ def prefilled_group_gemm(
# groupgemm (contiguous layout)
gemm_out_b = torch.empty((all_tokens, K), device=device, dtype=hidden_dtype)

_deepgemm_grouped_fp8_nt_contiguous(
deepgemm_grouped_fp8_nt_contiguous(
(qsilu_out, qsilu_out_scale), (w2_weight, w2_scale), gemm_out_b, m_indices
)
# gather and local reduce
Expand All @@ -319,7 +299,7 @@ def low_latency_combine(
topk_weights: torch.Tensor,
handle: Any,
):
combined_x, event_overlap, hook = dist_group_manager.ep_buffer.low_latency_combine(
combined_x, event_overlap, hook = dist_group_manager.ep_low_latency_buffer.low_latency_combine(
gemm_out_b, topk_idx, topk_weights, handle, async_finish=False, return_recv_hook=True
)
return combined_x, hook
Expand All @@ -335,8 +315,9 @@ def combine(
gemm_out_b,
handle,
topk_weights=None,
async_finish=True,
num_sms=get_ep_num_sms(),
previous_event=overlap_event,
async_with_compute_stream=True,
allocate_on_comm_stream=True,
)

Expand Down
Loading
Loading