Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,20 @@ run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

# vLLM multi-node DP smoke (CPU). Builds local-ai:tests and the
# cpu-vllm backend from the current working tree, then drives a
# head + headless follower via testcontainers-go and asserts a chat
# completion. BuildKit caches both images, so re-runs only rebuild
# what changed. The test lives under tests/e2e/distributed and is
# selected by the VLLMMultinode label so it doesn't run alongside
# the other distributed-suite tests by default.
#
# Declared .PHONY: this target never produces a file named after
# itself, so without the declaration a stray file called
# `test-e2e-vllm-multinode` would silently make it a no-op.
.PHONY: test-e2e-vllm-multinode
test-e2e-vllm-multinode: docker-build-e2e extract-backend-vllm protogen-go
	@echo 'Running e2e vLLM multi-node DP test'
	LOCALAI_IMAGE=local-ai \
	LOCALAI_IMAGE_TAG=tests \
	LOCALAI_VLLM_BACKEND_DIR=$(abspath ./local-backends/vllm) \
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter='VLLMMultinode' -v -r ./tests/e2e/distributed

########################################################
## E2E tests
########################################################
Expand Down Expand Up @@ -319,7 +333,7 @@ local-backends:

extract-backend-%: docker-build-% local-backends
@echo "Extracting backend $*..."
@CID=$$(docker create local-ai-backend:$*) && \
@CID=$$(docker create --entrypoint=/run.sh local-ai-backend:$*) && \
rm -rf local-backends/$* && mkdir -p local-backends/$* && \
docker cp $$CID:/ - | tar -xf - -C local-backends/$* && \
docker rm $$CID > /dev/null
Expand Down Expand Up @@ -594,6 +608,14 @@ test-extra-backend-vllm: docker-build-vllm
BACKEND_TEST_OPTIONS=tool_parser:hermes \
$(MAKE) test-extra-backend

## vllm multi-node data-parallel smoke test. Runs LocalAI head + a
## `local-ai p2p-worker vllm` follower in docker compose against
## Qwen2.5-0.5B with data_parallel_size=2. Requires 2 NVIDIA GPUs and
## nvidia-container-runtime on the host — vLLM v1's DP coordinator is
## not viable on CPU so this cannot run in CI without GPU.
##
## Declared .PHONY: the target is a command, not a file — without the
## declaration a file named `test-extra-backend-vllm-multinode` in the
## working tree would make `make` consider it up to date and skip it.
.PHONY: test-extra-backend-vllm-multinode
test-extra-backend-vllm-multinode:
	./tests/e2e/vllm-multinode/smoke.sh

## tinygrad mirrors the vllm target (same model, same caps, same parser) so
## the two backends are directly comparable. The LLM path covers Predict,
## streaming and native tool-call extraction. Companion targets below cover
Expand Down
88 changes: 77 additions & 11 deletions backend/python/vllm/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ else
source $backend_dir/../common/libbackend.sh
fi

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
# Intel XPU: torch==2.11.0+xpu lives on the PyTorch XPU index, transitive
# deps on PyPI — unsafe-best-match lets uv mix both. vllm-xpu-kernels only
# ships a python3.12 wheel per upstream docs, so bump the portable Python
# before installRequirements (matches the l4t13 pattern below).
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
PYTHON_VERSION="3.12"
PYTHON_PATCH="11"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# CPU builds need unsafe-best-match to pull torch==2.10.0+cpu from the
Expand All @@ -42,27 +45,90 @@ fi

# JetPack 7 / L4T arm64 wheels (torch, vllm, flash-attn) live on
# pypi.jetson-ai-lab.io and are built for cp312, so bump the venv Python
# accordingly. JetPack 6 keeps cp310 + USE_PIP=true. unsafe-best-match
# is required because the jetson-ai-lab index lists transitive deps at
# limited versions — without it uv pins to the first matching index and
# fails to resolve a compatible wheel from PyPI.
# accordingly. JetPack 6 keeps cp310 + USE_PIP=true.
#
# l4t13 uses pyproject.toml (see the elif branch below) to pin only the
# L4T-specific wheels to the jetson-ai-lab index via [tool.uv.sources].
# That keeps PyPI as the resolution path for transitive deps like
# anthropic/openai/propcache, which the L4T mirror's proxy 503s on.
if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
USE_PIP=true
fi
if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
PYTHON_VERSION="3.12"
PYTHON_PATCH="12"
PY_STANDALONE_TAG="20251120"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# Intel XPU has no upstream-published vllm wheels, so we always build vllm
# from source against torch-xpu and replace the default triton with
# triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_TYPE}" == "xintel" ]; then
# Hide requirements-intel-after.txt so installRequirements doesn't
# try `pip install vllm` (would either fail or grab a non-XPU wheel).
_intel_after="${backend_dir}/requirements-intel-after.txt"
_intel_after_bak=""
if [ -f "${_intel_after}" ]; then
_intel_after_bak="${_intel_after}.xpu.bak"
mv "${_intel_after}" "${_intel_after_bak}"
fi
installRequirements
if [ -n "${_intel_after_bak}" ]; then
mv "${_intel_after_bak}" "${_intel_after}"
fi

# vllm's CMake build needs the Intel oneAPI dpcpp/sycl compiler — the
# base image (intel/oneapi-basekit) has it but the env isn't sourced.
if [ -f /opt/intel/oneapi/setvars.sh ]; then
set +u
source /opt/intel/oneapi/setvars.sh --force
set -u
fi

_vllm_src=$(mktemp -d)
trap 'rm -rf "${_vllm_src}"' EXIT
git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
pushd "${_vllm_src}/vllm"
# Install vllm's own runtime deps (torch-xpu, vllm_xpu_kernels,
# pydantic, fastapi, …) from upstream's requirements/xpu.txt — the
# canonical source of truth. Avoids re-pinning everything ourselves.
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} -r requirements/xpu.txt
# Stock triton (NVIDIA-only) may have come in transitively; replace
# with triton-xpu==3.7.0 which matches torch 2.11.
uv pip uninstall triton triton-xpu 2>/dev/null || true
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
--extra-index-url https://download.pytorch.org/whl/xpu \
triton-xpu==3.7.0
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
popd
# L4T arm64 (JetPack 7): drive the install through pyproject.toml so that
# [tool.uv.sources] can pin torch/vllm/flash-attn/torchvision/torchaudio
# to the jetson-ai-lab index, while everything else (transitive deps and
# PyPI-resolvable packages like transformers) comes from PyPI. Bypasses
# installRequirements because uv pip install -r requirements.txt does not
# honor sources — see backend/python/vllm/pyproject.toml for the rationale.
elif [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
ensureVenv
if [ "x${PORTABLE_PYTHON}" == "xtrue" ]; then
export C_INCLUDE_PATH="${C_INCLUDE_PATH:-}:$(_portable_dir)/include/python${PYTHON_VERSION}"
fi
pushd "${backend_dir}"
# Build deps first (matches installRequirements' requirements-install.txt
# pass — fastsafetensors and friends need pybind11 in the venv before
# their sdists can build under --no-build-isolation).
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} -r requirements-install.txt
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --requirement pyproject.toml
popd
runProtogen
# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
# requirements-cpu-after.txt and compiles vllm locally against the host's
# actual CPU. Not used by default because it takes ~30-40 minutes, but
# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a
# bigger-runner with compatible hardware instead.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
elif [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
# Temporarily hide the prebuilt wheel so installRequirements doesn't
# pull it — the rest of the requirements files (base deps, torch,
# transformers) are still installed normally.
Expand Down
104 changes: 104 additions & 0 deletions backend/python/vllm/package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,109 @@ copy_with_symlinks() {
copy_with_symlinks libnuma.so.1
copy_with_symlinks libgomp.so.1

# CPU profile only: bundle a g++ toolchain so torch._inductor's
# ISA probe (always run at vllm engine startup, regardless of
# enforce_eager) finds a C++ compiler. The LocalAI runtime image
# is FROM ubuntu:24.04 with a minimal apt list that does not
# include build-essential, and the backend image itself is FROM
# scratch -- so without this, cpu-vllm crashes with
# torch._inductor.exc.InvalidCxxCompiler at first inference
# unless the operator manually sets TORCH_COMPILE_DISABLE=1.
#
# We snapshot every file owned by the toolchain packages, mirroring
# the /usr/... layout into ${BACKEND}/toolchain/ so g++ can find
# cc1plus, headers, libs etc. via GCC_EXEC_PREFIX / CPATH /
# LIBRARY_PATH at runtime (libbackend.sh wires those up). Adds
# ~400 MB to the cpu-vllm image, which is tolerable -- cpu-vllm is
# already a niche profile.
if [ "${BUILD_TYPE:-}" = "" ] && command -v dpkg-query >/dev/null 2>&1; then
    TOOLCHAIN_DIR="${CURDIR}/toolchain"
    mkdir -p "${TOOLCHAIN_DIR}"
    # The unversioned g++/gcc packages on Debian/Ubuntu only ship
    # symlinks; the actual binaries live in g++-${VER}/gcc-${VER}.
    # Discover the active version so the symlink targets get bundled
    # along with their owners.
    GCC_VER=$(gcc -dumpversion 2>/dev/null | cut -d. -f1 || true)
    # `g++-${VER}` itself is just another symlink layer on Debian/
    # Ubuntu — the real binary `x86_64-linux-gnu-g++-${VER}` lives
    # in `g++-${VER}-x86-64-linux-gnu` (a separate package pulled in
    # as a dependency). Same story for gcc/cpp. Compute the dpkg
    # arch-triplet to find the right package name for both amd64 and
    # arm64 hosts.
    case "$(dpkg --print-architecture 2>/dev/null)" in
        amd64) HOST_TRIPLET="x86-64-linux-gnu" ;;
        arm64) HOST_TRIPLET="aarch64-linux-gnu" ;;
        *) HOST_TRIPLET="" ;;
    esac
    PKGS=(g++ gcc cpp libstdc++-${GCC_VER}-dev libgcc-${GCC_VER}-dev libc6 libc6-dev binutils binutils-common libbinutils libc-dev-bin linux-libc-dev libcrypt-dev libgomp1 libstdc++6 libgcc-s1 libisl23 libmpc3 libmpfr6 libjansson4 libctf0 libctf-nobfd0 libsframe1)
    if [ -n "${GCC_VER}" ]; then
        PKGS+=("g++-${GCC_VER}" "gcc-${GCC_VER}" "cpp-${GCC_VER}" "gcc-${GCC_VER}-base")
        if [ -n "${HOST_TRIPLET}" ]; then
            PKGS+=(
                "g++-${GCC_VER}-${HOST_TRIPLET}"
                "gcc-${GCC_VER}-${HOST_TRIPLET}"
                "cpp-${GCC_VER}-${HOST_TRIPLET}"
                "binutils-${HOST_TRIPLET}"
            )
        fi
    fi
    for pkg in "${PKGS[@]}"; do
        # Packages not present on this builder (e.g. the other
        # architecture's triplet-suffixed names) are skipped quietly.
        if ! dpkg-query -W "${pkg}" >/dev/null 2>&1; then
            continue
        fi
        # Copy each owned path, preserving symlinks and mode. We
        # tolerate dpkg listing directories alongside files.
        # NOTE(review): the `while` body runs in a pipeline subshell —
        # harmless here since it only copies files and sets no
        # variables read afterwards.
        dpkg -L "${pkg}" | while IFS= read -r path; do
            if [ -L "${path}" ] || [ -f "${path}" ]; then
                mkdir -p "${TOOLCHAIN_DIR}$(dirname "${path}")"
                cp -aP "${path}" "${TOOLCHAIN_DIR}${path}" 2>/dev/null || true
            fi
        done
    done
    # Ubuntu's filesystem layout has /lib -> /usr/lib (UsrMerge) and
    # /lib64 -> /usr/lib64. ld scripts (e.g. libm.so) hardcode
    # `/lib/x86_64-linux-gnu/libm.so.6`; with --sysroot the linker
    # looks for that path under the sysroot, which means we need
    # the same symlinks under TOOLCHAIN_DIR.
    [ -e "${TOOLCHAIN_DIR}/lib" ] || ln -s usr/lib "${TOOLCHAIN_DIR}/lib"
    [ -e "${TOOLCHAIN_DIR}/lib64" ] || ln -s usr/lib64 "${TOOLCHAIN_DIR}/lib64"

    # Replace the unversioned g++/gcc/cpp symlinks with wrapper
    # scripts that pass --sysroot=<toolchain> and -B <gcc-exec-prefix>.
    # Without these flags gcc would fall back to its compiled-in
    # /usr search and fail to find headers (the runtime image has no
    # libc6-dev) or fail to invoke `as`/`ld` (binutils not on PATH at
    # /usr/bin). Wrappers self-resolve their location at runtime so
    # they work from any BackendsPath.
    BIN_DIR="${TOOLCHAIN_DIR}/usr/bin"
    if [ -n "${GCC_VER}" ] && [ -n "${HOST_TRIPLET}" ]; then
        # HOST_TRIPLET in package names uses dashes ("x86-64-linux-gnu");
        # the binary suffix uses underscores in the arch part
        # ("x86_64-linux-gnu-g++-13"). Translate.
        BIN_TRIPLET=${HOST_TRIPLET//x86-64/x86_64}
        for tool in g++ gcc cpp; do
            real="${BIN_DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}"
            if [ -x "${real}" ]; then
                rm -f "${BIN_DIR}/${tool}" "${BIN_DIR}/${tool}-${GCC_VER}"
                # Unquoted EOF is deliberate: ${BIN_TRIPLET}/${GCC_VER}
                # expand NOW (bake the versioned names in), while the
                # escaped \$ forms expand when the wrapper runs.
                cat > "${BIN_DIR}/${tool}" <<EOF
#!/bin/bash
# Auto-generated by package.sh. Passes --sysroot and -B so the
# bundled toolchain works from any BackendsPath without depending
# on libc6-dev / binutils being installed at /usr in the runtime
# image. See backend/python/vllm/package.sh.
DIR="\$(dirname "\$(readlink -f "\$0")")" # …/toolchain/usr/bin
SYSROOT="\$(dirname "\$(dirname "\${DIR}")")" # …/toolchain
exec "\${DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}" \\
    -B "\${SYSROOT}/usr/lib/gcc/${BIN_TRIPLET}/${GCC_VER}/" \\
    --sysroot="\${SYSROOT}" \\
    "\$@"
EOF
                chmod +x "${BIN_DIR}/${tool}"
            fi
        done
    fi
    echo "Bundled g++ toolchain (gcc-${GCC_VER}) into ${TOOLCHAIN_DIR} ($(du -sh "${TOOLCHAIN_DIR}" | cut -f1))"
fi

echo "vllm packaging completed successfully"
ls -liah "${LIB_DIR}/"
61 changes: 61 additions & 0 deletions backend/python/vllm/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# L4T arm64 (JetPack 7 / sbsa cu130) install spec for the vllm backend.
#
# Why this file exists, and why only the l4t13 BUILD_PROFILE consumes it:
#
# pypi.jetson-ai-lab.io hosts the L4T-specific torch / vllm / flash-attn
# wheels we need on aarch64 + cuda13, but it ALSO transparently proxies the
# rest of PyPI through `/+f/<sha>/<filename>` URLs that 503 frequently. With
# `--extra-index-url` + `--index-strategy=unsafe-best-match` (the historical
# fix in install.sh) uv would pick those proxy URLs for ordinary PyPI
# packages — `anthropic`, `openai`, `propcache`, `annotated-types` — and
# trip on the 503s. See e.g. CI run 25212201349 (anthropic-0.97.0).
#
# `explicit = true` on the index makes uv consult the L4T mirror ONLY for
# packages mapped under [tool.uv.sources]. Everything else goes to PyPI.
# This breaks the historical 503 path without losing access to the L4T
# wheels we actually need from there.
#
# `uv pip install -r requirements.txt` does NOT honor [tool.uv.sources]
# (sources are project-mode only, not pip-compat mode), so install.sh's
# l4t13 branch invokes `uv pip install --requirement pyproject.toml`
# directly. Other BUILD_PROFILEs continue to use the requirements-*.txt
# pipeline through libbackend.sh's installRequirements and never read
# this file.
[project]
name = "localai-vllm-l4t13"
version = "0.0.0"
requires-python = ">=3.12,<3.13"
dependencies = [
# Mirror of requirements.txt — kept in sync manually for now since the
# l4t13 path bypasses installRequirements (see install.sh).
"grpcio==1.80.0",
"protobuf",
"certifi",
"setuptools",
"pillow",
"charset-normalizer>=3.4.0",
"chardet",
# L4T-specific accelerator stack (sourced from jetson-ai-lab below).
"torch",
"torchvision",
"torchaudio",
"flash-attn",
"vllm",
# PyPI-resolvable packages that complete the runtime — accelerate,
# transformers, bitsandbytes carry their own wheels for aarch64.
"accelerate",
"transformers",
"bitsandbytes",
]

[[tool.uv.index]]
name = "jetson-ai-lab"
url = "https://pypi.jetson-ai-lab.io/sbsa/cu130"
explicit = true

[tool.uv.sources]
torch = { index = "jetson-ai-lab" }
torchvision = { index = "jetson-ai-lab" }
torchaudio = { index = "jetson-ai-lab" }
flash-attn = { index = "jetson-ai-lab" }
vllm = { index = "jetson-ai-lab" }
4 changes: 3 additions & 1 deletion backend/python/vllm/requirements-intel-after.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
vllm
# Intel XPU has no upstream-published vllm wheels — install.sh builds vllm
# from source with VLLM_TARGET_DEVICE=xpu and hides this file during
# installRequirements. Don't add a `vllm` line here.
7 changes: 4 additions & 3 deletions backend/python/vllm/requirements-intel.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/xpu
# vllm's own deps (torch==2.11.0+xpu, vllm_xpu_kernels, pydantic, …) are
# installed from upstream's requirements/xpu.txt during the source build —
# see install.sh. Only list what LocalAI's vllm backend.py needs directly.
accelerate
torch
transformers
optimum[openvino]
bitsandbytes
setuptools
bitsandbytes
2 changes: 0 additions & 2 deletions backend/python/vllm/requirements-l4t13-after.txt

This file was deleted.

8 changes: 0 additions & 8 deletions backend/python/vllm/requirements-l4t13.txt

This file was deleted.

5 changes: 4 additions & 1 deletion backend/python/vllm/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
grpcio==1.80.0
protobuf
certifi
setuptools
setuptools
pillow
charset-normalizer>=3.4.0
chardet
Loading
Loading