From 6bb2edb6bf7b5510124926a51be276a05b1c2479 Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Thu, 13 Nov 2025 20:18:35 +0200 Subject: [PATCH 01/12] Extending serving multiple AI models guide with an example of how to serve multiple LoRAs (many LoRAs per one model while having multiple models) --- .../bbr-example/httproute_bbr_lora.yaml | 93 ++++++ config/manifests/vllm/sim-deployment-1.yaml | 44 +++ .../guides/serve-multiple-genai-models.md | 301 +++++++++++++++++- 3 files changed, 435 insertions(+), 3 deletions(-) create mode 100644 config/manifests/bbr-example/httproute_bbr_lora.yaml create mode 100644 config/manifests/vllm/sim-deployment-1.yaml diff --git a/config/manifests/bbr-example/httproute_bbr_lora.yaml b/config/manifests/bbr-example/httproute_bbr_lora.yaml new file mode 100644 index 0000000000..9578597875 --- /dev/null +++ b/config/manifests/bbr-example/httproute_bbr_lora.yaml @@ -0,0 +1,93 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-llama-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'meta-llama/Llama-3.1-8B-Instruct' + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-deepseek-route #give this HTTPRoute any name that helps you to group and track the matchers +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-deepseek-r1 + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'deepseek/vllm-deepseek-r1' + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'food-review' + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. 
+ name: X-Gateway-Model-Name + value: 'movie-critique' + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vllm-llama3-8b-instruct-lora-food-review-1 #give this HTTPRoute any name that helps you to group and track the routes +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'food-review-1' #this is the name of LoRA as defined in vLLM deployment + timeouts: + request: 300s diff --git a/config/manifests/vllm/sim-deployment-1.yaml b/config/manifests/vllm/sim-deployment-1.yaml new file mode 100644 index 0000000000..9798077caf --- /dev/null +++ b/config/manifests/vllm/sim-deployment-1.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-deepseek-r1 +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-deepseek-r1 + template: + metadata: + labels: + app: vllm-deepseek-r1 + spec: + containers: + - name: vllm-sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.4.0 + imagePullPolicy: Always + args: + - --model + - deepseek/vllm-deepseek-r1 + - --port + - "8000" + - --max-loras + - "2" + - --lora-modules + - '{"name": "food-review"}' + - '{"name": "movie-critique"}' + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - containerPort: 8000 + name: http + protocol: TCP + resources: + requests: + cpu: 10m diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index c94ff1a772..cab2391fa2 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -1,9 +1,11 @@ -# Serve multiple generative AI models +# Serve multiple generative AI models and multiple LoRAs for the base AI models A company wants to deploy multiple large language models (LLMs) to a cluster to serve different workloads. For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application (or as in the example in this guide, a combination of a Llama3 model and a smaller Phi4 model).. You may choose to locate these 2 models at 2 different L7 url paths and follow the steps described in the [`Getting started`](index.md) guide for each such model as already described. However you may also need to serve multiple models located at the same L7 url path and rely on parsing information such as the Model name in the LLM prompt requests as defined in the OpenAI API format which is commonly used by most models. For such Model-aware routing, you can use the Body-Based Routing feature as described in this guide. +In addition, for each base AI model multiple [Low Rank Adaptaions (LoRAs)](https://www.ibm.com/think/topics/lora) can be defined. LoRAs defined for the same base AI model are served from the same backend inference server that serves the base model. A LoRA name is specified as the Model name in the body of LLM prompt requests. LoRA naming is not standardised. 
Therefore, it cannot be expected that the base model name can be inferred from the LoRA name. + ## How The following diagram illustrates how an Inference Gateway routes requests to different models based on the model name. @@ -13,7 +15,7 @@ The model name is extracted by [Body-Based routing](https://github.com/kubernete ### Example Model-Aware Routing using Body-Based Routing (BBR) -This guide assumes you have already setup the cluster for basic model serving as described in the [`Getting started`](index.md) guide and this guide describes the additional steps needed from that point onwards in order to deploy and exercise an example of routing across multiple models. +This guide assumes you have already setup the cluster for basic model serving as described in the [`Getting started`](index.md) guide and this guide describes the additional steps needed from that point onwards in order to deploy and exercise an example of routing across multiple models and multiple LoRAs with many to one relationship of LoRAs to the base model. ### Deploy Body-Based Routing Extension @@ -83,7 +85,7 @@ kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-i Once this is installed, and after allowing for model download and startup time which can last several minutes, verify that the pod with this 2nd LLM phi4-mini, is running without errors using the command `kubectl get pods`. ### Deploy the 2nd InferencePool and Endpoint Picker Extension -We also want to use an InferencePool and EndPoint Picker for this second model in addition to the Body Based Router in order to be able to schedule across multiple endpoints or LORA adapters within each base model. Hence we create these for our second model as follows. +We also want to use an InferencePool and EndPoint Picker for this second model in addition to the Body Based Router in order to be able to schedule across multiple endpoints. === "GKE" @@ -266,3 +268,296 @@ kubectl get httproute llm-phi4-route -o yaml }' ``` +### Serving multiple LoRAs per base AI model + +
+⚠️ Known Limitation : LoRA names must be unique across the base AI models (i.e., across the backend inference server deployments) +
+ +Deploy the third base model that is used to demonstrate multiple LoRA configuration per base model. The example uses a vLLM simulator since this is the least common denominator configuration that can be run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same `/` L7 path, as in the previous examples. + + +```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment-1.yaml +``` + +Once this is installed, verify that the BBR pod is running without errors using the command `kubectl get pods` + + +### Deploy the 3rd InferencePool and Endpoint Picker Extension +We also want to use an InferencePool and EndPoint Picker for this third model. + +=== "GKE" + + ```bash + export GATEWAY_PROVIDER=gke + helm install vllm-deepseek-r1 \ + --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "Istio" + + ```bash + export GATEWAY_PROVIDER=istio + helm install vllm-deepseek-r1 \ + --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` + +### Configure HTTPRoute + +Now configure new HTTPRoutes for the two simulated models and their LoRAs that we want to serve via BBR using the following command which configures both routes. + +```bash +kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/bbr-example/httproute_bbr_lora.yaml +``` + +Also examine this manifest file (see the yaml below), to see how the `X-Gateway-Model-Name` is used for a header match in the Gateway's rules to route requests to the correct Backend based on model name. For convenience the manifest is also listed below in order to view this routing configuration. Note that the manifest file uses two different ways of defining the routes to LoRAs: (1) via adding match clauses on the same base AI model HTTPRoute or by (2) defining a separate HTTPRoutes. There is no functional diffeence between the two methods, except for the limitation on the number of matchers per route imposed by the API Gateway + +
+⚠️ Known Limitation : + +[Kubernetes API Gateway limits the total number of matchers per HTTPRoute to be less than 128](https://github.com/kubernetes-sigs/gateway-api/blob/df8c96c254e1ac6d5f5e0d70617f36143723d479/apis/v1/httproute_types.go#L128). +
+ +```yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-llama-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'meta-llama/Llama-3.1-8B-Instruct' + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-deepseek-route #give this HTTPRoute any name that helps you to group and track the matchers +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-deepseek-r1 + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'deepseek/vllm-deepseek-r1' + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'food-review' + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'movie-critique' + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vllm-llama3-8b-instruct-lora-food-review-1 #give this HTTPRoute any name that helps you to group and track the routes +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. + name: X-Gateway-Model-Name + value: 'food-review-1' #this is the name of LoRA as defined in vLLM deployment + timeouts: + request: 300s +--- +``` + +Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands. + +```bash +kubectl get httproute llm-llama-route -o yaml +``` + +```bash +kubectl get httproute llm-deepseek-route -o yaml +``` + +```bash +kubectl vllm-llama3-8b-instruct-lora-food-review-1 -o yaml +``` + +### Try it out + +=== "Chat Completions API" + + 1. 
Send a few requests to Llama model to test that it works as before, as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Linux is said to be an open source kernel because " + } + ] + }' + ``` + + 2. Send a few requests to the LoRA of the Llama model as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "food-review-1", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "reviewer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Write a review of the best restaurans in San-Francisco" + } + ] + }' + ``` + + 3. Send a few requests to one LoRA of the Deepseek model as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "movie-critique", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "reviewer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "The best movies of 2025 are" + } + ] + }' + ``` + 4. Send a few requests to another LoRA of the Deepseek model as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "food-review", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "reviewer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "The best movies of 2025 are" + } + ] + }' + ``` + +=== "Completions API" + + 1. Send a few requests to Llama model's LoRA as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "food-review-1", + "prompt": "Linux is said to be an open source kernel because ", + "max_tokens": 100, + "temperature": 0 + }' + ``` + + 2. Send a few requests to the first Deepseek LoRA as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "food-review", + "prompt": "Write as if you were a food critique", + "max_tokens": 20, + "temperature": 0 + }' + ``` + + 3. 
Send a few requests to the second Deepseek LoRA as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "movie-critique", + "prompt": "Write as if you were a movie critique", + "max_tokens": 20, + "temperature": 0 + }' + ``` + From e723c1fd129dbe52a3a3e39119552b4353a5c5fb Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Wed, 26 Nov 2025 12:49:03 +0200 Subject: [PATCH 02/12] Changes to PR to address feedback of the reviewers --- .../bbr-example/httproute_bbr_lora.yaml | 7 +- config/manifests/vllm/sim-deployment-1.yaml | 2 +- .../guides/serve-multiple-genai-models.md | 534 +++++++++--------- 3 files changed, 272 insertions(+), 271 deletions(-) diff --git a/config/manifests/bbr-example/httproute_bbr_lora.yaml b/config/manifests/bbr-example/httproute_bbr_lora.yaml index 9578597875..d926ef8b95 100644 --- a/config/manifests/bbr-example/httproute_bbr_lora.yaml +++ b/config/manifests/bbr-example/httproute_bbr_lora.yaml @@ -18,7 +18,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'meta-llama/Llama-3.1-8B-Instruct' timeouts: @@ -44,7 +43,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'deepseek/vllm-deepseek-r1' - path: @@ -52,15 +50,13 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name - value: 'food-review' + value: 'ski-resorts' - path: type: PathPrefix value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'movie-critique' timeouts: @@ -86,7 +82,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'food-review-1' #this is the name of LoRA as defined in vLLM deployment timeouts: diff --git a/config/manifests/vllm/sim-deployment-1.yaml b/config/manifests/vllm/sim-deployment-1.yaml index 9798077caf..d9c032a707 100644 --- a/config/manifests/vllm/sim-deployment-1.yaml +++ b/config/manifests/vllm/sim-deployment-1.yaml @@ -24,7 +24,7 @@ spec: - --max-loras - "2" - --lora-modules - - '{"name": "food-review"}' + - '{"name": "ski-resorts"}' - '{"name": "movie-critique"}' env: - name: POD_NAME diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index cab2391fa2..4bb514e816 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -2,9 +2,9 @@ A company wants to deploy multiple large language models (LLMs) to a cluster to serve different workloads. 
For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application (or as in the example in this guide, a combination of a Llama3 model and a smaller Phi4 model).. You may choose to locate these 2 models at 2 different L7 url paths and follow the steps described in the [`Getting started`](index.md) guide for each such model as already described. However you may also need to serve multiple models located at the same L7 url path and rely on parsing information such as -the Model name in the LLM prompt requests as defined in the OpenAI API format which is commonly used by most models. For such Model-aware routing, you can use the Body-Based Routing feature as described in this guide. +the Model name in the LLM prompt requests as defined in the OpenAI API format which is commonly used by most models. For such Model-aware routing, you can use the Body-Based Routing feature as described in this guide. -In addition, for each base AI model multiple [Low Rank Adaptaions (LoRAs)](https://www.ibm.com/think/topics/lora) can be defined. LoRAs defined for the same base AI model are served from the same backend inference server that serves the base model. A LoRA name is specified as the Model name in the body of LLM prompt requests. LoRA naming is not standardised. Therefore, it cannot be expected that the base model name can be inferred from the LoRA name. +In addition, for each base AI model multiple [Low Rank Adaptaions (LoRAs)](https://www.ibm.com/think/topics/lora) can be defined. LoRAs defined for the same base AI model are served from the same backend inference server that serves the base model. A LoRA name is specified as the Model name in the body of LLM prompt requests. LoRA naming is not standardised. Therefore, it cannot be expected that the base model name can be inferred from the LoRA name. ## How @@ -17,7 +17,6 @@ The model name is extracted by [Body-Based routing](https://github.com/kubernete This guide assumes you have already setup the cluster for basic model serving as described in the [`Getting started`](index.md) guide and this guide describes the additional steps needed from that point onwards in order to deploy and exercise an example of routing across multiple models and multiple LoRAs with many to one relationship of LoRAs to the base model. - ### Deploy Body-Based Routing Extension To enable body-based routing, you need to deploy the Body-Based Routing ExtProc server using Helm. This is a separate ExtProc server from the EndPoint Picker and when installed, is automatically inserted at the start of the gateway's ExtProc chain ahead of other EtxProc servers such as EPP. @@ -26,21 +25,21 @@ First install this server. 
Depending on your Gateway provider, you can use one o === "GKE" - ```bash - helm install body-based-router \ - --set provider.name=gke \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing - ``` + ```bash + helm install body-based-router \ + --set provider.name= \ + --version v1.0.0 \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing + ``` === "Istio" - ```bash - helm install body-based-router \ - --set provider.name=istio \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing - ``` + ```bash + helm install body-based-router \ + --set provider.name=istio \ + --version v1.0.0 \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing + ``` === "Kgateway" @@ -68,52 +67,58 @@ First install this server. Depending on your Gateway provider, you can use one o === "Other" - ```bash - helm install body-based-router \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing - ``` + ```bash + helm install body-based-router \ + --version v1.0.0 \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing + ``` Once this is installed, verify that the BBR pod is running without errors using the command `kubectl get pods`. ### Serving a Second Base Model -Next deploy the second base model that will be served from the same L7 path (which is `/`) as the `meta-llama/Llama-3.1-8B-Instruct` model already being served after following the steps from the [`Getting started`](index.md) guide. In this example the 2nd model is `microsoft/Phi-4-mini-instruct` a relatively small model ( about 3B parameters) from HuggingFace. Note that for this exercise, there need to be atleast 2 GPUs available on the system one each for the two models being served. Serve the second model via the following command. + +Next deploy the second base model that will be served from the same L7 path (which is `/`) as the `meta-llama/Llama-3.1-8B-Instruct` model already being served after following the steps from the [`Getting started`](index.md) guide. In this example the 2nd model is `microsoft/Phi-4-mini-instruct` a relatively small model ( about 3B parameters) from HuggingFace. Note that for this exercise, there need to be atleast 2 GPUs available on the system one each for the two models being served. Serve the second model via the following command. ```bash kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/heads/main/config/manifests/bbr-example/vllm-phi4-mini.yaml ``` + Once this is installed, and after allowing for model download and startup time which can last several minutes, verify that the pod with this 2nd LLM phi4-mini, is running without errors using the command `kubectl get pods`. ### Deploy the 2nd InferencePool and Endpoint Picker Extension -We also want to use an InferencePool and EndPoint Picker for this second model in addition to the Body Based Router in order to be able to schedule across multiple endpoints. + +We also want to use an InferencePool and EndPoint Picker for this second model in addition to the Body Based Router in order to be able to schedule across multiple endpoints. 
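Before installing the chart, you can optionally confirm that the pods of the second model carry the label the chart will select on, as sketched below; the label value is the one passed to the chart in the commands that follow, and it is assumed to match the label on the phi4 Deployment.

```bash
# Optional sanity check: the label value below is the one passed to the chart via
# --set inferencePool.modelServers.matchLabels.app=phi4-mini in the commands that
# follow, and is assumed to match the label on the phi4 Deployment.
kubectl get pods -l app=phi4-mini
```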
+ +=== + === "GKE" - ```bash - export GATEWAY_PROVIDER=gke - helm install vllm-phi4-mini-instruct \ - --set inferencePool.modelServers.matchLabels.app=phi4-mini \ - --set provider.name=$GATEWAY_PROVIDER \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - ``` + ```bash + export GATEWAY_PROVIDER=gke + helm install vllm-phi4-mini-instruct \ + --set inferencePool.modelServers.matchLabels.app=phi4-mini \ + --set provider.name=$GATEWAY_PROVIDER \ + --version v1.0.0 \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` === "Istio" - ```bash - export GATEWAY_PROVIDER=istio - helm install vllm-phi4-mini-instruct \ - --set inferencePool.modelServers.matchLabels.app=phi4-mini \ - --set provider.name=$GATEWAY_PROVIDER \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - ``` + ```bash + export GATEWAY_PROVIDER=istio + helm install vllm-phi4-mini-instruct \ + --set inferencePool.modelServers.matchLabels.app=phi4-mini \ + --set provider.name=$GATEWAY_PROVIDER \ + --version v1.0.0 \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` After executing this, verify that you see two InferencePools and two EPP pods, one per base model type, running without errors, using the CLIs `kubectl get inferencepools` and `kubectl get pods`. ### Configure HTTPRoute -Before configuring the httproutes for the models, we need to delete the prior httproute created for the vllm-llama3-8b-instruct model because we will alter the routing to now also match on the model name as determined by the `X-Gateway-Model-Name` http header that will get inserted by the BBR extension after parsing the Model name from the body of the LLM request message. +Before configuring the httproutes for the models, we need to delete the prior httproute created for the vllm-llama3-8b-instruct model because we will alter the routing to now also match on the model name as determined by the `X-Gateway-Model-Name` http header that will get inserted by the BBR extension after parsing the Model name from the body of the LLM request message. ```bash kubectl delete httproute llm-route @@ -181,7 +186,7 @@ spec: --- ``` -Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands. +Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands. ```bash kubectl get httproute llm-llama-route -o yaml @@ -194,136 +199,135 @@ kubectl get httproute llm-phi4-route -o yaml ## Try it out 1. Get the gateway IP: - ```bash - IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80 - ``` + +```bash +IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80 +``` === "Chat Completions API" - 1. Send a few requests to Llama model as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "developer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Linux is said to be an open source kernel because " - } - ] - }' - ``` - - 2. 
Send a few requests to the Phi4 as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "microsoft/Phi-4-mini-instruct", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "developer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "2+2 is " - } - ] - }' - ``` + 1. Send a few requests to Llama model as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Linux is said to be an open source kernel because " + } + ] + }' + ``` + + 1. Send a few requests to the Phi4 as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "microsoft/Phi-4-mini-instruct", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "2+2 is " + } + ] + }' + ``` === "Completions API" - 1. Send a few requests to Llama model as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", - "prompt": "Linux is said to be an open source kernel because ", - "max_tokens": 100, - "temperature": 0 - }' - ``` - - 2. Send a few requests to the Phi4 as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "microsoft/Phi-4-mini-instruct", - "prompt": "2+2 is ", - "max_tokens": 20, - "temperature": 0 - }' - ``` + 1. Send a few requests to Llama model as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "prompt": "Linux is said to be an open source kernel because ", + "max_tokens": 100, + "temperature": 0 + }' + ``` + + 1. Send a few requests to the Phi4 as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "microsoft/Phi-4-mini-instruct", + "prompt": "2+2 is ", + "max_tokens": 20, + "temperature": 0 + }' + ``` ### Serving multiple LoRAs per base AI model -
-⚠️ Known Limitation : LoRA names must be unique across the base AI models (i.e., across the backend inference server deployments) -
- -Deploy the third base model that is used to demonstrate multiple LoRA configuration per base model. The example uses a vLLM simulator since this is the least common denominator configuration that can be run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same `/` L7 path, as in the previous examples. +⚠️ **Requirement**: LoRA names must be unique across the base AI models (i.e., across the backend inference server deployments) +Deploy the third base model that is used to demonstrate multiple LoRA configuration per base model. The example uses a vLLM simulator since this is the least common denominator configuration that can be run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same `/` L7 path, as in the previous examples. ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment-1.yaml +kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment-1.yaml ``` Once this is installed, verify that the BBR pod is running without errors using the command `kubectl get pods` - ### Deploy the 3rd InferencePool and Endpoint Picker Extension -We also want to use an InferencePool and EndPoint Picker for this third model. + +We also want to use an InferencePool and EndPoint Picker for this third model. === "GKE" - ```bash - export GATEWAY_PROVIDER=gke - helm install vllm-deepseek-r1 \ - --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ - --set provider.name=$GATEWAY_PROVIDER \ - --version $IGW_CHART_VERSION \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - ``` + ```bash + export GATEWAY_PROVIDER=gke + helm install vllm-deepseek-r1 \ + --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` === "Istio" - ```bash - export GATEWAY_PROVIDER=istio - helm install vllm-deepseek-r1 \ - --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ - --set provider.name=$GATEWAY_PROVIDER \ - --version $IGW_CHART_VERSION \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - ``` + ```bash + export GATEWAY_PROVIDER=istio + helm install vllm-deepseek-r1 \ + --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` -### Configure HTTPRoute +### Configure HTTPRoutes -Now configure new HTTPRoutes for the two simulated models and their LoRAs that we want to serve via BBR using the following command which configures both routes. +Now configure new HTTPRoutes for the two simulated models and their LoRAs that we want to serve via BBR using the following command which configures both routes. ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/bbr-example/httproute_bbr_lora.yaml ``` -Also examine this manifest file (see the yaml below), to see how the `X-Gateway-Model-Name` is used for a header match in the Gateway's rules to route requests to the correct Backend based on model name. For convenience the manifest is also listed below in order to view this routing configuration. 
Note that the manifest file uses two different ways of defining the routes to LoRAs: (1) via adding match clauses on the same base AI model HTTPRoute or by (2) defining a separate HTTPRoutes. There is no functional diffeence between the two methods, except for the limitation on the number of matchers per route imposed by the API Gateway - -
-⚠️ Known Limitation : +Also examine this manifest file (see the yaml below), to see how the `X-Gateway-Model-Name` is used for a header match in the Gateway's rules to route requests to the correct Backend based on model name. For convenience the manifest is also listed below in order to view this routing configuration. Note that the manifest file uses two different ways of defining the routes to LoRAs: (1) via adding match clauses on the same base AI model HTTPRoute or by (2) defining a separate HTTPRoutes. There is no functional diffeence between the two methods, except for the limitation on the number of matchers per route imposed by the API Gateway +⚠️ **Known Issue** : [Kubernetes API Gateway limits the total number of matchers per HTTPRoute to be less than 128](https://github.com/kubernetes-sigs/gateway-api/blob/df8c96c254e1ac6d5f5e0d70617f36143723d479/apis/v1/httproute_types.go#L128). -
```yaml apiVersion: gateway.networking.k8s.io/v1 @@ -346,7 +350,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'meta-llama/Llama-3.1-8B-Instruct' timeouts: @@ -372,7 +375,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'deepseek/vllm-deepseek-r1' - path: @@ -380,7 +382,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'food-review' - path: @@ -388,7 +389,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'movie-critique' timeouts: @@ -414,7 +414,6 @@ spec: value: / headers: - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. name: X-Gateway-Model-Name value: 'food-review-1' #this is the name of LoRA as defined in vLLM deployment timeouts: @@ -422,7 +421,7 @@ spec: --- ``` -Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands. +Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands. ```bash kubectl get httproute llm-llama-route -o yaml @@ -433,131 +432,138 @@ kubectl get httproute llm-deepseek-route -o yaml ``` ```bash -kubectl vllm-llama3-8b-instruct-lora-food-review-1 -o yaml +kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml ``` -### Try it out +### Try the setup === "Chat Completions API" - 1. Send a few requests to Llama model to test that it works as before, as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "developer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Linux is said to be an open source kernel because " - } - ] - }' - ``` - - 2. Send a few requests to the LoRA of the Llama model as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "food-review-1", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "reviewer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Write a review of the best restaurans in San-Francisco" - } - ] - }' - ``` - - 3. 
Send a few requests to one LoRA of the Deepseek model as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "movie-critique", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "reviewer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "The best movies of 2025 are" - } - ] - }' - ``` - 4. Send a few requests to another LoRA of the Deepseek model as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "food-review", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "reviewer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "The best movies of 2025 are" - } - ] - }' - ``` + 1. Send a few requests to Llama model to test that it works as before, as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Linux is said to be an open source kernel because " + } + ] + }' + ``` + + 1. Send a few requests to the LoRA of the Llama model as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "food-review-1", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "reviewer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Write a review of the best restaurans in San-Francisco" + } + ] + }' + ``` + + 1. Send a few requests to one LoRA of the Deepseek model as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "movie-critique", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "reviewer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "The best movies of 2025 are" + } + ] + }' + ``` + + 1. Send a few requests to another LoRA of the Deepseek model as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "ski-resorts", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "reviewer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "The best movies of 2025 are" + } + ] + }' + ``` === "Completions API" - 1. Send a few requests to Llama model's LoRA as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "food-review-1", - "prompt": "Linux is said to be an open source kernel because ", - "max_tokens": 100, - "temperature": 0 - }' - ``` - - 2. Send a few requests to the first Deepseek LoRA as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "food-review", - "prompt": "Write as if you were a food critique", - "max_tokens": 20, - "temperature": 0 - }' - ``` - - 3. 
Send a few requests to the second Deepseek LoRA as follows: - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "movie-critique", - "prompt": "Write as if you were a movie critique", - "max_tokens": 20, - "temperature": 0 - }' - ``` - + 1. Send a few requests to Llama model's LoRA as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "food-review-1", + "prompt": "Linux is said to be an open source kernel because ", + "max_tokens": 100, + "temperature": 0 + }' + ``` + + 1. Send a few requests to the first Deepseek LoRA as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "ski-resorts", + "prompt": "What is the best ski resort in Austria?", + "max_tokens": 20, + "temperature": 0 + }' + ``` + + 1. Send a few requests to the second Deepseek LoRA as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "movie-critique", + "prompt": "Write as if you were a movie critique", + "max_tokens": 20, + "temperature": 0 + }' + ``` From 906a205079ae4dc5d9a3b2263537ce3412f3955e Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Wed, 3 Dec 2025 17:11:37 +0200 Subject: [PATCH 03/12] Address review comments from PR #1859: -- The BBR guide is aligned with Getting Started (Main/Latest) -- There are only two models deployed, with the second one being a simulator -- Formatting issues and style fixed -- Typos and dangling sentences fixed -- The LoRA names are completely different -- The Routing example simplified: one HTTPRoute with matchers --- .../bbr-example/httproute_bbr_lora.yaml | 33 +- .../guides/serve-multiple-genai-models.md | 394 ++++++------------ 2 files changed, 143 insertions(+), 284 deletions(-) diff --git a/config/manifests/bbr-example/httproute_bbr_lora.yaml b/config/manifests/bbr-example/httproute_bbr_lora.yaml index d926ef8b95..8fc341a3fa 100644 --- a/config/manifests/bbr-example/httproute_bbr_lora.yaml +++ b/config/manifests/bbr-example/httproute_bbr_lora.yaml @@ -20,6 +20,13 @@ spec: - type: Exact name: X-Gateway-Model-Name value: 'meta-llama/Llama-3.1-8B-Instruct' + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: X-Gateway-Model-Name + value: 'food-review-1' timeouts: request: 300s --- @@ -61,28 +68,4 @@ spec: value: 'movie-critique' timeouts: request: 300s ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: vllm-llama3-8b-instruct-lora-food-review-1 #give this HTTPRoute any name that helps you to group and track the routes -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - rules: - - backendRefs: - - group: inference.networking.k8s.io - kind: InferencePool - name: vllm-llama3-8b-instruct - matches: - - path: - type: PathPrefix - value: / - headers: - - type: Exact - name: X-Gateway-Model-Name - value: 'food-review-1' #this is the name of LoRA as defined in vLLM deployment - timeouts: - request: 300s +--- \ No newline at end of file diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index 4bb514e816..2a2d5f23f3 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -1,35 +1,32 @@ # Serve multiple generative AI models and multiple LoRAs for the base AI models -A company wants 
to deploy multiple large language models (LLMs) to a cluster to serve different workloads. -For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application (or as in the example in this guide, a combination of a Llama3 model and a smaller Phi4 model).. You may choose to locate these 2 models at 2 different L7 url paths and follow the steps described in the [`Getting started`](index.md) guide for each such model as already described. However you may also need to serve multiple models located at the same L7 url path and rely on parsing information such as -the Model name in the LLM prompt requests as defined in the OpenAI API format which is commonly used by most models. For such Model-aware routing, you can use the Body-Based Routing feature as described in this guide. +A company may need to deploy multiple large language models (LLMs) in a cluster to support different workloads. For example, a Llama model could power a chatbot interface, while a DeepSeek model might serve a recommendation application. One approach is to expose these models on separate Layer 7 (L7) URL paths and follow the steps in the [`Getting Started (Latest/Main)`](getting-started-latest.md) guide for each model. -In addition, for each base AI model multiple [Low Rank Adaptaions (LoRAs)](https://www.ibm.com/think/topics/lora) can be defined. LoRAs defined for the same base AI model are served from the same backend inference server that serves the base model. A LoRA name is specified as the Model name in the body of LLM prompt requests. LoRA naming is not standardised. Therefore, it cannot be expected that the base model name can be inferred from the LoRA name. +However, one may also need to serve multiple models from the same L7 URL path. To achieve this, the system needs to extract information (such as the model name) from the request body (i.e., the LLM prompt). This pattern of serving multiple models behind a single endpoint is common among providers and is generally expected by clients. The OpenAI API format requires the model name to be specified in the request body. For such model-aware routing, use the Body-Based Routing (BBR) feature described in this guide. + +Additionally, each base AI model can have multiple Low-Rank Adaptations ([LoRAs](https://www.ibm.com/think/topics/lora)). LoRAs associated with the same base model are served by the same backend inference server that hosts the base model. A LoRA name is also provided as the model name in the request body. ## How -The following diagram illustrates how an Inference Gateway routes requests to different models based on the model name. -The model name is extracted by [Body-Based routing](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) (BBR) - from the request body to the header. The header is then matched to dispatch - requests to different `InferencePool` (and their EPPs) instances. +[Body-Based Router (BBR)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) extracts the model name from the request body and adds it to the `X-Gateway-Model-Name` header. This header is then used for matching and routing the request to the appropriate `InferencePool` and its associated Endpoint Picker Extension (EPP) instances. 
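As a concrete sketch of that flow, consider the chat request below (the model name is the Llama model used throughout this guide). The client sends an ordinary OpenAI-style request and does not set any routing header itself; BBR reads the `model` field from the body and mirrors it into `X-Gateway-Model-Name`, which the Gateway then uses for header matching:

```bash
# Illustrative request only: ${IP} and ${PORT} are the Gateway address and port,
# obtained later in this guide. The client does not set any routing header itself.
curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "Hello"}]
  }'
# BBR copies the "model" value into the X-Gateway-Model-Name header, and the
# HTTPRoute header matches shown later route the request to the
# vllm-llama3-8b-instruct InferencePool.
```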
### Example Model-Aware Routing using Body-Based Routing (BBR) -This guide assumes you have already setup the cluster for basic model serving as described in the [`Getting started`](index.md) guide and this guide describes the additional steps needed from that point onwards in order to deploy and exercise an example of routing across multiple models and multiple LoRAs with many to one relationship of LoRAs to the base model. +This guide assumes you have already setup the cluster for basic model serving as described in the [`Getting started (Latest/Main)`](getting-started-latest.md) guide. In what follows, this guide describes the additional steps required to deploy and test routing across multiple models and multiple LoRAs, where several LoRAs may be associated with a single base model. ### Deploy Body-Based Routing Extension -To enable body-based routing, you need to deploy the Body-Based Routing ExtProc server using Helm. This is a separate ExtProc server from the EndPoint Picker and when installed, is automatically inserted at the start of the gateway's ExtProc chain ahead of other EtxProc servers such as EPP. +To enable body-based routing, deploy the BBR `ext_proc` server using Helm. This server is independent of EPP. Once installed, it is automatically added as the first filter in the gateway’s filter chain, ahead of other `ext_proc` servers such as EPP. -First install this server. Depending on your Gateway provider, you can use one of the following commands: +Select an appropriate tab depending on your Gateway provider: === "GKE" ```bash helm install body-based-router \ - --set provider.name= \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing + --set provider.name=gke \ + --version v0 \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/body-based-routing ``` === "Istio" @@ -37,8 +34,8 @@ First install this server. Depending on your Gateway provider, you can use one o ```bash helm install body-based-router \ --set provider.name=istio \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing + --version v0 \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/body-based-routing ``` === "Kgateway" @@ -69,230 +66,94 @@ First install this server. Depending on your Gateway provider, you can use one o ```bash helm install body-based-router \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing + --version v0 \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/body-based-routing ``` -Once this is installed, verify that the BBR pod is running without errors using the command `kubectl get pods`. - -### Serving a Second Base Model - -Next deploy the second base model that will be served from the same L7 path (which is `/`) as the `meta-llama/Llama-3.1-8B-Instruct` model already being served after following the steps from the [`Getting started`](index.md) guide. In this example the 2nd model is `microsoft/Phi-4-mini-instruct` a relatively small model ( about 3B parameters) from HuggingFace. Note that for this exercise, there need to be atleast 2 GPUs available on the system one each for the two models being served. Serve the second model via the following command. 
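If a request later fails to match any route, one quick check is to watch the BBR logs while sending a request, as sketched below; the Deployment name is an assumption based on the Helm release name used in this guide.

```bash
# Optional: watch the BBR logs while sending a request to confirm the model name
# is being read from the body. The Deployment name is assumed to be
# body-based-router (the Helm release name used above); check `kubectl get deploy`
# and adjust if your release created a different name.
kubectl logs deploy/body-based-router -f
```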
+After the installation, verify that the BBR pod is running without errors: ```bash -kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/heads/main/config/manifests/bbr-example/vllm-phi4-mini.yaml +kubectl get pods ``` -Once this is installed, and after allowing for model download and startup time which can last several minutes, verify that the pod with this 2nd LLM phi4-mini, is running without errors using the command `kubectl get pods`. - -### Deploy the 2nd InferencePool and Endpoint Picker Extension - -We also want to use an InferencePool and EndPoint Picker for this second model in addition to the Body Based Router in order to be able to schedule across multiple endpoints. - -=== - - -=== "GKE" - - ```bash - export GATEWAY_PROVIDER=gke - helm install vllm-phi4-mini-instruct \ - --set inferencePool.modelServers.matchLabels.app=phi4-mini \ - --set provider.name=$GATEWAY_PROVIDER \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - ``` - -=== "Istio" - - ```bash - export GATEWAY_PROVIDER=istio - helm install vllm-phi4-mini-instruct \ - --set inferencePool.modelServers.matchLabels.app=phi4-mini \ - --set provider.name=$GATEWAY_PROVIDER \ - --version v1.0.0 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - ``` - -After executing this, verify that you see two InferencePools and two EPP pods, one per base model type, running without errors, using the CLIs `kubectl get inferencepools` and `kubectl get pods`. +### Serving a Second Base Model -### Configure HTTPRoute +The example uses a vLLM simulator since this is the least common denominator configuration that can be run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same `/` L7 path, as in the previous example from the [Getting Started (Latest/Main)](getting-started-latest.md) guide. -Before configuring the httproutes for the models, we need to delete the prior httproute created for the vllm-llama3-8b-instruct model because we will alter the routing to now also match on the model name as determined by the `X-Gateway-Model-Name` http header that will get inserted by the BBR extension after parsing the Model name from the body of the LLM request message. +Deploy the second base model: ```bash -kubectl delete httproute llm-route +kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment-1.yaml ``` -Now configure new HTTPRoutes, one per each model we want to serve via BBR using the following command which configures both routes. Also examine this manifest file, to see how the `X-Gateway-Model-Name` is used for a header match in the Gateway's rules to route requests to the correct Backend based on model name. For convenience the manifest is also listed below in order to view this routing configuration. +The overall setup is as follows. Two base models are deployed: `meta-llama/Llama-3.1-8B-Instruct` and `deepseek/vllm-deepseek-r1`. Additionally, the `food-review-1` LoRA is associated with `meta-llama/Llama-3.1-8B-Instruct`, while the `ski-resorts` and `movie-critique` LoRAs are associated with `deepseek/vllm-deepseek-r1`. 
-```bash -kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/bbr-example/httproute_bbr.yaml -``` +⚠️ **Note**: LoRA names must be unique across the base AI models (i.e., across the backend inference server deployments) -```yaml ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-llama-route -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - rules: - - backendRefs: - - group: inference.networking.k8s.io - kind: InferencePool - name: vllm-llama3-8b-instruct - matches: - - path: - type: PathPrefix - value: / - headers: - - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. - name: X-Gateway-Model-Name # (1)! - value: 'meta-llama/Llama-3.1-8B-Instruct' - timeouts: - request: 300s ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-phi4-route -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - rules: - - backendRefs: - - group: inference.networking.k8s.io - kind: InferencePool - name: vllm-phi4-mini-instruct - matches: - - path: - type: PathPrefix - value: / - headers: - - type: Exact - #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header. - name: X-Gateway-Model-Name - value: 'microsoft/Phi-4-mini-instruct' - timeouts: - request: 300s ---- -``` - -Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands. +Review the YAML definition. -```bash -kubectl get httproute llm-llama-route -o yaml -``` - -```bash -kubectl get httproute llm-phi4-route -o yaml +```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: vllm-deepseek-r1 + spec: + replicas: 1 + selector: + matchLabels: + app: vllm-deepseek-r1 + template: + metadata: + labels: + app: vllm-deepseek-r1 + spec: + containers: + - name: vllm-sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.4.0 + imagePullPolicy: Always + args: + - --model + - deepseek/vllm-deepseek-r1 + - --port + - "8000" + - --max-loras + - "2" + - --lora-modules + - '{"name": "ski-resorts"}' + - '{"name": "movie-critique"}' + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - containerPort: 8000 + name: http + protocol: TCP + resources: + requests: + scpu: 10m ``` -## Try it out - -1. Get the gateway IP: +Verify that the second base model pod is running without errors: ```bash -IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80 +kubectl get pods ``` -=== "Chat Completions API" - - 1. Send a few requests to Llama model as follows: - - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "developer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Linux is said to be an open source kernel because " - } - ] - }' - ``` - - 1. 
Send a few requests to the Phi4 as follows: - - ```bash - curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "microsoft/Phi-4-mini-instruct", - "max_tokens": 100, - "temperature": 0, - "messages": [ - { - "role": "developer", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "2+2 is " - } - ] - }' - ``` - -=== "Completions API" - - 1. Send a few requests to Llama model as follows: - - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", - "prompt": "Linux is said to be an open source kernel because ", - "max_tokens": 100, - "temperature": 0 - }' - ``` - - 1. Send a few requests to the Phi4 as follows: - - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "microsoft/Phi-4-mini-instruct", - "prompt": "2+2 is ", - "max_tokens": 20, - "temperature": 0 - }' - ``` - -### Serving multiple LoRAs per base AI model - -⚠️ **Requirement**: LoRA names must be unique across the base AI models (i.e., across the backend inference server deployments) - -Deploy the third base model that is used to demonstrate multiple LoRA configuration per base model. The example uses a vLLM simulator since this is the least common denominator configuration that can be run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same `/` L7 path, as in the previous examples. - -```bash -kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment-1.yaml -``` +### Deploy the 2nd InferencePool and Endpoint Picker Extension -Once this is installed, verify that the BBR pod is running without errors using the command `kubectl get pods` +Set the Helm chart version (unless already set). -### Deploy the 3rd InferencePool and Endpoint Picker Extension + ```bash + export IGW_CHART_VERSION=v0 + ``` -We also want to use an InferencePool and EndPoint Picker for this third model. +Select a tab to follow the provider-specific instructions. === "GKE" @@ -302,7 +163,7 @@ We also want to use an InferencePool and EndPoint Picker for this third model. --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool ``` === "Istio" @@ -313,21 +174,34 @@ We also want to use an InferencePool and EndPoint Picker for this third model. --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool ``` +After the installation, verify that you have two `InferencePools` and two EPP pods, one per base model type, running without errors + +```bash +kubectl get inferencepools +``` + +```bash +kubectl get pods +``` + ### Configure HTTPRoutes +Before configuring the HTTPRoutes for the models and their LoRAs, delete the existing HTTPRoute for the `meta-llama/Llama-3.1-8B-Instruct` model. 
The new routes will match the model name in the `X-Gateway-Model-Name` HTTP header, which is inserted by the BBR extension after parsing the model name from the LLM request body. + +```bash +kubectl delete httproute llm-route +``` + Now configure new HTTPRoutes for the two simulated models and their LoRAs that we want to serve via BBR using the following command which configures both routes. ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/bbr-example/httproute_bbr_lora.yaml ``` -Also examine this manifest file (see the yaml below), to see how the `X-Gateway-Model-Name` is used for a header match in the Gateway's rules to route requests to the correct Backend based on model name. For convenience the manifest is also listed below in order to view this routing configuration. Note that the manifest file uses two different ways of defining the routes to LoRAs: (1) via adding match clauses on the same base AI model HTTPRoute or by (2) defining a separate HTTPRoutes. There is no functional diffeence between the two methods, except for the limitation on the number of matchers per route imposed by the API Gateway - -⚠️ **Known Issue** : -[Kubernetes API Gateway limits the total number of matchers per HTTPRoute to be less than 128](https://github.com/kubernetes-sigs/gateway-api/blob/df8c96c254e1ac6d5f5e0d70617f36143723d479/apis/v1/httproute_types.go#L128). +Also examine the manifest file (see the yaml below), to see how the `X-Gateway-Model-Name` is used for a header match in the Gateway's rules to route requests to the correct Backend based on the model name. ```yaml apiVersion: gateway.networking.k8s.io/v1 @@ -352,6 +226,13 @@ spec: - type: Exact name: X-Gateway-Model-Name value: 'meta-llama/Llama-3.1-8B-Instruct' + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: X-Gateway-Model-Name + value: 'food-review-1' timeouts: request: 300s --- @@ -383,7 +264,7 @@ spec: headers: - type: Exact name: X-Gateway-Model-Name - value: 'food-review' + value: 'ski-resorts' - path: type: PathPrefix value: / @@ -394,33 +275,11 @@ spec: timeouts: request: 300s --- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: vllm-llama3-8b-instruct-lora-food-review-1 #give this HTTPRoute any name that helps you to group and track the routes -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - rules: - - backendRefs: - - group: inference.networking.k8s.io - kind: InferencePool - name: vllm-llama3-8b-instruct - matches: - - path: - type: PathPrefix - value: / - headers: - - type: Exact - name: X-Gateway-Model-Name - value: 'food-review-1' #this is the name of LoRA as defined in vLLM deployment - timeouts: - request: 300s ---- ``` +⚠️ **Note** : +[Kubernetes API Gateway limits the total number of matchers per HTTPRoute to be less than 128](https://github.com/kubernetes-sigs/gateway-api/blob/df8c96c254e1ac6d5f5e0d70617f36143723d479/apis/v1/httproute_types.go#L128). + Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands. 
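+
+For a quick first glance at both routes before inspecting the detailed output (an optional shortcut; the full status conditions appear in the per-route YAML shown by the commands below):
+
+```bash
+# Lists both HTTPRoutes together before inspecting their full status below.
+kubectl get httproute llm-llama-route llm-deepseek-route
+```
+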
```bash @@ -431,10 +290,6 @@ kubectl get httproute llm-llama-route -o yaml kubectl get httproute llm-deepseek-route -o yaml ``` -```bash -kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml -``` - ### Try the setup === "Chat Completions API" @@ -461,7 +316,28 @@ kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml }' ``` - 1. Send a few requests to the LoRA of the Llama model as follows: + 1. Send a few requests to Deepseek model to test that it works, as follows: + + ```bash + curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek/vllm-deepseek-r1", + "max_tokens": 100, + "temperature": 0, + "messages": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Linux is said to be an open source kernel because " + } + ] + }' + ``` + 3. Send a few requests to the LoRA of the Llama model as follows: ```bash curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ @@ -483,7 +359,7 @@ kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml }' ``` - 1. Send a few requests to one LoRA of the Deepseek model as follows: + 4. Send a few requests to one LoRA of the Deepseek model as follows: ```bash curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ @@ -499,13 +375,13 @@ kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml }, { "role": "user", - "content": "The best movies of 2025 are" + "content": "What are the best movies of 2025?" } ] }' ``` - 1. Send a few requests to another LoRA of the Deepseek model as follows: + 5. Send a few requests to another LoRA of the Deepseek model as follows: ```bash curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ @@ -521,7 +397,7 @@ kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml }, { "role": "user", - "content": "The best movies of 2025 are" + "content": "Tell mne about ski deals" } ] }' @@ -536,7 +412,7 @@ kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml -H "Content-Type: application/json" \ -d '{ "model": "food-review-1", - "prompt": "Linux is said to be an open source kernel because ", + "prompt": "Write as if you were a critic: San Francisco ", "max_tokens": 100, "temperature": 0 }' @@ -562,7 +438,7 @@ kubectl get httproute vllm-llama3-8b-instruct-lora-food-review-1 -o yaml -H "Content-Type: application/json" \ -d '{ "model": "movie-critique", - "prompt": "Write as if you were a movie critique", + "prompt": "Tell me about movies", "max_tokens": 20, "temperature": 0 }' From 7c987940eff9d7939e33b77c9c3396831767322b Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Wed, 3 Dec 2025 17:51:52 +0200 Subject: [PATCH 04/12] Adds missing Kgateway and Nginx tabs for the second EPP model deployment --- .../guides/serve-multiple-genai-models.md | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index 2a2d5f23f3..ab90b62bef 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -176,6 +176,27 @@ Select a tab to follow the provider-specific instructions. 
--version $IGW_CHART_VERSION \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool ``` +=== "Kgateway" + + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-deepseek-r1 \ + --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "NGINX Gateway Fabric" + + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-deepseek-r1 \ + --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool + ``` After the installation, verify that you have two `InferencePools` and two EPP pods, one per base model type, running without errors From 30374404819558ff3be56f419a1bb90282d046ec Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Wed, 3 Dec 2025 18:17:26 +0200 Subject: [PATCH 05/12] fixes formatting typos --- site-src/guides/serve-multiple-genai-models.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index ab90b62bef..e3e5e4c08a 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -358,7 +358,7 @@ kubectl get httproute llm-deepseek-route -o yaml ] }' ``` - 3. Send a few requests to the LoRA of the Llama model as follows: + 1. Send a few requests to the LoRA of the Llama model as follows: ```bash curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ @@ -380,7 +380,7 @@ kubectl get httproute llm-deepseek-route -o yaml }' ``` - 4. Send a few requests to one LoRA of the Deepseek model as follows: + 1. Send a few requests to one LoRA of the Deepseek model as follows: ```bash curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ @@ -402,7 +402,7 @@ kubectl get httproute llm-deepseek-route -o yaml }' ``` - 5. Send a few requests to another LoRA of the Deepseek model as follows: + 1. 
Send a few requests to another LoRA of the Deepseek model as follows: ```bash curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ From a644c52e94a367b06f12f045dda417eda4954760 Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Tue, 9 Dec 2025 12:14:22 +0200 Subject: [PATCH 06/12] Update config/manifests/vllm/sim-deployment-1.yaml Co-authored-by: Shmuel Kallner --- config/manifests/vllm/sim-deployment-1.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/manifests/vllm/sim-deployment-1.yaml b/config/manifests/vllm/sim-deployment-1.yaml index d9c032a707..0967a5920b 100644 --- a/config/manifests/vllm/sim-deployment-1.yaml +++ b/config/manifests/vllm/sim-deployment-1.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: vllm-sim - image: ghcr.io/llm-d/llm-d-inference-sim:v0.4.0 + image: ghcr.io/llm-d/llm-d-inference-sim:v0.6.1 imagePullPolicy: Always args: - --model From 826f88f952af75bee2421f41122b2dfa643f68ed Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Tue, 9 Dec 2025 12:14:53 +0200 Subject: [PATCH 07/12] Update site-src/guides/serve-multiple-genai-models.md Co-authored-by: Shmuel Kallner --- site-src/guides/serve-multiple-genai-models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index e3e5e4c08a..e2ae7abac7 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -1,6 +1,6 @@ # Serve multiple generative AI models and multiple LoRAs for the base AI models -A company may need to deploy multiple large language models (LLMs) in a cluster to support different workloads. For example, a Llama model could power a chatbot interface, while a DeepSeek model might serve a recommendation application. One approach is to expose these models on separate Layer 7 (L7) URL paths and follow the steps in the [`Getting Started (Latest/Main)`](getting-started-latest.md) guide for each model. +A company may need to deploy multiple large language models (LLMs) in a cluster to support different workloads. For example, a Llama model could power a chatbot interface, while a DeepSeek model might serve a recommendation application. One approach is to expose these models on separate URL paths and follow the steps in the [`Getting Started (Latest/Main)`](getting-started-latest.md) guide for each model. However, one may also need to serve multiple models from the same L7 URL path. To achieve this, the system needs to extract information (such as the model name) from the request body (i.e., the LLM prompt). This pattern of serving multiple models behind a single endpoint is common among providers and is generally expected by clients. The OpenAI API format requires the model name to be specified in the request body. For such model-aware routing, use the Body-Based Routing (BBR) feature described in this guide. 
From 95382fdea0e20368e6d8b7b33508d96a6f3c3a9b Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Tue, 9 Dec 2025 12:15:38 +0200 Subject: [PATCH 08/12] Update site-src/guides/serve-multiple-genai-models.md Co-authored-by: Shmuel Kallner --- site-src/guides/serve-multiple-genai-models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index e2ae7abac7..b58d9f76f4 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -2,7 +2,7 @@ A company may need to deploy multiple large language models (LLMs) in a cluster to support different workloads. For example, a Llama model could power a chatbot interface, while a DeepSeek model might serve a recommendation application. One approach is to expose these models on separate URL paths and follow the steps in the [`Getting Started (Latest/Main)`](getting-started-latest.md) guide for each model. -However, one may also need to serve multiple models from the same L7 URL path. To achieve this, the system needs to extract information (such as the model name) from the request body (i.e., the LLM prompt). This pattern of serving multiple models behind a single endpoint is common among providers and is generally expected by clients. The OpenAI API format requires the model name to be specified in the request body. For such model-aware routing, use the Body-Based Routing (BBR) feature described in this guide. +However, one may also need to serve multiple models from the same URL path. To achieve this, the system needs to extract information (such as the model name) from the request body (i.e., the LLM prompt). This pattern of serving multiple models behind a single endpoint is common among providers and is generally expected by clients. The OpenAI API format requires the model name to be specified in the request body. For such model-aware routing, use the Body-Based Routing (BBR) feature described in this guide. Additionally, each base AI model can have multiple Low-Rank Adaptations ([LoRAs](https://www.ibm.com/think/topics/lora)). LoRAs associated with the same base model are served by the same backend inference server that hosts the base model. A LoRA name is also provided as the model name in the request body. From bb9eab035951b29275a59adda67f67d68130b92c Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Tue, 9 Dec 2025 12:16:32 +0200 Subject: [PATCH 09/12] Update site-src/guides/serve-multiple-genai-models.md Co-authored-by: Shmuel Kallner --- site-src/guides/serve-multiple-genai-models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index b58d9f76f4..cef7d8053e 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -8,7 +8,7 @@ Additionally, each base AI model can have multiple Low-Rank Adaptations ([LoRAs] ## How -[Body-Based Router (BBR)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) extracts the model name from the request body and adds it to the `X-Gateway-Model-Name` header. This header is then used for matching and routing the request to the appropriate `InferencePool` and its associated Endpoint Picker Extension (EPP) instances. +The BBR extracts the model name from the request body and adds it to the `X-Gateway-Model-Name` header. 
This header is then used for matching and routing the request to the appropriate `InferencePool` and its associated Endpoint Picker Extension (EPP) instances. ### Example Model-Aware Routing using Body-Based Routing (BBR) From 40cc071f57eb99927f7d0f7ec482b5280617ea6d Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Tue, 9 Dec 2025 12:17:20 +0200 Subject: [PATCH 10/12] Update site-src/guides/serve-multiple-genai-models.md Co-authored-by: Shmuel Kallner --- site-src/guides/serve-multiple-genai-models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index cef7d8053e..643334cd87 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -16,7 +16,7 @@ This guide assumes you have already setup the cluster for basic model serving as ### Deploy Body-Based Routing Extension -To enable body-based routing, deploy the BBR `ext_proc` server using Helm. This server is independent of EPP. Once installed, it is automatically added as the first filter in the gateway’s filter chain, ahead of other `ext_proc` servers such as EPP. +To enable body-based routing, deploy the BBR server using Helm. This server runs as a gateway extension and is independent of the EPP. Once installed, it is automatically added as the first filter in the gateway’s filter chain, ahead of other gateway extension servers such as the EPP. Select an appropriate tab depending on your Gateway provider: From dafa5a88819a363203e4ef5df89981a9e3496011 Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Tue, 9 Dec 2025 12:17:56 +0200 Subject: [PATCH 11/12] Update site-src/guides/serve-multiple-genai-models.md Co-authored-by: Shmuel Kallner --- site-src/guides/serve-multiple-genai-models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index 643334cd87..8c2fe2405e 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -78,7 +78,7 @@ kubectl get pods ### Serving a Second Base Model -The example uses a vLLM simulator since this is the least common denominator configuration that can be run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same `/` L7 path, as in the previous example from the [Getting Started (Latest/Main)](getting-started-latest.md) guide. +The example uses a vLLM simulator since this is the least common denominator configuration that can be run in every environment. The model, `deepseek/vllm-deepseek-r1`, will be served from the same URL path, as in the previous example from the [Getting Started (Latest/Main)](getting-started-latest.md) guide. 
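+
+For example, when a request body contains `"model": "food-review-1"`, BBR adds the header `X-Gateway-Model-Name: food-review-1` to the request, and the HTTPRoute rules configured later in this guide match on that header.
+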
Deploy the second base model: From 54dae0974a0361cfde1f3f2b500c5dd3bd84c439 Mon Sep 17 00:00:00 2001 From: David Breitgand Date: Tue, 9 Dec 2025 13:00:40 +0200 Subject: [PATCH 12/12] Addressing reviewer (shmuelk) comment to include an explicit setting of PORT and IP when trying out multiple LLM setup --- .../guides/serve-multiple-genai-models.md | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index e3e5e4c08a..d621407c1c 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -313,9 +313,13 @@ kubectl get httproute llm-deepseek-route -o yaml ### Try the setup +First, make sure that the setup works as before by sending a request to the LoRA of the first model set up in the [`Getting started (Latest/Main)`](getting-started-latest.md) guide. + +--8<-- "site-src/_includes/test.md" + === "Chat Completions API" - 1. Send a few requests to Llama model to test that it works as before, as follows: + 1. Send a few requests to the Llama model directly: ```bash curl -X POST -i ${IP}:${PORT}/v1/chat/completions \ @@ -426,20 +430,19 @@ kubectl get httproute llm-deepseek-route -o yaml === "Completions API" - 1. Send a few requests to Llama model's LoRA as follows: - - ```bash - curl -X POST -i ${IP}:${PORT}/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "food-review-1", - "prompt": "Write as if you were a critic: San Francisco ", - "max_tokens": 100, - "temperature": 0 - }' - ``` + 1. Send a few requests to the Deepseek model: - 1. Send a few requests to the first Deepseek LoRA as follows: + ```bash + curl -X POST -i ${IP}:${PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek/vllm-deepseek-r1", + "prompt": "What is the best ski resort in Austria?", + "max_tokens": 20, + "temperature": 0 + }' + ``` + 1. Send a few requests to the first Deepseek LoRA as follows: ```bash curl -X POST -i ${IP}:${PORT}/v1/completions \