From 12a318c5eff39890f25b10e015359ff09dd0d150 Mon Sep 17 00:00:00 2001 From: "H P, Vijay Kumar" Date: Wed, 30 Oct 2024 11:16:07 +0530 Subject: [PATCH] K8S deployments with readme instructions to bring up on Gaudi machine --- InferenceasAService/Kubernetes/README.md | 66 ++++++ .../Kubernetes/chatqna-tei.yml | 167 ++++++++++++++ .../Kubernetes/chatqna-teirerank.yml | 165 ++++++++++++++ .../Kubernetes/chatqna-tgi-llama.yml | 133 ++++++++++++ .../Kubernetes/chatqna-tgi-llama70b.yml | 205 ++++++++++++++++++ 5 files changed, 736 insertions(+) create mode 100644 InferenceasAService/Kubernetes/README.md create mode 100644 InferenceasAService/Kubernetes/chatqna-tei.yml create mode 100644 InferenceasAService/Kubernetes/chatqna-teirerank.yml create mode 100644 InferenceasAService/Kubernetes/chatqna-tgi-llama.yml create mode 100644 InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml diff --git a/InferenceasAService/Kubernetes/README.md b/InferenceasAService/Kubernetes/README.md new file mode 100644 index 0000000000..98bec45a48 --- /dev/null +++ b/InferenceasAService/Kubernetes/README.md @@ -0,0 +1,66 @@ +# Kubernetes Deployments with gaudi devices +## Prerequisites +- **Kubernetes Cluster**: Access to a Kubernetes v1.29 cluster + - **CSI Driver**: The K8s cluster must have the CSI driver installed, using the [local-path-provisioner](https://github.com/rancher/local-path-provisioner) with `local_path_provisioner_claim_root` set to `/mnt`. + - **Operating System**: Ubuntu 22.04 + - **Gaudi Software Stack**: Verify that your setup uses a valid software stack for Gaudi accelerators, see [Gaudi support matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html). Note that running LLM on a CPU is possible but will significantly reduce performance. + - **Gaudi Firmware**:Make sure Firmware is installed on Gaudi nodes. 
Follow the [Gaudi Firmware Installation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare) guide for detailed instructions. + - **K8s Plugin for Gaudi**: Install the K8s plugin by following the instructions in [How to install K8s Plugin for Gaudi](https://docs.habana.ai/en/latest/Orchestration/Gaudi_Kubernetes/Device_Plugin_for_Kubernetes.html). + - **Hugging Face Model Access**: Ensure you have the necessary access to download and use the chosen Hugging Face model. For example, such access is mandatory when using the [Mixtral-8x22B](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1). + - **Helm CLIs installed** +------------ + +### Deploying the Intel Gaudi base operator on K8S + +Install the Operator on a cluster by deploying a Helm chart: + +#### Create the Operator namespace +``` +kubectl create namespace habana-ai-operator +kubectl label namespace habana-ai-operator pod-security.kubernetes.io/enforce=privileged --overwrite +kubectl label namespace habana-ai-operator pod-security.kubernetes.io/audit=privileged --overwrite +kubectl label namespace habana-ai-operator pod-security.kubernetes.io/warn=privileged --overwrite +``` + +#### Install Helm chart +``` +helm repo add gaudi-helm https://vault.habana.ai/artifactory/api/helm/gaudi-helm +helm repo update +helm install habana-ai-operator gaudi-helm/habana-ai-operator --version 1.18.0-524 -n habana-ai-operator +``` +------------ +### Kubernetes Deployment steps for each model The steps below list the Kubernetes deployments used for Inference as a Service on Habana Gaudi.
The following are example kubectl commands for TGI model inference +Make sure to update the HuggingFace token in the yaml files before applying them - HF_TOKEN: "" + +To deploy Llama3.1-8B on 1 card +``` +kubectl apply -f chatqna-tgi-llama.yml +``` +To deploy Llama3.1-70B on 8 cards +``` +kubectl apply -f chatqna-tgi-llama70b.yml +``` +To deploy text-embeddings-inference +``` +kubectl apply -f chatqna-tei.yml +kubectl apply -f chatqna-teirerank.yml +``` + +------------ + +## Verify pods and Services + +To verify the installation, +run the command `kubectl get pods -A` to make sure all pods are running. +run the command `kubectl get svc -A` to validate service specific configurations for all the models deployed above + +run the curl command below, substituting the service IP and port, to validate the model response +``` +curl -k http://<IP>:<PORT>/ -X POST -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' -H 'Content-Type: application/json' +``` +------------ +## License +The license to use TGI on Habana Gaudi is the one of TGI: https://github.com/huggingface/text-generation-inference/blob/main/LICENSE + +Please reach out to api-enterprise@huggingface.co if you have any questions.
\ No newline at end of file diff --git a/InferenceasAService/Kubernetes/chatqna-tei.yml b/InferenceasAService/Kubernetes/chatqna-tei.yml new file mode 100644 index 0000000000..ec7025dfc7 --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-tei.yml @@ -0,0 +1,167 @@ +--- +# Source: chatqna/charts/tei/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tei-config + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-base-en-v1.5" + PORT: "2081" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" + MAX_WARMUP_SEQUENCE_LENGTH: "512" +--- +# Source: chatqna/charts/tei/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tei + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2081 + protocol: TCP + name: tei + selector: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/tei/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tei + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + 
app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: tei + envFrom: + - configMapRef: + name: chatqna-tei-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tei-gaudi:latest" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2081 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + limits: + habana.ai/gaudi: 1 + volumes: + - name: model-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/tei/templates/horizontalPodAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tei/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/horizontalPorAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - 
https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 +--- +# Source: chatqna/templates/customMetrics.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/InferenceasAService/Kubernetes/chatqna-teirerank.yml b/InferenceasAService/Kubernetes/chatqna-teirerank.yml new file mode 100644 index 0000000000..a33d2f1064 --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-teirerank.yml @@ -0,0 +1,165 @@ +--- +# Source: chatqna/charts/teirerank/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-teirerank-config + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-reranker-base" + PORT: "2082" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" +--- +# Source: chatqna/charts/teirerank/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-teirerank + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2082 + protocol: TCP + name: teirerank + selector: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/teirerank/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: 
Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-teirerank + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: teirerank + envFrom: + - configMapRef: + name: chatqna-teirerank-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2082 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: model-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/teirerank/templates/horizontalPodAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/teirerank/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/horizontalPorAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 +--- +# Source: chatqna/templates/customMetrics.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/InferenceasAService/Kubernetes/chatqna-tgi-llama.yml b/InferenceasAService/Kubernetes/chatqna-tgi-llama.yml new file mode 100644 index 0000000000..e0f0b97dd7 --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-tgi-llama.yml @@ -0,0 +1,133 @@ +--- +# Source: chatqna/charts/tgi/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tgi-llama-config + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "meta-llama/Meta-Llama-3.1-8B-Instruct" + PORT: "2080" + HF_TOKEN: "" + http_proxy: "" + https_proxy: "" + no_proxy: "" + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/tmp/.cache/huggingface" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" +--- +# Source: chatqna/charts/tgi/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tgi-llama + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama + 
app.kubernetes.io/instance: tgillama + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2080 + protocol: TCP + name: tgillama + selector: + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama +--- +# Source: chatqna/charts/tgi/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tgi-llama + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama + template: + metadata: + labels: + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama + spec: + securityContext: + {} + containers: + - name: tgi + envFrom: + - configMapRef: + name: chatqna-tgi-llama-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2080 + protocol: TCP + livenessProbe: + failureThreshold: 24 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + resources: + limits: + habana.ai/gaudi: 1 + volumes: + - name: model-volume + emptyDir: {} + - name: tmp + emptyDir: {} \ No newline at 
end of file diff --git a/InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml b/InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml new file mode 100644 index 0000000000..9b5899893d --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml @@ -0,0 +1,205 @@ +--- +# Source: chatqna/charts/tgi/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tgi-llamab-config + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +data: + PORT: "2080" + HF_TOKEN: "" + http_proxy: "" + https_proxy: "" + no_proxy: "" + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/tmp/.cache/huggingface" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + NUM_SHARD: "8" + SHARDED: "true" + LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct" +--- + +# Source: chatqna/charts/tgi/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tgi-llamab + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2080 + protocol: TCP + name: tgillama70b + selector: + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b +--- +# Source: chatqna/charts/tgi/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tgi-llamab + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + 
app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + template: + metadata: + labels: + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + spec: + securityContext: + {} + hostIPC: true + containers: + - name: tgi + envFrom: + - configMapRef: + name: chatqna-tgi-llamab-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + add: + - SYS_NICE + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" + args: + - --model-id + - $(LLM_MODEL_ID) + - --sharded + - 'true' + - --num-shard + - $(NUM_SHARD) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-prefill-tokens + - '4096' + - --max-batch-total-tokens + - '524288' + - --waiting-served-ratio + - '1.2' + - --max-waiting-tokens + - '7' + - --max-concurrent-requests + - '512' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: '' + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + - name: MAX_TOTAL_TOKENS + value: '2048' + - name: BATCH_BUCKET_SIZE + value: '256' + - name: PREFILL_BATCH_BUCKET_SIZE + value: '4' + - name: PAD_SEQUENCE_TO_MULTIPLE_OF + value: '64' + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2080 + protocol: TCP 
+ livenessProbe: + failureThreshold: 60 + initialDelaySeconds: 1800 + periodSeconds: 30 + tcpSocket: + port: http + readinessProbe: + initialDelaySeconds: 1800 + periodSeconds: 30 + tcpSocket: + port: http + startupProbe: + failureThreshold: 300 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + resources: + limits: + habana.ai/gaudi: 8 + cpu: 80 + memory: 600Gi + requests: + habana.ai/gaudi: 8 + cpu: 80 + memory: 600Gi + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: instance + operator: In + values: + - gaudiworker + volumes: + - name: model-volume + emptyDir: {} + - name: tmp + emptyDir: {} \ No newline at end of file