From 12a318c5eff39890f25b10e015359ff09dd0d150 Mon Sep 17 00:00:00 2001 From: "H P, Vijay Kumar" Date: Wed, 30 Oct 2024 11:16:07 +0530 Subject: [PATCH] K8S deployments with readme instructions to bring up on Gaudi machine --- InferenceasAService/Kubernetes/README.md | 66 ++++++ .../Kubernetes/chatqna-tei.yml | 167 ++++++++++++++ .../Kubernetes/chatqna-teirerank.yml | 165 ++++++++++++++ .../Kubernetes/chatqna-tgi-llama.yml | 133 ++++++++++++ .../Kubernetes/chatqna-tgi-llama70b.yml | 205 ++++++++++++++++++ 5 files changed, 736 insertions(+) create mode 100644 InferenceasAService/Kubernetes/README.md create mode 100644 InferenceasAService/Kubernetes/chatqna-tei.yml create mode 100644 InferenceasAService/Kubernetes/chatqna-teirerank.yml create mode 100644 InferenceasAService/Kubernetes/chatqna-tgi-llama.yml create mode 100644 InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml diff --git a/InferenceasAService/Kubernetes/README.md b/InferenceasAService/Kubernetes/README.md new file mode 100644 index 0000000000..98bec45a48 --- /dev/null +++ b/InferenceasAService/Kubernetes/README.md @@ -0,0 +1,66 @@ +# Kubernetes Deployments with gaudi devices +## Prerequisites +- **Kubernetes Cluster**: Access to a Kubernetes v1.29 cluster + - **CSI Driver**: The K8s cluster must have the CSI driver installed, using the [local-path-provisioner](https://github.com/rancher/local-path-provisioner) with `local_path_provisioner_claim_root` set to `/mnt`. + - **Operating System**: Ubuntu 22.04 + - **Gaudi Software Stack**: Verify that your setup uses a valid software stack for Gaudi accelerators, see [Gaudi support matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html). Note that running LLM on a CPU is possible but will significantly reduce performance. + - **Gaudi Firmware**:Make sure Firmware is installed on Gaudi nodes. 
Follow the [Gaudi Firmware Installation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#driver-fw-install-bare) guide for detailed instructions. + - **K8s Plugin for Gaudi**: Install the K8s plugin by following the instructions in [How to install K8s Plugin for Gaudi](https://docs.habana.ai/en/latest/Orchestration/Gaudi_Kubernetes/Device_Plugin_for_Kubernetes.html). + - **Hugging Face Model Access**: Ensure you have the necessary access to download and use the chosen Hugging Face model. For example, such access is mandatory when using the [Mixtral-8x22B](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1). + - **Helm CLIs installed** +------------ + +### Deploying the Intel Gaudi base operator on K8S + +Install the Operator on a cluster by deploying a Helm chart: + +#### Create the Operator namespace +``` +kubectl create namespace habana-ai-operator +kubectl label namespace habana-ai-operator pod-security.kubernetes.io/enforce=privileged --overwrite +kubectl label namespace habana-ai-operator pod-security.kubernetes.io/audit=privileged --overwrite +kubectl label namespace habana-ai-operator pod-security.kubernetes.io/warn=privileged --overwrite +``` + +#### Install Helm chart +``` +helm repo add gaudi-helm https://vault.habana.ai/artifactory/api/helm/gaudi-helm +helm repo update +helm install habana-ai-operator gaudi-helm/habana-ai-operator --version 1.18.0-524 -n habana-ai-operator +``` +------------ +### Kubernetes Deployment steps for each model The steps below list the Kubernetes deployments used for Inference as a Service on Habana Gaudi.
The following are example kubectl commands for TGI model inference +Make sure to update the HuggingFace token in the yaml files before applying them - HF_TOKEN: "" + +To deploy Llama3.1-8B on 1 card +``` +kubectl apply -f chatqna-tgi-llama.yml +``` +To deploy Llama3.1-70B on 8 cards +``` +kubectl apply -f chatqna-tgi-llama70b.yml +``` +To deploy text-embeddings-inference +``` +kubectl apply -f chatqna-tei.yml +kubectl apply -f chatqna-teirerank.yml +``` + +------------ + +## Verify pods and Services + +To verify the installation, +run the command `kubectl get pods -A` to make sure all pods are running. +run the command `kubectl get svc -A` to validate service specific configurations for all the models deployed above + +run the curl command below, substituting the service IP and port, to validate the model response +``` +curl -k http://<IP>:<PORT>/ -X POST -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' -H 'Content-Type: application/json' +``` +------------ +## License +The license to use TGI on Habana Gaudi is the one of TGI: https://github.com/huggingface/text-generation-inference/blob/main/LICENSE + +Please reach out to api-enterprise@huggingface.co if you have any questions.
\ No newline at end of file diff --git a/InferenceasAService/Kubernetes/chatqna-tei.yml b/InferenceasAService/Kubernetes/chatqna-tei.yml new file mode 100644 index 0000000000..ec7025dfc7 --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-tei.yml @@ -0,0 +1,167 @@ +--- +# Source: chatqna/charts/tei/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tei-config + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-base-en-v1.5" + PORT: "2081" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" + MAX_WARMUP_SEQUENCE_LENGTH: "512" +--- +# Source: chatqna/charts/tei/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tei + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2081 + protocol: TCP + name: tei + selector: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/tei/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tei + labels: + helm.sh/chart: tei-1.0.0 + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + 
app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: tei + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: tei + envFrom: + - configMapRef: + name: chatqna-tei-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tei-gaudi:latest" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2081 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + limits: + habana.ai/gaudi: 1 + volumes: + - name: model-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/tei/templates/horizontalPodAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tei/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/horizontalPorAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - 
https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 +--- +# Source: chatqna/templates/customMetrics.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/InferenceasAService/Kubernetes/chatqna-teirerank.yml b/InferenceasAService/Kubernetes/chatqna-teirerank.yml new file mode 100644 index 0000000000..a33d2f1064 --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-teirerank.yml @@ -0,0 +1,165 @@ +--- +# Source: chatqna/charts/teirerank/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-teirerank-config + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "BAAI/bge-reranker-base" + PORT: "2082" + http_proxy: "" + https_proxy: "" + no_proxy: "" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" +--- +# Source: chatqna/charts/teirerank/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-teirerank + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2082 + protocol: TCP + name: teirerank + selector: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna +--- +# Source: chatqna/charts/teirerank/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: 
Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-teirerank + labels: + helm.sh/chart: teirerank-1.0.0 + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "cpu-1.5" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + template: + metadata: + labels: + app.kubernetes.io/name: teirerank + app.kubernetes.io/instance: chatqna + spec: + securityContext: + {} + containers: + - name: teirerank + envFrom: + - configMapRef: + name: chatqna-teirerank-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" + imagePullPolicy: IfNotPresent + args: + - "--auto-truncate" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2082 + protocol: TCP + livenessProbe: + failureThreshold: 24 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + resources: + {} + volumes: + - name: model-volume + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + - name: tmp + emptyDir: {} +--- +# Source: chatqna/charts/teirerank/templates/horizontalPodAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/teirerank/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/horizontalPorAutoscaler.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +--- +# Source: chatqna/charts/tgi/templates/servicemonitor.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 +--- +# Source: chatqna/templates/customMetrics.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/InferenceasAService/Kubernetes/chatqna-tgi-llama.yml b/InferenceasAService/Kubernetes/chatqna-tgi-llama.yml new file mode 100644 index 0000000000..e0f0b97dd7 --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-tgi-llama.yml @@ -0,0 +1,133 @@ +--- +# Source: chatqna/charts/tgi/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tgi-llama-config + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +data: + MODEL_ID: "meta-llama/Meta-Llama-3.1-8B-Instruct" + PORT: "2080" + HF_TOKEN: "" + http_proxy: "" + https_proxy: "" + no_proxy: "" + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/tmp/.cache/huggingface" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" +--- +# Source: chatqna/charts/tgi/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tgi-llama + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama + 
app.kubernetes.io/instance: tgillama + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2080 + protocol: TCP + name: tgillama + selector: + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama +--- +# Source: chatqna/charts/tgi/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tgi-llama + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama + template: + metadata: + labels: + app.kubernetes.io/name: tgillama + app.kubernetes.io/instance: tgillama + spec: + securityContext: + {} + containers: + - name: tgi + envFrom: + - configMapRef: + name: chatqna-tgi-llama-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2080 + protocol: TCP + livenessProbe: + failureThreshold: 24 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + startupProbe: + failureThreshold: 120 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + resources: + limits: + habana.ai/gaudi: 1 + volumes: + - name: model-volume + emptyDir: {} + - name: tmp + emptyDir: {} \ No newline at 
end of file diff --git a/InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml b/InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml new file mode 100644 index 0000000000..9b5899893d --- /dev/null +++ b/InferenceasAService/Kubernetes/chatqna-tgi-llama70b.yml @@ -0,0 +1,205 @@ +--- +# Source: chatqna/charts/tgi/templates/configmap.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: chatqna-tgi-llamab-config + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgi + app.kubernetes.io/instance: chatqna + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +data: + PORT: "2080" + HF_TOKEN: "" + http_proxy: "" + https_proxy: "" + no_proxy: "" + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/tmp/.cache/huggingface" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + NUM_SHARD: "8" + SHARDED: "true" + LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct" +--- + +# Source: chatqna/charts/tgi/templates/service.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: chatqna-tgi-llamab + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 2080 + protocol: TCP + name: tgillama70b + selector: + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b +--- +# Source: chatqna/charts/tgi/templates/deployment.yaml +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-tgi-llamab + labels: + helm.sh/chart: tgi-1.0.0 + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + 
app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/managed-by: Helm +spec: + # use explicit replica counts only of HorizontalPodAutoscaler is disabled + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + template: + metadata: + labels: + app.kubernetes.io/name: tgillama70b + app.kubernetes.io/instance: tgillama70b + spec: + securityContext: + {} + hostIPC: true + containers: + - name: tgi + envFrom: + - configMapRef: + name: chatqna-tgi-llamab-config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + add: + - SYS_NICE + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 + seccompProfile: + type: RuntimeDefault + image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" + args: + - --model-id + - $(LLM_MODEL_ID) + - --sharded + - 'true' + - --num-shard + - $(NUM_SHARD) + - --max-input-length + - '1024' + - --max-total-tokens + - '2048' + - --max-batch-prefill-tokens + - '4096' + - --max-batch-total-tokens + - '524288' + - --waiting-served-ratio + - '1.2' + - --max-waiting-tokens + - '7' + - --max-concurrent-requests + - '512' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: '' + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + - name: MAX_TOTAL_TOKENS + value: '2048' + - name: BATCH_BUCKET_SIZE + value: '256' + - name: PREFILL_BATCH_BUCKET_SIZE + value: '4' + - name: PAD_SEQUENCE_TO_MULTIPLE_OF + value: '64' + - name: ENABLE_HPU_GRAPH + value: 'true' + - name: LIMIT_HPU_GRAPH + value: 'true' + - name: USE_FLASH_ATTENTION + value: 'true' + - name: FLASH_ATTENTION_RECOMPUTE + value: 'true' + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: 2080 + protocol: TCP 
+ livenessProbe: + failureThreshold: 60 + initialDelaySeconds: 1800 + periodSeconds: 30 + tcpSocket: + port: http + readinessProbe: + initialDelaySeconds: 1800 + periodSeconds: 30 + tcpSocket: + port: http + startupProbe: + failureThreshold: 300 + initialDelaySeconds: 5 + periodSeconds: 5 + tcpSocket: + port: http + resources: + limits: + habana.ai/gaudi: 8 + cpu: 80 + memory: 600Gi + requests: + habana.ai/gaudi: 8 + cpu: 80 + memory: 600Gi + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: instance + operator: In + values: + - gaudiworker + volumes: + - name: model-volume + emptyDir: {} + - name: tmp + emptyDir: {} \ No newline at end of file