From de732607412d68d2121cbeacbd6350bf1b3fad70 Mon Sep 17 00:00:00 2001 From: xinyu Date: Thu, 19 Feb 2026 22:25:32 +0800 Subject: [PATCH] Allow more memory usage to avoid OOM --- deploy/helm/values.yaml | 5 ++--- slm_server/config.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index 94a5b89..1631cf1 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -79,12 +79,11 @@ env: {} # Resource requests and limits for the container. # See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ -# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) on 1-CPU / 1 GB VPS nodes. -# Previous values for Q8_0 (805 MB): limits cpu=3/mem=800Mi, requests cpu=50m/mem=32Mi +# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) + n_ctx=8192 KV cache (~448 MB) on 1-CPU / 1 GB VPS nodes. resources: limits: cpu: 1 - memory: 700Mi + memory: 1Gi requests: cpu: 200m memory: 600Mi diff --git a/slm_server/config.py b/slm_server/config.py index 55eadfd..7e90577 100644 --- a/slm_server/config.py +++ b/slm_server/config.py @@ -62,7 +62,7 @@ class Settings(BaseSettings): description="Owner label for /models list. Set SLM_MODEL_OWNER to override.", ) n_ctx: int = Field( - 4096, description="Maximum context window (input + generated tokens)." + 8192, description="Maximum context window (input + generated tokens)." ) n_threads: int = Field( 2, description="Number of OpenMP threads llama‑cpp will spawn.