From de732607412d68d2121cbeacbd6350bf1b3fad70 Mon Sep 17 00:00:00 2001 From: xinyu Date: Thu, 19 Feb 2026 22:25:32 +0800 Subject: [PATCH] Allow more memory usage to avoid OOM --- deploy/helm/values.yaml | 5 ++--- slm_server/config.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index 94a5b89..1631cf1 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -79,12 +79,11 @@ env: {} # Resource requests and limits for the container. # See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ -# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) on 1-CPU / 1 GB VPS nodes. -# Previous values for Q8_0 (805 MB): limits cpu=3/mem=800Mi, requests cpu=50m/mem=32Mi +# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) + n_ctx=8192 KV cache (~448 MB) on 1-CPU / 1 GB VPS nodes. resources: limits: cpu: 1 - memory: 700Mi + memory: 1Gi requests: cpu: 200m memory: 600Mi diff --git a/slm_server/config.py b/slm_server/config.py index 55eadfd..7e90577 100644 --- a/slm_server/config.py +++ b/slm_server/config.py @@ -62,7 +62,7 @@ class Settings(BaseSettings): description="Owner label for /models list. Set SLM_MODEL_OWNER to override.", ) n_ctx: int = Field( - 4096, description="Maximum context window (input + generated tokens)." + 8192, description="Maximum context window (input + generated tokens)." ) n_threads: int = Field( 2, description="Number of OpenMP threads llama‑cpp will spawn.