CogStack · jocelyneholdbrook · Apr 23, 2026 · Apr 22, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/cogstack-cohorter/WebAPP/.dockerignore b/cogstack-cohorter/WebAPP/.dockerignore
@@ -3,5 +3,8 @@ client/node_modules/
 client-react/node_modules/
 server/node_modules/
 
-# Data files are never baked into the image — supply them via volume mount at runtime
+# Runtime-generated patient data and full production SNOMED files — these are
+# either generated at startup (random mode) or supplied via a volume mount
+# (production mode). They must NOT be baked into the image.
 server/data/
+
diff --git a/cogstack-cohorter/WebAPP/entrypoint.sh b/cogstack-cohorter/WebAPP/entrypoint.sh
@@ -3,26 +3,20 @@ set -e
 
 DATA_DIR=/usr/src/app/server/data
 
-# ── Step 1: extract archive if JSON data isn't already present ────────────────
-if [ ! -f "$DATA_DIR/snomed_terms.json" ]; then
-    if [ -f "$DATA_DIR/snomed_terms_data.tar.gz" ]; then
-        echo "[webapp] Extracting SNOMED data archive..."
-        tar xzvf "$DATA_DIR/snomed_terms_data.tar.gz" -C "$DATA_DIR"
-    else
-        echo "[webapp] ERROR: No data found at $DATA_DIR." >&2
-        echo "[webapp] Mount a directory containing snomed_terms.json (and related files)" >&2
-        echo "[webapp] or snomed_terms_data.tar.gz via a Docker volume:" >&2
-        echo "[webapp]   -v /your/data:/usr/src/app/server/data" >&2
-        exit 1
-    fi
-fi
+# Ensure the writable data directory exists (may be an emptyDir or PVC mount).
+mkdir -p "$DATA_DIR"
 
-# ── Step 2 (optional): generate random patient data ───────────────────────────
-# Set RANDOM_DATA=true in the container environment to generate synthetic data.
-if [ "${RANDOM_DATA}" = "true" ]; then
+# ── Generate random patient data on first startup (random/demo mode) ──────────
+if [ "${RANDOM_DATA}" = "true" ] && \
+   { [ ! -f "$DATA_DIR/ptt2age.json" ]      || \
+     [ ! -f "$DATA_DIR/ptt2sex.json" ]      || \
+     [ ! -f "$DATA_DIR/ptt2eth.json" ]      || \
+     [ ! -f "$DATA_DIR/ptt2dod.json" ]      || \
+     [ ! -f "$DATA_DIR/cui2ptt_pos.jsonl" ] || \
+     [ ! -f "$DATA_DIR/cui2ptt_tsp.jsonl" ]; }; then
     echo "[webapp] Generating random demo patient data..."
     node --max-old-space-size=32768 /usr/src/app/server/gen_random_data.js
 fi
 
-# ── Step 3: start the server ──────────────────────────────────────────────────
+# ── Start the server ──────────────────────────────────────────────────────────
 exec node --max-old-space-size=32768 server.js
diff --git a/cogstack-cohorter/WebAPP/server/data-example/cui_pt2ch_example.json b/cogstack-cohorter/WebAPP/server/data-example/cui_pt2ch_example.json
diff --git a/cogstack-cohorter/WebAPP/server/data-example/snomed_example.json b/cogstack-cohorter/WebAPP/server/data-example/snomed_example.json
diff --git a/cogstack-cohorter/WebAPP/server/gen_random_data.js b/cogstack-cohorter/WebAPP/server/gen_random_data.js
@@ -3,7 +3,7 @@
 // node --max-old-space-size=32768 gen_random_data.js
 console.log('Generating random data')
 const fs = require('fs');
-const snomed_terms = require('./data/snomed_terms.json');
+const snomed_terms = require('./data-example/snomed_example.json');
 
 // Returns a random integer between min (inclusive) and max (inclusive).
 function random_int(min, max) {

diff --git a/cogstack-cohorter/WebAPP/server/server.js b/cogstack-cohorter/WebAPP/server/server.js
@@ -33,9 +33,15 @@ const NL2DSL_SERVER = process.env.NL2DSL_SERVER || "http://localhost:3002/api/co
 let port = process.env.PORT || 3000;
 console.log('Loading data...');
 
-// index all the snomed concepts
-const snomed_terms = require('./data/snomed_terms.json');
-const cui_pt2ch = require('./data/cui_pt2ch.json');
+// Random/demo mode (the default when RANDOM_DATA is unset) uses the example subset baked into the image.
+// Production mode (RANDOM_DATA set to anything other than 'true') uses the full SNOMED files supplied via volume mount.
+const isRandomMode = (process.env.RANDOM_DATA || 'true') === 'true';
+const snomed_terms = isRandomMode
+    ? require('./data-example/snomed_example.json')
+    : require('./data/snomed_terms.json');
+const cui_pt2ch = isRandomMode
+    ? require('./data-example/cui_pt2ch_example.json')
+    : require('./data/cui_pt2ch.json');
 
 // for admin login
 const admin_pwd = process.env.PASSWORD || 'admin_pass';

diff --git a/helm-charts/cogstack-cohorter-helm/ci/ci-values.yaml b/helm-charts/cogstack-cohorter-helm/ci/ci-values.yaml
@@ -1,36 +1,60 @@
 # CI smoke-test overrides.
-# An init container seeds the empty data volume with minimal stub SNOMED data
-# so the webapp entrypoint can proceed and RANDOM_DATA=true can generate
-# synthetic patient records without requiring a real data mount.
+# Only the webapp is deployed; NL2DSL, MedCAT, and Ollama are disabled to keep
+# the smoke test fast and resource-light.
 webapp:
   env:
     RANDOM_DATA: "true"
+
   persistence:
     enabled: false
+
+  # Seed the data volume before the webapp container starts so that
+  # gen_random_data.js is skipped (all six generated files are present).
   initContainers:
-    - name: init-snomed-stub
+    - name: init-ci-stub
       image: busybox
       command:
         - sh
         - -c
         - |
           mkdir -p /data
-          # Minimal snomed_terms.json — a few entries covering each clinical
-          # category that gen_random_data.js filters on.
-          cat > /data/snomed_terms.json << 'EOF'
-          [
-            {"cui":"73211009","str":"Diabetes mellitus (disorder)"},
-            {"cui":"44054006","str":"Diabetes mellitus type 2 (disorder)"},
-            {"cui":"38341003","str":"Hypertensive disorder (disorder)"},
-            {"cui":"195967001","str":"Asthma (disorder)"},
-            {"cui":"271807003","str":"Eruption of skin (finding)"},
-            {"cui":"386661006","str":"Fever (finding)"},
-            {"cui":"80146002","str":"Appendectomy (procedure)"},
-            {"cui":"387517004","str":"Paracetamol (substance)"}
-          ]
-          EOF
-          # cui_pt2ch.json — empty hierarchy is valid; server handles missing keys
+          # Minimal patient stubs so gen_random_data.js is skipped on startup.
+          # All six generated files must be present (entrypoint checks every one).
+          echo '{"0":25,"1":42,"2":67}' > /data/ptt2age.json
+          echo '{"0":"Male","1":"Female","2":"Male"}' > /data/ptt2sex.json
+          echo '{"0":"White","1":"Asian","2":"Black"}' > /data/ptt2eth.json
+          echo '{"0":0,"1":0,"2":0}' > /data/ptt2dod.json
+          echo '{"73211009":{"0":3},"386661006":{"1":1}}' > /data/cui2ptt_pos.jsonl
+          echo '{"73211009":{"0":1609459200},"386661006":{"1":1609459200}}' > /data/cui2ptt_tsp.jsonl
+          # Legacy stubs for old Docker Hub images that expect these files.
+          echo '[{"cui":"73211009","str":"Diabetes mellitus (disorder)"},{"cui":"386661006","str":"Fever (finding)"},{"cui":"80146002","str":"Appendectomy (procedure)"},{"cui":"387517004","str":"Paracetamol (substance)"}]' > /data/snomed_terms.json
           echo '{}' > /data/cui_pt2ch.json
       volumeMounts:
         - name: data
           mountPath: /data
+
+  # Relax probes for CI — image pull + Node server start-up (data loading) can be slow on shared runners.
+  livenessProbe:
+    httpGet:
+      path: /
+      port: http
+    initialDelaySeconds: 120
+    periodSeconds: 15
+    failureThreshold: 5
+
+  readinessProbe:
+    httpGet:
+      path: /
+      port: http
+    initialDelaySeconds: 60
+    periodSeconds: 10
+    failureThreshold: 6
+
+nl2dsl:
+  enabled: false
+
+medcat:
+  enabled: false
+
+ollama:
+  enabled: false
diff --git a/helm-charts/cogstack-cohorter-helm/values.yaml b/helm-charts/cogstack-cohorter-helm/values.yaml
@@ -114,19 +114,27 @@ webapp:
   # Environment variables injected into the webapp container.
   # NL2DSL_SERVER is set automatically from nl2dsl.service.port.
   env:
-    # Set to "true" to generate synthetic patient data on first startup (demo mode).
-    RANDOM_DATA: "false"
+    # true  → demo mode: bundled SNOMED data is used and synthetic patient records are generated on start-up.
+    # false → production mode: supply real MIMIC-IV-shaped EHR data via data.downloadUrl or a pre-populated persistence.existingClaim.
+    RANDOM_DATA: "true"
+
+  # EHR data configuration (production / non-random mode).
+  # Ignored when RANDOM_DATA=true — the image's bundled demo data is used instead.
+  data:
+    # URL to download a data archive (tar.gz) at startup.
+    # The archive must contain snomed_terms.json, cui_pt2ch.json, and MIMIC-IV-shaped patient data files.
+    # Example: "https://your-storage/ehr_data.tar.gz"
+    downloadUrl: ""
 
   service:
     type: ClusterIP
     port: 3000
 
-  # Persistent volume for SNOMED data directory (/usr/src/app/server/data).
-  # Populate the PVC with snomed_terms_data.tar.gz (auto-extracted on startup)
-  # or the pre-extracted files (snomed_terms.json, cui_pt2ch.json, patient data).
+  # Persistent volume for the data directory (/usr/src/app/server/data).
+  # In random/demo mode this stores the generated patient records across restarts.
+  # In production mode this is where real EHR data lives.
   persistence:
     enabled: true
-    # Use an existing PVC instead of creating a new one.
     existingClaim: ""
     storageClass: ""
     accessMode: ReadWriteOnce