From cbae356b40e0fb371081df61117a066f5652c26f Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Sat, 14 Feb 2026 12:28:52 -0500
Subject: [PATCH 01/38] chore: archive v1.1 milestone

Archive Deployment, Workflow & Competitive Parity milestone:
- 7 phases (8-14), 20 plans, 26/26 requirements complete
- Roadmap and requirements archived to .planning/milestones/
- PROJECT.md evolved with validated v1.1 requirements
- ROADMAP.md collapsed v1.1 into details tag

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/MILESTONES.md                       |  28 ++++
 .planning/PROJECT.md                          | 109 +++++--------
 .planning/ROADMAP.md                          | 143 +++---------------
 .planning/STATE.md                            | 109 +++----------
 .../v1.1-REQUIREMENTS.md}                     |  57 +------
 .planning/milestones/v1.1-ROADMAP.md          | 131 ++++++++++++++++
 6 files changed, 241 insertions(+), 336 deletions(-)
 rename .planning/{REQUIREMENTS.md => milestones/v1.1-REQUIREMENTS.md} (68%)
 create mode 100644 .planning/milestones/v1.1-ROADMAP.md

diff --git a/.planning/MILESTONES.md b/.planning/MILESTONES.md
index 23c5086..f4545bd 100644
--- a/.planning/MILESTONES.md
+++ b/.planning/MILESTONES.md
@@ -1,5 +1,33 @@
 # Project Milestones: DataVisor
 
+## v1.1 Deployment, Workflow & Competitive Parity (Shipped: 2026-02-13)
+
+**Delivered:** Production-ready Docker deployment, smart dataset ingestion UI, annotation editing, error triage workflows, interactive visualizations with grid filtering, keyboard shortcuts, and per-annotation TP/FP/FN classification.
+
+**Phases completed:** 8-14 (20 plans total)
+
+**Key accomplishments:**
+
+- Production-ready Docker stack (Caddy + FastAPI + Next.js) with single-user auth, GCP deployment scripts, and comprehensive documentation
+- Smart dataset ingestion wizard with auto-detection of COCO layouts (Roboflow/Standard/Flat) and multi-split support
+- Annotation editing via react-konva canvas (move, resize, draw, delete bounding boxes) with DuckDB persistence
+- Error triage workflow: per-sample tagging, per-annotation TP/FP/FN auto-classification via IoU matching, worst-images ranking, and highlight mode
+- Interactive data discovery: clickable confusion matrix, near-duplicate detection, histogram filtering, and find-similar — all piping results to the grid
+- Full keyboard navigation with 16 shortcuts across grid, modal, triage, and editing contexts
+
+**Stats:**
+
+- 171 files created/modified
+- ~19,460 lines of code added (9,306 Python + 10,154 TypeScript)
+- 7 phases, 20 plans, 97 commits
+- 2 days (Feb 12-13, 2026)
+
+**Git range:** `a83d6cf` → `1bed6cf`
+
+**What's next:** Format expansion (YOLO/VOC), PR curves, per-class AP metrics
+
+---
+
 ## v1.0 MVP (Shipped: 2026-02-12)
 
 **Delivered:** A unified CV dataset introspection tool with visual browsing, annotation overlays, model comparison, embedding visualization, error analysis, and AI-powered pattern detection.
diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md
index 61ab957..b7f7f13 100644
--- a/.planning/PROJECT.md
+++ b/.planning/PROJECT.md
@@ -8,80 +8,45 @@ DataVisor is an open-source dataset introspection tool for computer vision — a
 
 A single tool that replaces scattered one-off scripts: load any CV dataset, visually browse with annotation overlays, compare ground truth against predictions, cluster via embeddings, and surface mistakes — all in one workflow.
 
+## Current State
+
+**Shipped:** v1.1 (2026-02-13)
+**Codebase:** ~32K LOC (16,256 Python + 15,924 TypeScript) across 14 phases
+**Architecture:** FastAPI + DuckDB + Qdrant (backend), Next.js + Tailwind + deck.gl + Recharts (frontend), Pydantic AI (agents), Moondream2 (VLM)
+
 ## Requirements
 
 ### Validated
 
-- ✓ Multi-format ingestion (COCO) with streaming parser architecture — v1.0
-- ✓ DuckDB-backed metadata storage for fast analytical queries over 100K+ samples — v1.0
-- ✓ Virtualized infinite-scroll grid view with overlaid bounding box annotations — v1.0
-- ✓ Ground Truth vs Model Predictions comparison toggle (solid vs dashed lines) — v1.0
-- ✓ Deterministic class-to-color hashing (same class = same color across sessions) — v1.0
-- ✓ t-SNE embedding generation from images (DINOv2-base) — v1.0
-- ✓ deck.gl-powered 2D embedding scatterplot with zoom, pan, and lasso selection — v1.0
-- ✓ Lasso-to-grid filtering (select cluster points → filter grid to those images) — v1.0
-- ✓ Hover thumbnails on embedding map points — v1.0
-- ✓ Qdrant vector storage for embedding similarity search — v1.0
-- ✓ Error categorization: Hard False Positives, Label Errors, False Negatives — v1.0
-- ✓ Pydantic AI agent that monitors error distribution and recommends actions — v1.0
-- ✓ Pattern detection (e.g., "90% of False Negatives occur in low-light images") — v1.0
-- ✓ Import pre-computed predictions (JSON) — v1.0
-- ✓ BasePlugin class for Python extensibility — v1.0
-- ✓ Local disk and GCS image source support — v1.0
-- ✓ Dynamic metadata filtering (sidebar filters on any metadata field) — v1.0
-- ✓ VLM auto-tagging (Moondream2) for scene attribute tags — v1.0
-- ✓ Search by filename and sort by metadata — v1.0
-- ✓ Save and load filter configurations (saved views) — v1.0
-- ✓ Add/remove tags (individual + bulk) — v1.0
-- ✓ Sample detail modal with full-resolution image — v1.0
-- ✓ Dataset statistics dashboard (class distribution, annotation counts) — v1.0
-
-### Active
-
-- [ ] Dockerized deployment with single-user auth for secure cloud VM access
-- [ ] GCP deployment script + local run script with setup instructions
-- [ ] Smart dataset ingestion UI (point at folder → auto-detect train/val/test splits → import)
-- [ ] Annotation editing in the UI (move, resize, delete bounding boxes — depth TBD)
-- [ ] Error triage workflow (tag FP/TP/FN/mistake, highlight errors, dim non-errors)
-- [ ] Smart "worst images" ranking (combined score: errors + confidence + uniqueness)
-- [ ] Keyboard shortcuts for navigation
-- [ ] Competitive feature parity with FiftyOne/Encord (gaps TBD after research)
+- Streaming COCO ingestion with ijson at 100K+ scale, local + GCS sources — v1.0
+- DuckDB metadata storage with fast analytical queries — v1.0
+- Virtualized grid with SVG annotation overlays, deterministic color hashing — v1.0
+- GT vs Predictions comparison toggle — v1.0
+- t-SNE embeddings with deck.gl scatter plot, lasso-to-grid filtering — v1.0
+- Error categorization (TP/FP/FN/Label Error) + Qdrant similarity search — v1.0
+- Pydantic AI agent for error patterns + Moondream2 VLM auto-tagging — v1.0
+- Metadata filtering, search, saved views, bulk tagging — v1.0
+- Docker 3-service stack with Caddy auth, GCP deployment scripts — v1.1
+- Smart ingestion UI with auto-detection of COCO layouts and multi-split support — v1.1
+- Annotation editing via react-konva (move, resize, draw, delete) — v1.1
+- Error triage: sample tagging, per-annotation TP/FP/FN via IoU, worst-images ranking, highlight mode — v1.1
+- Interactive discovery: confusion matrix, near-duplicates, histogram filtering, find-similar — v1.1
+- Keyboard shortcuts: 16 shortcuts across grid, modal, triage, editing — v1.1
 
 ### Out of Scope
 
-- Multi-user collaboration — personal tool, single-user auth only for VM security
-- Video annotation support — image-only for now
-- Training pipeline integration — DataVisor inspects data, doesn't train models
+- Multi-user collaboration — personal tool, single-user auth only
+- Video annotation support — image-only
+- Training pipeline integration — DataVisor inspects data, doesn't train
 - Mobile/tablet interface — desktop browser only
-- Real-time streaming inference — batch-oriented analysis
-- Full annotation editor (draw new boxes, complex labeling workflows) — quick corrections only, not CVAT replacement
-
-## Current Milestone: v1.1 Deployment, Workflow & Competitive Parity
-
-**Goal:** Make DataVisor deployable (Docker + GCP), secure for cloud access, and close key workflow gaps vs FiftyOne/Encord — smart ingestion, error triage, annotation corrections, and keyboard-driven navigation.
-
-**Target features:**
-- Dockerized project with single-user auth (basic auth for cloud VM security)
-- GCP deployment script + local run script
-- Smart dataset ingestion UI (auto-detect folder structure, train/val/test splits)
-- Annotation management (organize + quick edit: move/resize/delete bboxes)
-- Error triage & data curation workflow (tag, highlight, rank worst images)
-- Keyboard shortcuts for navigation
-- Competitive gaps from FiftyOne/Encord analysis
-
-## Context
-
-Shipped v1.0 with 12,720 LOC (6,950 Python + 5,770 TypeScript) across 7 phases and 21 plans.
-Tech stack: FastAPI + DuckDB + Qdrant (backend), Next.js + Tailwind + deck.gl + Recharts (frontend), Pydantic AI (agents), Moondream2 (VLM).
-59 backend tests passing. TypeScript compiles with 0 errors.
-Architecture: 3 Zustand stores, FastAPI DI, source discriminator for GT/prediction separation, 4 SSE progress streams, lazy model loading.
+- Full annotation editor (polygons, segmentation) — bounding box only
 
 ## Constraints
 
 - **Tech Stack**: FastAPI + DuckDB + Qdrant (backend), Next.js + Tailwind + deck.gl (frontend), Pydantic AI (agents) — established
 - **Performance**: Must handle 100K+ images without UI lag; DuckDB for metadata queries, deck.gl for WebGL rendering, virtualized scrolling
 - **Storage**: Supports both local filesystem and GCS bucket sources
-- **GPU**: VLM inference (Moondream2) supports MPS/CUDA/CPU auto-detection; DINOv2 embeddings likewise
+- **GPU**: VLM inference (Moondream2) supports MPS/CUDA/CPU auto-detection; SigLIP embeddings likewise
 - **Extensibility**: BasePlugin architecture exists; hooks system ready for expansion
 - **Python**: 3.14+ (numba/umap-learn incompatible; using scikit-learn t-SNE)
 
@@ -89,16 +54,18 @@ Architecture: 3 Zustand stores, FastAPI DI, source discriminator for GT/predicti
 
 | Decision | Rationale | Outcome |
 |----------|-----------|---------|
-| DuckDB over SQLite | Analytical queries on metadata at scale; columnar storage for filtering 100K+ rows | ✓ Good |
-| Qdrant over FAISS | Payload filtering support; Rust-based performance; local deployment | ✓ Good |
-| deck.gl for embedding viz | WebGL-powered; handles millions of points; lasso/interaction built-in | ✓ Good |
-| Pydantic AI for agents | Type-safe agent definitions; native FastAPI/Pydantic integration | ✓ Good |
-| Deterministic color hashing | Class names hash to consistent colors across sessions; no manual palette | ✓ Good |
-| Plugin hooks over monolith | Ingestion/UI/transformation hooks enable domain-specific extensions without forking | ✓ Good |
-| Source discriminator column | Clean GT/prediction separation in annotations table via source field | ✓ Good |
-| Lazy model loading | VLM and Qdrant loaded on-demand, not at startup, to avoid memory pressure | ✓ Good |
-| t-SNE over UMAP | umap-learn blocked by Python 3.14 numba incompatibility; t-SNE via scikit-learn | ⚠️ Revisit when numba supports 3.14 |
-| Moondream2 via transformers | trust_remote_code with all_tied_weights_keys patch for transformers 5.x compat | ✓ Good (fragile — monitor updates) |
+| DuckDB over SQLite | Analytical queries on metadata at scale; columnar storage for filtering 100K+ rows | Good |
+| Qdrant over FAISS | Payload filtering support; Rust-based performance; local deployment | Good |
+| deck.gl for embedding viz | WebGL-powered; handles millions of points; lasso/interaction built-in | Good |
+| Pydantic AI for agents | Type-safe agent definitions; native FastAPI/Pydantic integration | Good |
+| Deterministic color hashing | Class names hash to consistent colors across sessions; no manual palette | Good |
+| Source discriminator column | Clean GT/prediction separation in annotations table via source field | Good |
+| Caddy over nginx | Auto-HTTPS, built-in basic_auth, simpler config | Good |
+| react-konva for editing | Canvas-based editing in modal; SVG stays for grid overlays | Good |
+| Gemini 2.0 Flash for agent | Fast, cheap, good structured output; replaced GPT-4o | Good |
+| Pre-computed agent prompt | All data in prompt, no tool calls; avoids Pydantic AI request_limit issues | Good |
+| t-SNE over UMAP | umap-learn blocked by Python 3.14 numba incompatibility | Revisit when numba supports 3.14 |
+| Moondream2 via transformers | trust_remote_code with all_tied_weights_keys patch for transformers 5.x | Fragile — monitor updates |
 
 ---
-*Last updated: 2026-02-12 after v1.1 scope redefinition*
+*Last updated: 2026-02-13 after v1.1 milestone completion*
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 6073dd0..20563c9 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -2,8 +2,8 @@
 
 ## Milestones
 
-- v1.0 MVP - Phases 1-7 (shipped 2026-02-12)
-- **v1.1 Deployment, Workflow & Competitive Parity** - Phases 8-14
+- v1.0 MVP - Phases 1-7 (shipped 2026-02-12) — [archive](milestones/v1.0-ROADMAP.md)
+- v1.1 Deployment, Workflow & Competitive Parity - Phases 8-14 (shipped 2026-02-13) — [archive](milestones/v1.1-ROADMAP.md)
 
 ## Phases
 
@@ -40,143 +40,40 @@
 
 </details>
 
-### v1.1 Deployment, Workflow & Competitive Parity
-
-**Milestone Goal:** Make DataVisor deployable (Docker + GCP), secure for cloud access, and close key workflow gaps vs FiftyOne/Encord -- smart ingestion, annotation editing, error triage, interactive visualizations, and keyboard-driven navigation.
-
-**Phase Numbering:**
-- Integer phases (8, 9, 10, ...): Planned milestone work
-- Decimal phases (9.1, 9.2): Urgent insertions (marked with INSERTED)
-
-Decimal phases appear between their surrounding integers in numeric order.
-
-- [x] **Phase 8: Docker Deployment & Auth** - Dockerized 3-service stack with Caddy reverse proxy, basic auth, and deployment scripts
-- [x] **Phase 9: Smart Ingestion** - No-code dataset import from folder path with auto-detection and confirmation
-- [x] **Phase 10: Annotation Editing** - Move, resize, delete, and draw bounding boxes via react-konva in sample detail modal
-- [x] **Phase 11: Error Triage** - Tag errors, highlight mode, and worst-images ranking with DuckDB persistence
-- [x] **Phase 12: Interactive Viz & Discovery** - Confusion matrix, near-duplicates, interactive histograms, and find-similar
-- [x] **Phase 13: Keyboard Shortcuts** - Keyboard navigation, triage hotkeys, edit shortcuts, and help overlay
-- [x] **Phase 14: Per-Annotation Triage** - Auto-discover TP/FP/FN per bounding box via IoU overlap, color-coded boxes in detail modal, click to override classifications
-
-## Phase Details
+<details>
+<summary>v1.1 Deployment, Workflow & Competitive Parity (Phases 8-14) - SHIPPED 2026-02-13</summary>
 
 ### Phase 8: Docker Deployment & Auth
-**Goal**: DataVisor runs as a deployable Docker stack with single-user auth, accessible securely on a cloud VM or locally with a single command
-**Depends on**: Phase 7 (v1.0 complete)
-**Requirements**: DEPLOY-01, DEPLOY-02, DEPLOY-03, DEPLOY-04, DEPLOY-05
-**Success Criteria** (what must be TRUE):
-  1. User can run `docker compose up` and access DataVisor at `http://localhost` with all features working (grid, embeddings, error analysis)
-  2. User is prompted for username/password before accessing any page or API endpoint, and unauthenticated requests are rejected
-  3. User can run a deployment script that provisions a GCP VM with persistent disk and starts DataVisor accessible at a public IP with HTTPS
-  4. User can follow deployment documentation to configure environment variables, deploy to GCP, and set up a custom domain
-  5. DuckDB data, Qdrant vectors, and thumbnail cache persist across container restarts without data loss
-**Plans**: 5 plans
-
-Plans:
-- [x] 08-01-PLAN.md -- Backend Dockerfile + config fixes (CORS, DuckDB CHECKPOINT)
-- [x] 08-02-PLAN.md -- Frontend Dockerfile + Caddyfile reverse proxy with auth
-- [x] 08-03-PLAN.md -- Docker Compose orchestration + .dockerignore + env config
-- [x] 08-04-PLAN.md -- Local run script + GCP deployment scripts
-- [x] 08-05-PLAN.md -- Deployment documentation + full stack verification
+**Goal**: Deployable Docker stack with single-user auth, accessible on cloud VM or locally
+**Plans**: 5 plans (complete)
 
 ### Phase 9: Smart Ingestion
-**Goal**: Users can import datasets from the UI by pointing at a folder, reviewing auto-detected structure, and confirming import -- no CLI or config files needed
-**Depends on**: Phase 8 (auth protects new endpoints)
-**Requirements**: INGEST-01, INGEST-02, INGEST-03, INGEST-04, INGEST-05
-**Success Criteria** (what must be TRUE):
-  1. User can enter a folder path in the UI and trigger a scan that returns detected dataset structure
-  2. Scanner correctly identifies COCO annotation files and image directories within the folder
-  3. Scanner detects train/val/test split subdirectories and presents them as separate importable splits
-  4. User sees the detected structure as a confirmation step and can approve or adjust before import begins
-  5. Import progress displays per-split status via real-time SSE updates until completion
-**Plans**: 2 plans
-
-Plans:
-- [x] 09-01-PLAN.md -- Backend FolderScanner service, scan/import API endpoints, split-aware ingestion pipeline
-- [x] 09-02-PLAN.md -- Frontend ingestion wizard (path input, scan results, import progress) + landing page link
+**Goal**: No-code dataset import from folder path with auto-detection and confirmation
+**Plans**: 2 plans (complete)
 
 ### Phase 10: Annotation Editing
-**Goal**: Users can make quick bounding box corrections directly in the sample detail modal without leaving DataVisor
-**Depends on**: Phase 8 (auth protects mutation endpoints)
-**Requirements**: ANNOT-01, ANNOT-02, ANNOT-03, ANNOT-04, ANNOT-05
-**Success Criteria** (what must be TRUE):
-  1. User can enter edit mode in the sample detail modal and drag a bounding box to a new position
-  2. User can grab resize handles on a bounding box and change its dimensions
-  3. User can delete a bounding box and the deletion persists after closing the modal
-  4. User can draw a new bounding box and assign it a class label
-  5. Only ground truth annotations show edit controls; prediction annotations remain read-only and non-interactive
-**Plans**: 3 plans
-
-Plans:
-- [x] 10-01-PLAN.md -- Backend annotation CRUD endpoints + frontend mutation hooks and types
-- [x] 10-02-PLAN.md -- Konva building blocks: coord-utils, EditableRect, DrawLayer, ClassPicker
-- [x] 10-03-PLAN.md -- AnnotationEditor composition, sample modal integration, annotation list delete
+**Goal**: Move, resize, delete, and draw bounding boxes via react-konva in sample detail modal
+**Plans**: 3 plans (complete)
 
 ### Phase 11: Error Triage
-**Goal**: Users can systematically review and tag errors with a focused triage workflow that persists decisions and surfaces the worst samples first
-**Depends on**: Phase 8 (extends v1.0 error analysis)
-**Requirements**: TRIAGE-01, TRIAGE-02, TRIAGE-03
-**Success Criteria** (what must be TRUE):
-  1. User can tag any sample or annotation as FP, TP, FN, or mistake, and the tag persists across page refreshes
-  2. User can activate highlight mode to dim non-error samples in the grid, making errors visually prominent
-  3. User can view a "worst images" ranking that surfaces samples with the highest combined error score (error count + confidence spread + uniqueness)
-**Plans**: 2 plans
-
-Plans:
-- [x] 11-01-PLAN.md -- Backend triage endpoints (set-triage-tag, worst-images scoring) + frontend hooks and types
-- [x] 11-02-PLAN.md -- Triage tag buttons in detail modal, highlight mode grid dimming, worst-images stats panel
+**Goal**: Tag errors, highlight mode, and worst-images ranking with DuckDB persistence
+**Plans**: 2 plans (complete)
 
 ### Phase 12: Interactive Viz & Discovery
-**Goal**: Users can explore dataset quality interactively -- clicking visualization elements filters the grid, finding similar samples and near-duplicates is one click away
-**Depends on**: Phase 11 (triage data informs confusion matrix), Phase 8 (auth protects endpoints)
-**Requirements**: ANNOT-06, TRIAGE-04, TRIAGE-05, TRIAGE-06
-**Success Criteria** (what must be TRUE):
-  1. User can click "Find Similar" on any sample to see nearest neighbors from Qdrant displayed in the grid
-  2. User can view a confusion matrix and click any cell to filter the grid to samples matching that GT/prediction pair
-  3. User can trigger near-duplicate detection and browse groups of visually similar images
-  4. User can click a bar in any statistics dashboard histogram to filter the grid to samples in that bucket
-**Plans**: 3 plans
-
-Plans:
-- [x] 12-01-PLAN.md -- Discovery filter foundation + Find Similar grid filtering + interactive histogram bars
-- [x] 12-02-PLAN.md -- Clickable confusion matrix cells with backend sample ID resolution
-- [x] 12-03-PLAN.md -- Near-duplicate detection via Qdrant pairwise search with SSE progress
+**Goal**: Confusion matrix, near-duplicates, interactive histograms, and find-similar
+**Plans**: 3 plans (complete)
 
 ### Phase 13: Keyboard Shortcuts
-**Goal**: Power users can navigate, triage, and edit entirely from the keyboard without reaching for the mouse
-**Depends on**: Phase 10 (annotation edit shortcuts), Phase 11 (triage shortcuts), Phase 12 (all UI features exist)
-**Requirements**: UX-01, UX-02, UX-03, UX-04
-**Success Criteria** (what must be TRUE):
-  1. User can navigate between samples in the grid and modal using arrow keys, j/k, Enter, and Escape
-  2. User can quick-tag errors during triage using number keys and toggle highlight mode with h
-  3. User can delete annotations and undo edits with keyboard shortcuts while in annotation edit mode
-  4. User can press ? to open a shortcut help overlay listing all available keyboard shortcuts
-**Plans**: 2 plans
-
-Plans:
-- [x] 13-01-PLAN.md -- Foundation (react-hotkeys-hook, shortcut registry, ui-store) + grid keyboard navigation
-- [x] 13-02-PLAN.md -- Modal shortcuts (navigation, triage, editing, undo) + help overlay
+**Goal**: Keyboard navigation, triage hotkeys, edit shortcuts, and help overlay
+**Plans**: 2 plans (complete)
 
 ### Phase 14: Per-Annotation Triage
-**Goal**: Users can see auto-discovered TP/FP/FN classifications per bounding box based on IoU overlap, with color-coded visualization in the detail modal and the ability to click individual annotations to override their classification
-**Depends on**: Phase 11 (extends triage system), Phase 6 (error analysis IoU matching)
-**Success Criteria** (what must be TRUE):
-  1. User opens a sample with GT and predictions and sees each bounding box color-coded as TP (green), FP (red), or FN (orange) based on automatic IoU matching
-  2. User can click an individual bounding box to override its auto-assigned classification (e.g. mark an auto-TP as a mistake)
-  3. Per-annotation triage decisions persist across page refreshes and are stored in DuckDB
-  4. Highlight mode dims samples that have no triage annotations, making triaged samples visually prominent
-**Plans**: 3 plans
-
-Plans:
-- [x] 14-01-PLAN.md -- Backend schema, IoU matching service, and annotation triage API endpoints
-- [x] 14-02-PLAN.md -- Frontend types, hooks, and clickable TriageOverlay SVG component
-- [x] 14-03-PLAN.md -- Wire TriageOverlay into sample modal + highlight mode integration
+**Goal**: Auto-discover TP/FP/FN per bounding box via IoU overlap, color-coded boxes in detail modal, click to override classifications
+**Plans**: 3 plans (complete)
 
-## Progress
+</details>
 
-**Execution Order:**
-Phases execute in numeric order: 8 -> 9 -> 10 -> 11 -> 12 -> 13 -> 14
-(Note: Phases 9, 10, 11 are independent after Phase 8. Execution is sequential but no inter-dependency exists between 9/10/11.)
+## Progress
 
 | Phase | Milestone | Plans Complete | Status | Completed |
 |-------|-----------|----------------|--------|-----------|
diff --git a/.planning/STATE.md b/.planning/STATE.md
index 1526a0a..61f341d 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -2,17 +2,17 @@
 
 ## Project Reference
 
-See: .planning/PROJECT.md (updated 2026-02-12)
+See: .planning/PROJECT.md (updated 2026-02-13)
 
 **Core value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
-**Current focus:** v1.1 complete. All 14 phases delivered.
+**Current focus:** v1.1 shipped. No active milestone.
 
 ## Current Position
 
-Phase: 14 of 14 (Per-Annotation Triage)
-Plan: 3 of 3 in current phase
-Status: Complete
-Last activity: 2026-02-13 -- Phase 14 verified and complete
+Phase: --
+Plan: --
+Status: Between milestones (v1.1 shipped, v1.2 not started)
+Last activity: 2026-02-13 -- v1.1 milestone archived
 
 Progress: [████████████████████████████████████████████████████████████] v1.1: 41/41 plans complete
 
@@ -23,96 +23,16 @@ Progress: [███████████████████████
 - Average duration: 3.9 min
 - Total execution time: 82 min
 
-**By Phase (v1.0):**
-
-| Phase | Plans | Total | Avg/Plan |
-|-------|-------|-------|----------|
-| 1. Data Foundation | 4/4 | 14 min | 3.5 min |
-| 2. Visual Grid | 3/3 | 15 min | 5.0 min |
-| 3. Filtering & Search | 2/2 | 10 min | 5.0 min |
-| 4. Predictions & Comparison | 3/3 | 9 min | 3.0 min |
-| 5. Embeddings & Visualization | 4/4 | 16 min | 4.0 min |
-| 6. Error Analysis & Similarity | 2/2 | 9 min | 4.5 min |
-| 7. Intelligence & Agents | 3/3 | 9 min | 3.0 min |
-
-**By Phase (v1.1):**
-
-| Phase | Plans | Total | Avg/Plan |
-|-------|-------|-------|----------|
-| 8. Docker Deployment & Auth | 5/5 | 25 min | 5.0 min |
-| 9. Smart Ingestion | 2/2 | 10 min | 5.0 min |
-| 10. Annotation Editing | 3/3 | 9 min | 3.0 min |
-| 11. Error Triage | 2/2 | 6 min | 3.0 min |
-| 12. Interactive Viz & Discovery | 3/3 | 10 min | 3.3 min |
-| 13. Keyboard Shortcuts | 2/2 | 6 min | 3.0 min |
-| 14. Per-Annotation Triage | 3/3 | 7 min | 2.3 min |
+**Velocity (v1.1):**
+- Total plans completed: 20
+- Average duration: 3.7 min
+- Total execution time: 73 min
 
 ## Accumulated Context
 
 ### Decisions
 
 Decisions are logged in PROJECT.md Key Decisions table.
-Recent decisions affecting current work:
-
-- [v1.1 Roadmap]: Keep Qdrant in local mode for Docker (single-user <1M vectors)
-- [v1.1 Roadmap]: Caddy over nginx for reverse proxy (auto-HTTPS, built-in basic_auth)
-- [v1.1 Roadmap]: react-konva for annotation editing in detail modal only (SVG stays for grid)
-- [v1.1 Roadmap]: FastAPI HTTPBasic DI over middleware (testable, composable)
-- [08-01]: CPU-only PyTorch via post-sync replacement in Dockerfile (uv sync then uv pip install from CPU index)
-- [08-01]: CORS restricted to localhost:3000 in dev, disabled entirely behind proxy (DATAVISOR_BEHIND_PROXY=true)
-- [08-02]: NEXT_PUBLIC_API_URL=/api baked at build time for same-origin API via Caddy
-- [08-02]: Caddy handles all auth at proxy layer -- zero application code changes
-- [08-03]: Directory bind mount ./data:/app/data for DuckDB WAL + Qdrant + thumbnails persistence
-- [08-03]: AUTH_PASSWORD_HASH has no default -- forces explicit auth configuration before deployment
-- [08-03]: Only Caddy exposes ports 80/443 -- backend and frontend are Docker-internal only
-- [08-04]: VM startup script does NOT auto-start docker compose -- requires manual .env setup first
-- [08-04]: GCP config via env vars with defaults (only GCP_PROJECT_ID required)
-- [08-05]: 10-section deployment docs covering local Docker, GCP, custom domain HTTPS, data persistence, troubleshooting
-- [08-05]: opencv-python-headless replaces opencv-python in Docker builder stage (no X11/GUI libs in slim images)
-- [09-01]: Three-layout priority detection: Roboflow > Standard COCO > Flat
-- [09-01]: ijson peek at top-level keys for COCO detection (max 10 keys, files >500MB skipped)
-- [09-01]: Optional dataset_id param on ingest_with_progress for multi-split ID sharing
-- [09-01]: INSERT-or-UPDATE pattern for dataset record across multi-split imports
-- [09-02]: POST SSE streaming via fetch + ReadableStream (not EventSource, which is GET-only)
-- [09-02]: FolderScanner refactored to accept StorageBackend for GCS support
-- [09-02]: Split-prefixed IDs for collision avoidance in multi-split import
-- [10-01]: get_cursor DI for annotation router (auto-close cursor)
-- [10-01]: source='ground_truth' enforced in SQL WHERE clauses for PUT/DELETE safety
-- [10-01]: Dataset counts refreshed via subquery UPDATE (no race conditions)
-- [10-02]: useDrawLayer hook pattern (handlers + ReactNode) instead of separate component
-- [10-02]: Transformer scale reset to 1 on transformEnd (Konva best practice)
-- [10-03]: AnnotationEditor loaded via next/dynamic with ssr:false (prevents Konva SSR errors)
-- [10-03]: Draw completion shows ClassPicker before creating annotation (requires category selection)
-- [10-03]: Delete buttons only appear on ground_truth rows when edit mode is active
-- [11-01]: Dual router pattern (samples_router + datasets_router) from single triage module
-- [11-01]: Atomic triage tag replacement via list_filter + list_append single SQL
-- [11-01]: get_db DI pattern for triage router (matching statistics.py style)
-- [11-02]: Triage buttons always visible in detail modal (not gated by edit mode)
-- [11-02]: Highlight toggle uses yellow-500 active styling to distinguish from edit buttons
-- [11-02]: Triage tag badges show short label (TP/FP/FN/MISTAKE) instead of full prefix
-- [12-01]: Lasso selection takes priority over discovery filter (effectiveIds = lassoSelectedIds ?? sampleIdFilter)
-- [12-01]: "Show in Grid" button only appears after similarity results load (progressive disclosure)
-- [12-01]: getState() pattern for store access in Recharts onClick handlers (non-reactive)
-- [12-01]: DiscoveryFilterChip in dataset header for cross-tab visibility
-- [12-02]: Imperative fetch function (not hook) for one-shot confusion cell sample lookups
-- [12-02]: Greedy IoU matching replayed per sample for consistent CM cell membership
-- [12-02]: getState() pattern for Zustand store writes in async callbacks
-- [12-03]: Tab bar always visible so Near Duplicates is accessible without predictions
-- [12-03]: Union-find with path compression for O(alpha(n)) grouping of pairwise matches
-- [12-03]: Progress updates throttled to every 10 points to avoid excessive state updates
-- [13-01]: isFocused passed as prop from ImageGrid (avoids N store subscriptions per GridCell)
-- [13-01]: Central shortcut registry pattern: all shortcuts as data in lib/shortcuts.ts
-- [13-02]: Single useHotkeys('1, 2, 3, 4') with event.key dispatch (avoids rules-of-hooks violation)
-- [13-02]: Single-level undo stack via React state for annotation delete undo
-- [13-02]: Triage number keys disabled during edit mode (prevents Konva focus confusion)
-- [13-02]: groupByCategory via reduce instead of Object.groupBy (avoids es2024 lib dep)
-- [14-01]: Reuse _compute_iou_matrix from evaluation.py (no duplicate IoU code)
-- [14-01]: Auto-computed labels ephemeral (computed on GET, not stored); overrides persist in annotation_triage table
-- [14-01]: triage:annotated sample tag bridges per-annotation triage to highlight mode
-- [14-02]: TriageOverlay is separate from AnnotationOverlay (interactive vs non-interactive SVG)
-- [14-02]: Click handler delegates to parent via callback (overlay does not manage mutations)
-- [14-02]: Annotations not in triageMap skipped (handles GT-only samples gracefully)
-- [14-03]: GT boxes show category name only, predictions show category + confidence% (color conveys triage type)
 
 ### Pending Todos
 
@@ -120,10 +40,15 @@ None.
 
 ### Blockers/Concerns
 
-- [RESOLVED] SVG-to-Canvas coordinate mismatch resolved by coord-utils.ts (10-02)
+None active.
+
+### Roadmap Evolution
+
+- v1.0: 7 phases (1-7), 21 plans — shipped 2026-02-12
+- v1.1: 7 phases (8-14), 20 plans — shipped 2026-02-13
 
 ## Session Continuity
 
 Last session: 2026-02-13
-Stopped at: Phase 14 complete, v1.1 milestone complete
+Stopped at: v1.1 milestone archived
 Resume file: None
diff --git a/.planning/REQUIREMENTS.md b/.planning/milestones/v1.1-REQUIREMENTS.md
similarity index 68%
rename from .planning/REQUIREMENTS.md
rename to .planning/milestones/v1.1-REQUIREMENTS.md
index a8c8887..12b090b 100644
--- a/.planning/REQUIREMENTS.md
+++ b/.planning/milestones/v1.1-REQUIREMENTS.md
@@ -1,11 +1,10 @@
-# Requirements: DataVisor v1.1
+# Requirements Archive: DataVisor v1.1
 
 **Defined:** 2026-02-12
-**Core Value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes — all in one workflow.
+**Completed:** 2026-02-13
+**Core Value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
 
-## v1.1 Requirements
-
-Requirements for Deployment, Workflow & Competitive Parity milestone.
+## v1.1 Requirements (All Complete)
 
 ### Deployment & Infrastructure
 
@@ -39,7 +38,7 @@ Requirements for Deployment, Workflow & Competitive Parity milestone.
 - [x] **TRIAGE-03**: "Worst images" ranking surfaces samples with highest combined error score (error count + confidence spread + uniqueness)
 - [x] **TRIAGE-04**: Interactive confusion matrix that filters grid when a cell is clicked
 - [x] **TRIAGE-05**: Near-duplicate detection surfaces visually similar images in the dataset
-- [x] **TRIAGE-06**: Interactive histograms on the statistics dashboard — clicking a bar filters the grid
+- [x] **TRIAGE-06**: Interactive histograms on the statistics dashboard -- clicking a bar filters the grid
 
 ### UX
 
@@ -48,46 +47,8 @@ Requirements for Deployment, Workflow & Competitive Parity milestone.
 - [x] **UX-03**: Keyboard shortcuts for annotation editing (Delete, Ctrl+Z, e for edit mode)
 - [x] **UX-04**: Shortcut help overlay triggered by ? key
 
-## v1.2 Requirements
-
-Deferred to future milestone. Tracked but not in current roadmap.
-
-### Format Expansion
-
-- **FMT-01**: YOLO format parser (.txt annotation files with class_id + normalized xywh)
-- **FMT-02**: Pascal VOC format parser (XML annotation files)
-- **FMT-03**: Dataset export in COCO and YOLO formats
-
-### Evaluation
-
-- **EVAL-01**: PR curves per class
-- **EVAL-02**: Per-class AP metrics dashboard
-
-### Advanced
-
-- **ADV-01**: Model zoo / in-app inference (ONNX/TorchScript)
-- **ADV-02**: Custom workspaces / panel layouts
-- **ADV-03**: Customizable keyboard shortcut remapping
-- **ADV-04**: CVAT/Label Studio integration for complex annotation workflows
-
-## Out of Scope
-
-Explicitly excluded. Documented to prevent scope creep.
-
-| Feature | Reason |
-|---------|--------|
-| Multi-user collaboration / RBAC | Personal tool — single-user auth for VM security only |
-| Video annotation support | Image-only for now; multiplies complexity |
-| Training pipeline integration | DataVisor inspects data, doesn't train models |
-| Mobile/tablet interface | Desktop browser only |
-| Real-time streaming inference | Batch-oriented analysis |
-| 3D point cloud visualization | Different rendering pipeline entirely |
-| Full annotation editor (polygon, segmentation) | Bounding box CRUD only for v1.1 |
-
 ## Traceability
 
-Which phases cover which requirements. Updated during roadmap creation.
-
 | Requirement | Phase | Status |
 |-------------|-------|--------|
 | DEPLOY-01 | Phase 8 | Complete |
@@ -117,11 +78,7 @@ Which phases cover which requirements. Updated during roadmap creation.
 | UX-03 | Phase 13 | Complete |
 | UX-04 | Phase 13 | Complete |
 
-**Coverage:**
-- v1.1 requirements: 26 total
-- Mapped to phases: 26
-- Unmapped: 0
+**Coverage:** 26/26 requirements complete (100%)
 
 ---
-*Requirements defined: 2026-02-12*
-*Last updated: 2026-02-13 — Phase 13 requirements marked Complete (v1.1 milestone complete)*
+*Archived: 2026-02-13*
diff --git a/.planning/milestones/v1.1-ROADMAP.md b/.planning/milestones/v1.1-ROADMAP.md
new file mode 100644
index 0000000..ad66394
--- /dev/null
+++ b/.planning/milestones/v1.1-ROADMAP.md
@@ -0,0 +1,131 @@
+# Milestone v1.1: Deployment, Workflow & Competitive Parity
+
+**Status:** SHIPPED 2026-02-13
+**Phases:** 8-14
+**Total Plans:** 20
+
+## Overview
+
+Make DataVisor deployable (Docker + GCP), secure for cloud access, and close key workflow gaps vs FiftyOne/Encord -- smart ingestion, annotation editing, error triage, interactive visualizations, and keyboard-driven navigation.
+
+## Phases
+
+### Phase 8: Docker Deployment & Auth
+
+**Goal**: DataVisor runs as a deployable Docker stack with single-user auth, accessible securely on a cloud VM or locally with a single command
+**Depends on**: Phase 7 (v1.0 complete)
+**Requirements**: DEPLOY-01, DEPLOY-02, DEPLOY-03, DEPLOY-04, DEPLOY-05
+**Plans**: 5 plans
+
+Plans:
+- [x] 08-01: Backend Dockerfile + config fixes (CORS, DuckDB CHECKPOINT)
+- [x] 08-02: Frontend Dockerfile + Caddyfile reverse proxy with auth
+- [x] 08-03: Docker Compose orchestration + .dockerignore + env config
+- [x] 08-04: Local run script + GCP deployment scripts
+- [x] 08-05: Deployment documentation + full stack verification
+
+### Phase 9: Smart Ingestion
+
+**Goal**: Users can import datasets from the UI by pointing at a folder, reviewing auto-detected structure, and confirming import -- no CLI or config files needed
+**Depends on**: Phase 8 (auth protects new endpoints)
+**Requirements**: INGEST-01, INGEST-02, INGEST-03, INGEST-04, INGEST-05
+**Plans**: 2 plans
+
+Plans:
+- [x] 09-01: Backend FolderScanner service, scan/import API endpoints, split-aware ingestion pipeline
+- [x] 09-02: Frontend ingestion wizard (path input, scan results, import progress) + landing page link
+
+### Phase 10: Annotation Editing
+
+**Goal**: Users can make quick bounding box corrections directly in the sample detail modal without leaving DataVisor
+**Depends on**: Phase 8 (auth protects mutation endpoints)
+**Requirements**: ANNOT-01, ANNOT-02, ANNOT-03, ANNOT-04, ANNOT-05
+**Plans**: 3 plans
+
+Plans:
+- [x] 10-01: Backend annotation CRUD endpoints + frontend mutation hooks and types
+- [x] 10-02: Konva building blocks: coord-utils, EditableRect, DrawLayer, ClassPicker
+- [x] 10-03: AnnotationEditor composition, sample modal integration, annotation list delete
+
+### Phase 11: Error Triage
+
+**Goal**: Users can systematically review and tag errors with a focused triage workflow that persists decisions and surfaces the worst samples first
+**Depends on**: Phase 8 (extends v1.0 error analysis)
+**Requirements**: TRIAGE-01, TRIAGE-02, TRIAGE-03
+**Plans**: 2 plans
+
+Plans:
+- [x] 11-01: Backend triage endpoints (set-triage-tag, worst-images scoring) + frontend hooks and types
+- [x] 11-02: Triage tag buttons in detail modal, highlight mode grid dimming, worst-images stats panel
+
+### Phase 12: Interactive Viz & Discovery
+
+**Goal**: Users can explore dataset quality interactively -- clicking visualization elements filters the grid, finding similar samples and near-duplicates is one click away
+**Depends on**: Phase 11 (triage data informs confusion matrix), Phase 8 (auth protects endpoints)
+**Requirements**: ANNOT-06, TRIAGE-04, TRIAGE-05, TRIAGE-06
+**Plans**: 3 plans
+
+Plans:
+- [x] 12-01: Discovery filter foundation + Find Similar grid filtering + interactive histogram bars
+- [x] 12-02: Clickable confusion matrix cells with backend sample ID resolution
+- [x] 12-03: Near-duplicate detection via Qdrant pairwise search with SSE progress
+
+### Phase 13: Keyboard Shortcuts
+
+**Goal**: Power users can navigate, triage, and edit entirely from the keyboard without reaching for the mouse
+**Depends on**: Phase 10 (annotation edit shortcuts), Phase 11 (triage shortcuts), Phase 12 (all UI features exist)
+**Requirements**: UX-01, UX-02, UX-03, UX-04
+**Plans**: 2 plans
+
+Plans:
+- [x] 13-01: Foundation (react-hotkeys-hook, shortcut registry, ui-store) + grid keyboard navigation
+- [x] 13-02: Modal shortcuts (navigation, triage, editing, undo) + help overlay
+
+### Phase 14: Per-Annotation Triage
+
+**Goal**: Users can see auto-discovered TP/FP/FN classifications per bounding box based on IoU overlap, with color-coded visualization in the detail modal and the ability to click individual annotations to override their classification
+**Depends on**: Phase 11 (extends triage system), Phase 6 (error analysis IoU matching)
+**Plans**: 3 plans
+
+Plans:
+- [x] 14-01: Backend schema, IoU matching service, and annotation triage API endpoints
+- [x] 14-02: Frontend types, hooks, and clickable TriageOverlay SVG component
+- [x] 14-03: Wire TriageOverlay into sample modal + highlight mode integration
+
+## Milestone Summary
+
+**Key Decisions:**
+
+- Caddy over nginx for reverse proxy (auto-HTTPS, built-in basic_auth)
+- CPU-only PyTorch via post-sync replacement in Dockerfile
+- react-konva for annotation editing (SVG stays for grid overlays)
+- FastAPI HTTPBasic DI over middleware (testable, composable)
+- Atomic triage tag replacement via list_filter + list_append single SQL
+- Union-find with path compression for near-duplicate grouping
+- Central shortcut registry pattern (all shortcuts as data)
+- Auto-computed triage labels ephemeral (computed on GET); overrides persist in annotation_triage table
+- Switched AI agent from OpenAI GPT-4o to Google Gemini 2.0 Flash
+- Pre-compute all data for AI agent prompt (no tool calls needed)
+
+**Issues Resolved:**
+
+- opencv-python-headless for Docker slim images (no X11 libs needed)
+- DuckDB WAL stale file recovery via CHECKPOINT on shutdown
+- PyTorch CPU install order (uv sync first, then replace with CPU wheel)
+- Pydantic AI request_limit exceeded by Gemini tool-call loop (eliminated tools)
+- GEMINI_API_KEY not loading (load_dotenv for third-party libs)
+- pyvips missing for Moondream2 auto-tag (added dependency)
+
+**Issues Deferred:**
+
+- UMAP blocked by Python 3.14 numba incompatibility (using t-SNE)
+- Moondream2 trust_remote_code fragile with transformers updates
+
+**Technical Debt Incurred:**
+
+- Module-level cache for Intelligence panel results (should use React Query cache)
+- Old triage tags filtered client-side (OBSOLETE_TRIAGE_TAGS set in grid-cell.tsx)
+
+---
+
+_For current project status, see .planning/ROADMAP.md_

From 928cfb2f720d13bc6e106950fd714f734fba7924 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Mon, 16 Feb 2026 16:41:09 -0500
Subject: [PATCH 02/38] feat: add class filter checkboxes to statistics
 overview tab

Allow users to exclude specific classes from statistics computation
to analyze data slices by label. Collapsible checkbox panel with
select-all/deselect-all controls filters class distribution chart
and recomputes summary stats client-side.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/components/stats/class-filter.tsx     | 115 ++++++++++++++++++
 .../src/components/stats/stats-dashboard.tsx  |  78 ++++++++++--
 2 files changed, 185 insertions(+), 8 deletions(-)
 create mode 100644 frontend/src/components/stats/class-filter.tsx

diff --git a/frontend/src/components/stats/class-filter.tsx b/frontend/src/components/stats/class-filter.tsx
new file mode 100644
index 0000000..8a65a1d
--- /dev/null
+++ b/frontend/src/components/stats/class-filter.tsx
@@ -0,0 +1,115 @@
+"use client";
+
+/**
+ * Collapsible class filter with checkboxes for selecting which categories
+ * to include in statistics computation.
+ *
+ * Provides select-all / deselect-all controls; while a filter is active the
+ * header shows an included/total badge, plus a hidden-class count when collapsed.
+ */
+
+import { useState } from "react";
+
+interface ClassFilterProps {
+  /** Category names to list, one checkbox per name. */
+  categories: string[];
+  /** Category names currently excluded; their checkboxes render unchecked. */
+  excludedClasses: Set<string>;
+  /** Called with the category name when its checkbox changes. */
+  onToggle: (category: string) => void;
+  /** Called by the "Select all" button (disabled when nothing is excluded). */
+  onSelectAll: () => void;
+  /** Called by the "Deselect all" button (disabled once the excluded count equals the category count). */
+  onDeselectAll: () => void;
+}
+
+export function ClassFilter({
+  categories,
+  excludedClasses,
+  onToggle,
+  onSelectAll,
+  onDeselectAll,
+}: ClassFilterProps) {
+  const [isExpanded, setIsExpanded] = useState(false);
+  const includedCount = categories.length - excludedClasses.size;
+  const isFiltered = excludedClasses.size > 0;
+
+  return (
+    <div className="rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900">
+      {/* Header - always visible; clicking it toggles the body */}
+      <button
+        onClick={() => setIsExpanded(!isExpanded)}
+        className="w-full flex items-center justify-between px-4 py-2.5 text-left"
+      >
+        <div className="flex items-center gap-2">
+          <svg
+            className={`w-4 h-4 text-zinc-400 transition-transform ${isExpanded ? "rotate-90" : ""}`}
+            fill="none"
+            viewBox="0 0 24 24"
+            stroke="currentColor"
+            strokeWidth={2}
+          >
+            <path strokeLinecap="round" strokeLinejoin="round" d="M9 5l7 7-7 7" />
+          </svg>
+          <span className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
+            Class Filter
+          </span>
+          {isFiltered && (
+            <span className="px-1.5 py-0.5 rounded-full text-xs font-medium bg-blue-100 text-blue-700 dark:bg-blue-900/40 dark:text-blue-300">
+              {includedCount}/{categories.length}
+            </span>
+          )}
+        </div>
+        {isFiltered && !isExpanded && (
+          <span className="text-xs text-zinc-400 dark:text-zinc-500">
+            {excludedClasses.size} class{excludedClasses.size !== 1 ? "es" : ""} hidden
+          </span>
+        )}
+      </button>
+
+      {/* Expandable body */}
+      {isExpanded && (
+        <div className="px-4 pb-3 border-t border-zinc-100 dark:border-zinc-800">
+          {/* Bulk actions */}
+          <div className="flex items-center gap-3 py-2">
+            <button
+              onClick={onSelectAll}
+              disabled={excludedClasses.size === 0}
+              className="text-xs text-blue-600 dark:text-blue-400 hover:underline disabled:opacity-40 disabled:no-underline disabled:cursor-default"
+            >
+              Select all
+            </button>
+            <span className="text-zinc-300 dark:text-zinc-600">|</span>
+            <button
+              onClick={onDeselectAll}
+              disabled={excludedClasses.size === categories.length}
+              className="text-xs text-blue-600 dark:text-blue-400 hover:underline disabled:opacity-40 disabled:no-underline disabled:cursor-default"
+            >
+              Deselect all
+            </button>
+          </div>
+
+          {/* Checkbox list - a checked box means the class is included */}
+          <div className="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-4 gap-x-4 gap-y-1 max-h-48 overflow-y-auto">
+            {categories.map((name) => (
+              <label
+                key={name}
+                className="flex items-center gap-2 py-0.5 cursor-pointer group"
+              >
+                <input
+                  type="checkbox"
+                  checked={!excludedClasses.has(name)}
+                  onChange={() => onToggle(name)}
+                  className="rounded border-zinc-300 dark:border-zinc-600 text-blue-500 focus:ring-blue-500 focus:ring-offset-0 h-3.5 w-3.5"
+                />
+                <span className="text-xs text-zinc-600 dark:text-zinc-400 truncate group-hover:text-zinc-900 dark:group-hover:text-zinc-200">
+                  {name}
+                </span>
+              </label>
+            ))}
+          </div>
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/frontend/src/components/stats/stats-dashboard.tsx b/frontend/src/components/stats/stats-dashboard.tsx
index b1f602d..bc72d3f 100644
--- a/frontend/src/components/stats/stats-dashboard.tsx
+++ b/frontend/src/components/stats/stats-dashboard.tsx
@@ -11,13 +11,14 @@
  * - Intelligence: AI-powered error pattern analysis and recommendations
  */
 
-import { useState } from "react";
+import { useState, useMemo, useCallback } from "react";
 
 import { useStatistics } from "@/hooks/use-statistics";
 import { useFilterFacets } from "@/hooks/use-filter-facets";
 import { useSplit, useFilterStore } from "@/stores/filter-store";
 import { AnnotationSummary } from "@/components/stats/annotation-summary";
 import { ClassDistribution } from "@/components/stats/class-distribution";
+import { ClassFilter } from "@/components/stats/class-filter";
 import { SplitBreakdown } from "@/components/stats/split-breakdown";
 import { EvaluationPanel } from "@/components/stats/evaluation-panel";
 import { ErrorAnalysisPanel } from "@/components/stats/error-analysis-panel";
@@ -54,9 +55,59 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
   const { data: facets } = useFilterFacets(datasetId);
   const { data: stats, isLoading, error } = useStatistics(datasetId, split);
   const [activeTab, setActiveTab] = useState<SubTab>("overview");
+  const [excludedClasses, setExcludedClasses] = useState<Set<string>>(new Set());
 
   const availableSplits = facets?.splits.map((s) => s.name) ?? [];
-  const hasPredictions = stats && stats.summary.pred_annotations > 0;
+
+  // All category names from the unfiltered class distribution
+  const allCategories = useMemo(
+    () => stats?.class_distribution.map((c) => c.category_name) ?? [],
+    [stats],
+  );
+
+  // Derive filtered class distribution and recomputed summary
+  const filteredStats = useMemo(() => {
+    if (!stats) return null;
+    if (excludedClasses.size === 0) return stats;
+
+    const filteredDist = stats.class_distribution.filter(
+      (c) => !excludedClasses.has(c.category_name),
+    );
+    const gtAnnotations = filteredDist.reduce((sum, c) => sum + c.gt_count, 0);
+    const predAnnotations = filteredDist.reduce((sum, c) => sum + c.pred_count, 0);
+
+    return {
+      ...stats,
+      class_distribution: filteredDist,
+      summary: {
+        ...stats.summary,
+        gt_annotations: gtAnnotations,
+        pred_annotations: predAnnotations,
+        total_categories: filteredDist.length,
+      },
+    };
+  }, [stats, excludedClasses]);
+
+  const hasPredictions = filteredStats && filteredStats.summary.pred_annotations > 0;
+
+  const handleToggleClass = useCallback((category: string) => {
+    setExcludedClasses((prev) => {
+      const next = new Set(prev);
+      if (next.has(category)) {
+        next.delete(category);
+      } else {
+        next.add(category);
+      }
+      return next;
+    });
+  }, []);
+
+  const handleSelectAll = useCallback(() => setExcludedClasses(new Set()), []);
+
+  const handleDeselectAll = useCallback(
+    () => setExcludedClasses(new Set(allCategories)),
+    [allCategories],
+  );
 
   if (error) {
     return (
@@ -170,12 +221,23 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
 
       {activeTab === "overview" && (
         <>
+          {/* Class Filter */}
+          {allCategories.length > 1 && (
+            <ClassFilter
+              categories={allCategories}
+              excludedClasses={excludedClasses}
+              onToggle={handleToggleClass}
+              onSelectAll={handleSelectAll}
+              onDeselectAll={handleDeselectAll}
+            />
+          )}
+
           {/* Summary Stats */}
           <section>
             <h2 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
               Summary
             </h2>
-            {isLoading || !stats ? (
+            {isLoading || !filteredStats ? (
               <div className="grid grid-cols-2 lg:grid-cols-4 gap-4">
                 <SkeletonCard />
                 <SkeletonCard />
@@ -183,7 +245,7 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
                 <SkeletonCard />
               </div>
             ) : (
-              <AnnotationSummary summary={stats.summary} />
+              <AnnotationSummary summary={filteredStats.summary} />
             )}
           </section>
 
@@ -192,11 +254,11 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
             <h2 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
               Class Distribution
             </h2>
-            {isLoading || !stats ? (
+            {isLoading || !filteredStats ? (
               <SkeletonChart height="h-[300px]" />
             ) : (
               <>
-                <ClassDistribution data={stats.class_distribution} />
+                <ClassDistribution data={filteredStats.class_distribution} />
                 <p className="mt-1 text-xs text-zinc-400 dark:text-zinc-500">
                   Click any bar to filter the grid by category
                 </p>
@@ -209,10 +271,10 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
             <h2 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
               Split Breakdown
             </h2>
-            {isLoading || !stats ? (
+            {isLoading || !filteredStats ? (
               <SkeletonChart height="h-[250px]" />
             ) : (
-              <SplitBreakdown data={stats.split_breakdown} />
+              <SplitBreakdown data={filteredStats.split_breakdown} />
             )}
           </section>
         </>

From a468c629d40b8916a09472c2d8be19dbc4223096 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Mon, 16 Feb 2026 16:49:09 -0500
Subject: [PATCH 03/38] feat: apply class filter to evaluation tab with cached
 recomputation

Lift ClassFilter above sub-tab navigation so it's shared across all
tabs. EvaluationPanel now receives excludedClasses and uses a new
useFilteredEvaluation hook that:
- Filters PR curves, per-class metrics, and confusion matrix rows/cols
- Recomputes mAP as mean of filtered per-class AP values
- Synthesizes a new "all" PR curve from included classes (COCO 101-pt)
- Caches results in a Map keyed by serialized excluded set, so revisiting
  the same combination is O(1)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/components/stats/evaluation-panel.tsx |   7 +-
 .../src/components/stats/stats-dashboard.tsx  |  24 +--
 frontend/src/hooks/use-filtered-evaluation.ts | 172 ++++++++++++++++++
 3 files changed, 189 insertions(+), 14 deletions(-)
 create mode 100644 frontend/src/hooks/use-filtered-evaluation.ts

diff --git a/frontend/src/components/stats/evaluation-panel.tsx b/frontend/src/components/stats/evaluation-panel.tsx
index fa323e1..d38e90e 100644
--- a/frontend/src/components/stats/evaluation-panel.tsx
+++ b/frontend/src/components/stats/evaluation-panel.tsx
@@ -11,6 +11,7 @@ import { useState, useEffect, useMemo, useCallback } from "react";
 
 import { useFilterFacets } from "@/hooks/use-filter-facets";
 import { useEvaluation } from "@/hooks/use-evaluation";
+import { useFilteredEvaluation } from "@/hooks/use-filtered-evaluation";
 import { fetchConfusionCellSamples } from "@/hooks/use-confusion-cell";
 import { useFilterStore } from "@/stores/filter-store";
 import { useUIStore } from "@/stores/ui-store";
@@ -22,6 +23,7 @@ import { PerClassTable } from "@/components/stats/per-class-table";
 interface EvaluationPanelProps {
   datasetId: string;
   split: string | null;
+  excludedClasses: Set<string>;
 }
 
 function useDebouncedValue<T>(value: T, delay: number): T {
@@ -50,7 +52,7 @@ function SkeletonChart({ height }: { height: string }) {
   );
 }
 
-export function EvaluationPanel({ datasetId, split }: EvaluationPanelProps) {
+export function EvaluationPanel({ datasetId, split, excludedClasses }: EvaluationPanelProps) {
   const { data: facets } = useFilterFacets(datasetId);
 
   // Available prediction sources (exclude ground_truth)
@@ -76,13 +78,14 @@ export function EvaluationPanel({ datasetId, split }: EvaluationPanelProps) {
   const debouncedIou = useDebouncedValue(iouThreshold, 300);
   const debouncedConf = useDebouncedValue(confThreshold, 300);
 
-  const { data, isLoading } = useEvaluation(
+  const { data: rawData, isLoading } = useEvaluation(
     datasetId,
     source,
     debouncedIou,
     debouncedConf,
     split,
   );
+  const data = useFilteredEvaluation(rawData, excludedClasses);
 
   const handleCellClick = useCallback(
     async (actualClass: string, predictedClass: string) => {
diff --git a/frontend/src/components/stats/stats-dashboard.tsx b/frontend/src/components/stats/stats-dashboard.tsx
index bc72d3f..0f7ffb8 100644
--- a/frontend/src/components/stats/stats-dashboard.tsx
+++ b/frontend/src/components/stats/stats-dashboard.tsx
@@ -151,6 +151,17 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
         </div>
       )}
 
+      {/* Class filter (shared across sub-tabs, like the split selector) */}
+      {allCategories.length > 1 && (
+        <ClassFilter
+          categories={allCategories}
+          excludedClasses={excludedClasses}
+          onToggle={handleToggleClass}
+          onSelectAll={handleSelectAll}
+          onDeselectAll={handleDeselectAll}
+        />
+      )}
+
       {/* Sub-tab navigation (always visible -- Near Duplicates works without predictions) */}
       <div className="flex gap-1 border-b border-zinc-200 dark:border-zinc-700">
         <button
@@ -221,17 +232,6 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
 
       {activeTab === "overview" && (
         <>
-          {/* Class Filter */}
-          {allCategories.length > 1 && (
-            <ClassFilter
-              categories={allCategories}
-              excludedClasses={excludedClasses}
-              onToggle={handleToggleClass}
-              onSelectAll={handleSelectAll}
-              onDeselectAll={handleDeselectAll}
-            />
-          )}
-
           {/* Summary Stats */}
           <section>
             <h2 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
@@ -281,7 +281,7 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
       )}
 
       {activeTab === "evaluation" && hasPredictions && (
-        <EvaluationPanel datasetId={datasetId} split={split} />
+        <EvaluationPanel datasetId={datasetId} split={split} excludedClasses={excludedClasses} />
       )}
 
       {activeTab === "error_analysis" && hasPredictions && (
diff --git a/frontend/src/hooks/use-filtered-evaluation.ts b/frontend/src/hooks/use-filtered-evaluation.ts
new file mode 100644
index 0000000..7496be7
--- /dev/null
+++ b/frontend/src/hooks/use-filtered-evaluation.ts
@@ -0,0 +1,172 @@
+/**
+ * Derives a class-filtered EvaluationResponse from raw server data.
+ *
+ * Filters PR curves, per-class metrics, and confusion matrix by excluded
+ * classes, then recomputes aggregate mAP and a synthetic "all" PR curve
+ * from the included subset.
+ *
+ * Results are cached in a Map keyed by the serialized excluded-class set,
+ * so revisiting the same filter combination skips recomputation.
+ */
+
+import { useRef, useMemo } from "react";
+
+import type {
+  EvaluationResponse,
+  APMetrics,
+  PRCurve,
+} from "@/types/evaluation";
+
+/** Insertion-order-independent cache key: excluded names sorted, then NUL-joined ("" when empty). */
+function cacheKey(excluded: Set<string>): string {
+  if (excluded.size === 0) return "";
+  return [...excluded].sort().join("\0");
+}
+
+/** Average per-class AP values into aggregate mAP metrics (all zeros when no classes remain). */
+function recomputeMapMetrics(
+  perClass: EvaluationResponse["per_class_metrics"],
+): APMetrics {
+  if (perClass.length === 0) {
+    return { map50: 0, map75: 0, map50_95: 0 };
+  }
+  const n = perClass.length;
+  return {
+    map50: perClass.reduce((s, m) => s + m.ap50, 0) / n,
+    map75: perClass.reduce((s, m) => s + m.ap75, 0) / n,
+    map50_95: perClass.reduce((s, m) => s + m.ap50_95, 0) / n,
+  };
+}
+
+/**
+ * Synthesize an "all" PR curve by averaging precision over the non-empty
+ * per-class curves at each of 101 evenly spaced recall points (COCO convention).
+ */
+function synthesizeAllCurve(perClassCurves: PRCurve[]): PRCurve {
+  if (perClassCurves.length === 0) {
+    return { class_name: "all", points: [], ap: 0 };
+  }
+
+  const GRID = 101;
+  const points: PRCurve["points"] = [];
+
+  for (let i = 0; i < GRID; i++) {
+    const recall = i / (GRID - 1);
+    let precisionSum = 0;
+    let confSum = 0;
+    let count = 0;
+
+    for (const curve of perClassCurves) {
+      if (curve.points.length === 0) continue;
+
+      // Max precision among points with recall >= grid point; stays 0 when none reach it
+      let maxP = 0;
+      let bestConf = 0;
+      for (const pt of curve.points) {
+        if (pt.recall >= recall && pt.precision > maxP) {
+          maxP = pt.precision;
+          bestConf = pt.confidence;
+        }
+      }
+      precisionSum += maxP;
+      confSum += bestConf;
+      count++;
+    }
+
+    if (count > 0) {
+      points.push({
+        recall,
+        precision: precisionSum / count,
+        confidence: confSum / count,
+      });
+    }
+  }
+
+  // AP = mean of the averaged precision over the emitted recall grid points
+  const ap =
+    points.length > 0
+      ? points.reduce((s, p) => s + p.precision, 0) / points.length
+      : 0;
+
+  return { class_name: "all", points, ap };
+}
+
+/**
+ * Filter an EvaluationResponse by excluding certain classes.
+ *
+ * Drops excluded classes from PR curves, per-class metrics, and the confusion
+ * matrix, then rebuilds the "all" PR curve and mAP; other fields pass through.
+ */
+function filterEvaluation(
+  data: EvaluationResponse,
+  excluded: Set<string>,
+): EvaluationResponse {
+  // Filter per-class data
+  const filteredPerClass = data.per_class_metrics.filter(
+    (m) => !excluded.has(m.class_name),
+  );
+  const filteredCurves = data.pr_curves.filter(
+    (c) => c.class_name !== "all" && !excluded.has(c.class_name),
+  );
+
+  // Recompute aggregate metrics from filtered subset
+  const apMetrics = recomputeMapMetrics(filteredPerClass);
+  const allCurve = synthesizeAllCurve(filteredCurves);
+
+  // Filter confusion matrix: keep only the rows and columns of included labels
+  const includedIndices: number[] = [];
+  const includedLabels: string[] = [];
+  for (let i = 0; i < data.confusion_matrix_labels.length; i++) {
+    if (!excluded.has(data.confusion_matrix_labels[i])) {
+      includedIndices.push(i);
+      includedLabels.push(data.confusion_matrix_labels[i]);
+    }
+  }
+  const filteredMatrix = includedIndices.map((ri) =>
+    includedIndices.map((ci) => data.confusion_matrix[ri][ci]),
+  );
+
+  return {
+    ...data,
+    pr_curves: [allCurve, ...filteredCurves],
+    ap_metrics: apMetrics,
+    per_class_metrics: filteredPerClass,
+    confusion_matrix: filteredMatrix,
+    confusion_matrix_labels: includedLabels,
+  };
+}
+
+/**
+ * Hook returning a class-filtered EvaluationResponse, cached per exclusion set.
+ * Yields `undefined` without data, and the original data when nothing is excluded.
+ *
+ * The cache is keyed by the serialized excluded set and scoped to the
+ * lifetime of the raw `data` reference (reset when a different object arrives).
+ */
+export function useFilteredEvaluation(
+  data: EvaluationResponse | undefined,
+  excludedClasses: Set<string>,
+): EvaluationResponse | undefined {
+  const cacheRef = useRef<{
+    sourceData: EvaluationResponse | undefined;
+    results: Map<string, EvaluationResponse>;
+  }>({ sourceData: undefined, results: new Map() });
+
+  return useMemo(() => {
+    if (!data) return undefined;
+    if (excludedClasses.size === 0) return data;
+
+    // Start a fresh cache when `data` is a different object than the cached source
+    if (cacheRef.current.sourceData !== data) {
+      cacheRef.current = { sourceData: data, results: new Map() };
+    }
+
+    const key = cacheKey(excludedClasses);
+    const cached = cacheRef.current.results.get(key);
+    if (cached) return cached;
+
+    const result = filterEvaluation(data, excludedClasses);
+    cacheRef.current.results.set(key, result);
+    return result;
+  }, [data, excludedClasses]);
+}

From aec9d6b52772076a2665fba36fe5b5c6bd986f27 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 16:26:38 -0500
Subject: [PATCH 04/38] docs: start milestone v1.2 Classification Dataset
 Support

---
 .planning/PROJECT.md | 26 +++++++++++++++++++++++++-
 .planning/STATE.md   | 18 ++++++++----------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md
index b7f7f13..ac3425b 100644
--- a/.planning/PROJECT.md
+++ b/.planning/PROJECT.md
@@ -8,6 +8,19 @@ DataVisor is an open-source dataset introspection tool for computer vision — a
 
 A single tool that replaces scattered one-off scripts: load any CV dataset, visually browse with annotation overlays, compare ground truth against predictions, cluster via embeddings, and surface mistakes — all in one workflow.
 
+## Current Milestone: v1.2 Classification Dataset Support
+
+**Goal:** Add first-class support for single-label classification datasets with full feature parity to detection workflows.
+
+**Target features:**
+- Auto-detect dataset type (detection vs classification) from annotation format
+- JSONL classification ingestion (Roboflow format: image/prefix/suffix)
+- Grid browsing with class label overlays
+- Classification prediction import and GT vs predicted comparison
+- Classification-specific stats: accuracy, F1, per-class precision/recall, confusion matrix
+- Embedding visualization and clustering for classification datasets
+- Filter/search by class label
+
 ## Current State
 
 **Shipped:** v1.1 (2026-02-13)
@@ -33,6 +46,16 @@ A single tool that replaces scattered one-off scripts: load any CV dataset, visu
 - Interactive discovery: confusion matrix, near-duplicates, histogram filtering, find-similar — v1.1
 - Keyboard shortcuts: 16 shortcuts across grid, modal, triage, editing — v1.1
 
+### Active
+
+- [ ] Auto-detect dataset type from annotation format (COCO JSON → detection, JSONL → classification)
+- [ ] JSONL classification ingestion with multi-split support
+- [ ] Grid browsing with class label overlays for classification datasets
+- [ ] Classification prediction import and GT vs predicted comparison
+- [ ] Classification stats: accuracy, F1, per-class precision/recall, confusion matrix
+- [ ] Embedding visualization for classification datasets
+- [ ] Filter/search by class label
+
 ### Out of Scope
 
 - Multi-user collaboration — personal tool, single-user auth only
@@ -40,6 +63,7 @@ A single tool that replaces scattered one-off scripts: load any CV dataset, visu
 - Training pipeline integration — DataVisor inspects data, doesn't train
 - Mobile/tablet interface — desktop browser only
 - Full annotation editor (polygons, segmentation) — bounding box only
+- Multi-label classification — single-label per image only for v1.2
 
 ## Constraints
 
@@ -68,4 +92,4 @@ A single tool that replaces scattered one-off scripts: load any CV dataset, visu
 | Moondream2 via transformers | trust_remote_code with all_tied_weights_keys patch for transformers 5.x | Fragile — monitor updates |
 
 ---
-*Last updated: 2026-02-13 after v1.1 milestone completion*
+*Last updated: 2026-02-18 after v1.2 milestone started*
diff --git a/.planning/STATE.md b/.planning/STATE.md
index 61f341d..18f5796 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -2,19 +2,17 @@
 
 ## Project Reference
 
-See: .planning/PROJECT.md (updated 2026-02-13)
+See: .planning/PROJECT.md (updated 2026-02-18)
 
 **Core value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
-**Current focus:** v1.1 shipped. No active milestone.
+**Current focus:** v1.2 Classification Dataset Support
 
 ## Current Position
 
-Phase: --
-Plan: --
-Status: Between milestones (v1.1 shipped, v1.2 not started)
-Last activity: 2026-02-13 -- v1.1 milestone archived
-
-Progress: [████████████████████████████████████████████████████████████] v1.1: 41/41 plans complete
+Phase: Not started (defining requirements)
+Plan: —
+Status: Defining requirements
+Last activity: 2026-02-18 — Milestone v1.2 started
 
 ## Performance Metrics
 
@@ -49,6 +47,6 @@ None active.
 
 ## Session Continuity
 
-Last session: 2026-02-13
-Stopped at: v1.1 milestone archived
+Last session: 2026-02-18
+Stopped at: v1.2 milestone initialization
 Resume file: None

From 7f9043dfcdbaf5165b3c763c693e73ed7622762f Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 16:39:58 -0500
Subject: [PATCH 05/38] docs: complete project research

---
 .planning/research/ARCHITECTURE.md | 1332 +++++++++-------------------
 .planning/research/FEATURES.md     | 1176 ++++++------------------
 .planning/research/PITFALLS.md     |  871 ++++++------------
 .planning/research/STACK.md        |  897 +++++--------------
 .planning/research/SUMMARY.md      |  739 ++++-----------
 5 files changed, 1362 insertions(+), 3653 deletions(-)

diff --git a/.planning/research/ARCHITECTURE.md b/.planning/research/ARCHITECTURE.md
index 536938c..22a4287 100644
--- a/.planning/research/ARCHITECTURE.md
+++ b/.planning/research/ARCHITECTURE.md
@@ -1,1057 +1,549 @@
-# Architecture Research: v1.1 Feature Integration
+# Architecture Patterns: Classification Dataset Support
 
-**Domain:** CV Dataset Introspection Tooling -- Feature Integration into Existing Architecture
-**Researched:** 2026-02-12
-**Confidence:** HIGH (grounded in codebase analysis of 12,720 LOC across 50+ source files)
+**Domain:** Single-label classification integration into existing detection-centric DataVisor
+**Researched:** 2026-02-18
+**Confidence:** HIGH -- based on direct codebase analysis, no external dependencies needed
 
 ---
 
-## Existing Architecture Snapshot
+## Executive Summary
 
-Before defining integration points, here is the current v1.0 architecture as built (not as planned -- verified against actual source files):
+Classification support requires threading a `dataset_type` discriminator through every layer of the stack: schema, ingestion, API responses, frontend rendering, and evaluation. The key architectural decision is to **reuse the existing `annotations` table with nullable bbox columns** rather than creating a separate table. This preserves all existing query patterns, filtering, and statistics while classification annotations simply have `NULL` bbox values. The frontend conditionally renders class labels (pill/chip) vs bounding boxes based on the dataset type, and the evaluation service branches between detection metrics (mAP/IoU) and classification metrics (accuracy/precision/recall/F1).
 
-```
-CURRENT ARCHITECTURE (v1.0 -- 12,720 LOC)
-==========================================
-
-Frontend (Next.js 16 + React 19)                    Backend (FastAPI + Python 3.14)
---------------------------------------               ------------------------------------
-app/page.tsx          -- Dataset list                app/main.py          -- Lifespan, CORS, router mounts
-app/datasets/[id]/    -- Dataset view                app/config.py        -- Pydantic Settings (env prefix DATAVISOR_)
-                                                     app/dependencies.py  -- DI: get_db, get_cursor, get_*_service
-3 Zustand stores:
-  filter-store.ts     -- Filters, selection          9 Routers:
-  ui-store.ts         -- Modal, tabs, sources          datasets.py, samples.py, images.py, views.py,
-  embedding-store.ts  -- Lasso selection               statistics.py, embeddings.py, similarity.py,
-                                                       agent.py, vlm.py
-14 Hooks (TanStack Query):
-  use-samples.ts      -- Infinite scroll             7 Services:
-  use-annotations.ts  -- Batch + per-sample            ingestion.py, embedding_service.py, reduction_service.py,
-  use-error-analysis.ts                                similarity_service.py, vlm_service.py,
-  use-evaluation.ts                                    error_analysis.py, evaluation.py, agent_service.py
-  use-embedding-progress.ts                            filter_builder.py, image_service.py
-  use-vlm-progress.ts
-  ... (8 more)                                       Data Layer:
-                                                       DuckDB (data/datavisor.duckdb) -- 6 tables
-lib/api.ts           -- apiFetch, apiPost, etc.        Qdrant (data/qdrant/) -- local mode, disk-persisted
-lib/constants.ts     -- API_BASE, PAGE_SIZE            StorageBackend (fsspec: local + GCS)
-lib/color-hash.ts    -- Deterministic class colors
-
-Component Tree:                                      DuckDB Tables:
-  grid/image-grid.tsx    (TanStack Virtual)            datasets, samples, annotations, categories,
-  grid/grid-cell.tsx                                   saved_views, embeddings
-  grid/annotation-overlay.tsx (SVG-based)
-  detail/sample-modal.tsx (HTML dialog)
-  detail/annotation-list.tsx
-  detail/similarity-panel.tsx
-  embedding/embedding-scatter.tsx (deck.gl)
-  embedding/lasso-overlay.tsx
-  filters/filter-sidebar.tsx
-  stats/stats-dashboard.tsx (6 sub-panels)
-  toolbar/auto-tag-button.tsx
-```
+---
 
-### Key Architectural Properties
+## Existing Architecture Snapshot (Relevant Surfaces)
 
-1. **DuckDB is single-connection, cursor-per-request** (`app/dependencies.py:24-32`)
-2. **Qdrant runs in LOCAL mode** (no Docker service -- `QdrantClient(path=...)` in `similarity_service.py:27`)
-3. **SSE pattern established** -- 4 existing SSE streams (ingestion, embeddings, reduction, VLM)
-4. **Services are injected via `app.state`** at lifespan startup, retrieved via `get_*` dependencies
-5. **Annotation overlay uses SVG** (NOT react-konva) -- `annotation-overlay.tsx` renders `<svg>` with `<rect>` elements
-6. **Frontend talks to `http://localhost:8000`** by default (`NEXT_PUBLIC_API_URL` env var)
-7. **No auth exists** -- CORS allows all origins (`allow_origins=["*"]`)
-8. **No Docker files exist** -- project runs via `uvicorn` and `next dev` directly
+Before defining integration points, here are the exact existing structures that classification support touches:
 
----
+### DuckDB Schema (from `duckdb_repo.py`)
+```sql
+-- datasets: NO dataset_type column. format is always "coco".
+datasets(id, name, format, source_path, image_dir, image_count,
+         annotation_count, category_count, prediction_count, created_at, metadata)
 
-## Feature 1: Docker Deployment
+-- annotations: bbox columns are NOT NULL. Every row must have bbox values.
+annotations(id, dataset_id, sample_id, category_name,
+            bbox_x DOUBLE NOT NULL, bbox_y DOUBLE NOT NULL,
+            bbox_w DOUBLE NOT NULL, bbox_h DOUBLE NOT NULL,
+            area, is_crowd, source, confidence, metadata)
 
-### Compose Topology
+-- samples: dataset-agnostic, works for both detection and classification
+samples(id, dataset_id, file_name, width, height, thumbnail_path, split, metadata, image_dir, tags)
 
-```
-docker-compose.yml
-==================
-
-                    +-----------------+
-                    |     nginx       |  :80 / :443
-                    |  (reverse proxy)|
-                    +--------+--------+
-                             |
-              +--------------+--------------+
-              |                             |
-    +---------v--------+          +---------v--------+
-    |     backend      |          |     frontend     |
-    |  FastAPI + DuckDB|          |   Next.js 16     |
-    |  (uvicorn :8000) |          |  (standalone)    |
-    |                  |          |  (:3000)          |
-    +--------+---------+          +------------------+
-             |
-    +--------v---------+
-    |      qdrant       |
-    |  (qdrant/qdrant)  |
-    |  :6333 (REST)     |
-    |  :6334 (gRPC)     |
-    +-------------------+
-
-Volumes:
-  - data_volume:/app/data        (DuckDB + thumbnails, mounted into backend)
-  - qdrant_storage:/qdrant/storage (Qdrant persistent data)
-  - images:/data/images           (bind mount for local image datasets)
+-- categories: dataset-agnostic, works for both types
+categories(dataset_id, category_id, name, supercategory)
 ```
 
-### Integration Points
+### Ingestion Pipeline (from `ingestion.py`, `coco_parser.py`, `folder_scanner.py`)
+- `FolderScanner` detects COCO layouts only (checks for `"images"` key in JSON)
+- `IngestionService.ingest_with_progress()` hardcodes `COCOParser()`
+- `ScanResult.format` is always `"coco"`
+- All parsers yield DataFrames with bbox columns
 
-**Files to create:**
-| File | Purpose |
-|------|---------|
-| `Dockerfile.backend` | Multi-stage: Python 3.14, install deps, copy app/, expose 8000 |
-| `Dockerfile.frontend` | Multi-stage: Node 22, build standalone, expose 3000 |
-| `docker-compose.yml` | 4 services: backend, frontend, qdrant, nginx |
-| `nginx/default.conf` | Reverse proxy: `/api/*` -> backend:8000, `/*` -> frontend:3000 |
-| `.env.docker` | Docker-specific env vars |
+### Evaluation (from `evaluation.py`)
+- `compute_evaluation()` builds `sv.Detections` objects with xyxy bounding boxes
+- IoU matching is hardcoded throughout (no concept of non-spatial matching)
+- `_load_detections()` queries `bbox_x, bbox_y, bbox_w, bbox_h` from annotations
+- Response model: `EvaluationResponse` has `pr_curves`, `ap_metrics`, `iou_threshold`
 
-**Files to modify:**
-| File | Change | Rationale |
-|------|--------|-----------|
-| `app/config.py` | Add `qdrant_url` setting (default `None` = local mode, set to `http://qdrant:6333` in Docker) | Switch between local Qdrant (dev) and Docker Qdrant (prod) |
-| `app/services/similarity_service.py` | Conditional: `QdrantClient(path=...)` vs `QdrantClient(url=...)` based on `qdrant_url` setting | Current code hardcodes local mode |
-| `frontend/next.config.ts` | Add `output: "standalone"` for Docker-optimized builds | Reduces image size from ~1GB to ~100MB |
-| `frontend/src/lib/constants.ts` | Already uses `NEXT_PUBLIC_API_URL` env var -- no change needed | Works as-is with Docker |
+### Frontend (from `annotation-overlay.tsx`, `grid-cell.tsx`, `sample-modal.tsx`)
+- `AnnotationOverlay` renders SVG `<rect>` elements using `ann.bbox_x/y/w/h`
+- `Annotation` type has required `bbox_x/y/w/h: number` fields
+- `SampleModal` shows annotation editor (Konva bbox editing), annotation table with bbox columns
+- `EvaluationPanel` shows PR curves, mAP cards, per-class AP table, confusion matrix
 
-### Qdrant Mode Switch Design
+---
 
-The critical architecture decision is Qdrant's mode. Currently `SimilarityService.__init__` creates a local-mode client:
+## Recommended Architecture
 
-```python
-# CURRENT (app/services/similarity_service.py:27)
-self.client = QdrantClient(path=str(path))
-
-# PROPOSED -- conditional based on settings
-settings = get_settings()
-if settings.qdrant_url:
-    # Docker mode: connect to Qdrant service
-    self.client = QdrantClient(url=settings.qdrant_url)
-else:
-    # Dev mode: local embedded storage
-    path = Path(qdrant_path)
-    path.mkdir(parents=True, exist_ok=True)
-    self.client = QdrantClient(path=str(path))
-```
+### High-Level Integration Pattern
 
-### DuckDB in Docker
-
-DuckDB is embedded (in-process) -- it runs INSIDE the backend container. The `.duckdb` file must persist across container restarts via a Docker volume:
-
-```yaml
-# docker-compose.yml (backend service)
-services:
-  backend:
-    build:
-      context: .
-      dockerfile: Dockerfile.backend
-    volumes:
-      - data_volume:/app/data          # DuckDB + thumbnails persist here
-      - /path/to/images:/data/images:ro # Bind-mount image datasets (read-only)
-    environment:
-      - DATAVISOR_DB_PATH=/app/data/datavisor.duckdb
-      - DATAVISOR_THUMBNAIL_CACHE_DIR=/app/data/thumbnails
-      - DATAVISOR_QDRANT_URL=http://qdrant:6333
-    ports:
-      - "8000:8000"
 ```
-
-**Single worker constraint remains:** DuckDB requires `--workers 1` in Docker too. The `CMD` should be `uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 1`.
-
-### Backend Dockerfile Pattern
-
-```dockerfile
-# Multi-stage build for Python 3.14 + uv
-FROM python:3.14-slim AS builder
-WORKDIR /app
-COPY pyproject.toml uv.lock ./
-RUN pip install uv && uv sync --frozen --no-dev
-
-FROM python:3.14-slim AS runtime
-WORKDIR /app
-COPY --from=builder /app/.venv /app/.venv
-COPY app/ ./app/
-COPY plugins/ ./plugins/
-ENV PATH="/app/.venv/bin:$PATH"
-EXPOSE 8000
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
+                    dataset_type = "detection" | "classification"
+                              |
+        +---------------------+---------------------+
+        |                     |                     |
+    Ingestion             Rendering            Evaluation
+    (parser per          (conditional          (metric strategy
+     format)              overlay)              per type)
 ```
 
-**GPU consideration:** The base image does NOT include CUDA. For VLM/embedding features in Docker, users either (a) use CPU-only inference (slow but works), or (b) use `nvidia/cuda` base image with GPU passthrough. Recommend CPU-only as default Docker profile, GPU as optional override.
-
-### Frontend Dockerfile Pattern
-
-```dockerfile
-FROM node:22-alpine AS builder
-WORKDIR /app
-COPY frontend/package.json frontend/package-lock.json ./
-RUN npm ci
-COPY frontend/ .
-ENV NEXT_PUBLIC_API_URL=/api
-RUN npm run build
-
-FROM node:22-alpine AS runner
-WORKDIR /app
-COPY --from=builder /app/.next/standalone ./
-COPY --from=builder /app/.next/static ./.next/static
-COPY --from=builder /app/public ./public
-EXPOSE 3000
-CMD ["node", "server.js"]
-```
+The `dataset_type` field on the `datasets` table is the single source of truth that drives conditional behavior across all layers. Every component reads this value and branches accordingly. Simple if/else branching at well-defined boundary points is sufficient -- no polymorphism or plugin system is needed.
 
-### Nginx Reverse Proxy
+### Component Boundaries
 
-```nginx
-# nginx/default.conf
-upstream backend {
-    server backend:8000;
-}
-upstream frontend {
-    server frontend:3000;
-}
+| Component | Responsibility | Communicates With | Change Type |
+|-----------|---------------|-------------------|-------------|
+| `datasets` table | Stores `dataset_type` column | All components read it | ADD column |
+| `annotations` table | Stores all annotations (bbox nullable for classification) | Parsers write, API reads | ALTER bbox to nullable |
+| `ClassificationFolderParser` | Parses folder-of-folders layout | IngestionService | NEW |
+| `ClassificationPredictionParser` | Parses classification prediction CSV/JSON | Ingestion router | NEW |
+| `IngestionService` | Routes to correct parser based on format | Parsers, DuckDB | MODIFY |
+| `FolderScanner` | Auto-detects dataset format | Ingestion router | MODIFY |
+| `classification_evaluation.py` | Computes accuracy/F1/confusion matrix | Statistics router | NEW |
+| `AnnotationOverlay` (frontend) | Renders bbox SVG or class label pill | GridCell, SampleModal | MODIFY |
+| `EvaluationPanel` (frontend) | Shows detection or classification metrics | Stats dashboard | MODIFY |
+| `DatasetResponse` / `AnnotationResponse` | API response models | Frontend types | MODIFY |
+
+### Data Flow: Classification Ingestion
 
-server {
-    listen 80;
-
-    # API routes -> FastAPI backend
-    location /api/ {
-        proxy_pass http://backend/;
-        proxy_set_header Host $host;
-        proxy_set_header X-Real-IP $remote_addr;
-        proxy_buffering off;           # Required for SSE streams
-        proxy_read_timeout 300s;       # Long-running SSE connections
-    }
-
-    # Everything else -> Next.js frontend
-    location / {
-        proxy_pass http://frontend;
-        proxy_set_header Host $host;
-    }
-}
+```
+User points scanner at folder
+    |
+    v
+FolderScanner detects structure:
+    folder-of-folders?  -> format = "classification_folders"
+    CSV with labels?    -> format = "classification_csv"
+    COCO with bbox?     -> format = "coco" (existing, unchanged)
+    |
+    v
+ScanResult returned with format string + detected splits
+    |
+    v
+IngestionService dispatches to ClassificationFolderParser
+    |
+    v
+Parser yields sample batches (same schema as detection -- width/height from PIL)
+    |
+    v
+Parser yields annotation batches:
+    - category_name = folder name (class label)
+    - bbox_x/y/w/h = NULL
+    - area = NULL
+    - source = "ground_truth"
+    - ONE annotation per sample (single-label classification)
+    |
+    v
+DuckDB bulk insert (same INSERT INTO annotations pattern)
+    |
+    v
+datasets row created with dataset_type = "classification"
 ```
 
-### Build Order Implication
+### Data Flow: Classification Evaluation
 
-Docker deployment is **independent of all other v1.1 features** and should be built first. It creates the deployment scaffold that other features (auth, ingestion UI) build upon.
+```
+GET /datasets/{id}/evaluation
+    |
+    v
+Router reads dataset_type from datasets table
+    |
+    v
+if dataset_type == "classification":
+    compute_classification_evaluation()    # NEW function
+else:
+    compute_evaluation()                   # existing detection path
+    |
+    v
+Classification evaluation:
+    1. Load GT: one annotation per sample (source='ground_truth')
+    2. Load preds: highest-confidence prediction per sample
+    3. Match by sample_id (no IoU, no spatial matching)
+    4. Build confusion matrix (no "background" row/col)
+    5. Compute per-class precision/recall/F1
+    6. Compute overall accuracy, macro-F1, weighted-F1
+    7. Return ClassificationEvaluationResponse
+```
 
 ---
 
-## Feature 2: Single-User Auth Middleware
+## Key Architectural Decisions
 
-### Architecture Decision: Dependency Injection (not middleware)
+### Decision 1: Reuse `annotations` Table with Nullable Bbox
 
-**Recommendation: Use FastAPI's `Depends()` pattern, NOT ASGI middleware.**
+**Recommendation:** Reuse the existing `annotations` table. Make bbox columns nullable.
 
-Rationale from research and codebase analysis:
-1. The codebase already uses `Depends()` extensively (9 dependency functions in `dependencies.py`). Adding auth as another dependency is consistent.
-2. Middleware approach would wrap ALL routes including `/health` and SSE streams, requiring complex exclusion logic.
-3. The FastAPI community consensus (GitHub Discussion #8867, #3277) strongly favors DI for auth because it is testable, composable, and explicit per-route.
-4. Single-user auth is simple: one username/password from environment variables, verified via HTTP Basic Auth.
+**Why this is clearly the right choice:**
+- Every existing query path (statistics, filtering, batch annotations, triage) filters on `category_name`, `source`, `dataset_id` -- none require non-null bbox.
+- The statistics endpoint (`GROUP BY category_name`) works identically for classification.
+- Saved views, tags, embeddings, similarity search -- all sample-level features work without changes.
+- A separate `classification_annotations` table would require parallel query paths in every service, doubling the maintenance surface.
 
-### Integration Point
+**Schema migration (in `duckdb_repo.py:initialize_schema`):**
+```sql
+-- Add dataset_type to datasets
+ALTER TABLE datasets ADD COLUMN IF NOT EXISTS dataset_type VARCHAR DEFAULT 'detection';
 
-**File to create:**
-| File | Purpose |
-|------|---------|
-| `app/auth.py` | `verify_credentials()` dependency using `fastapi.security.HTTPBasic` |
+-- Make bbox columns nullable for classification support
+-- DuckDB supports DROP NOT NULL via ALTER TABLE
+ALTER TABLE annotations ALTER COLUMN bbox_x DROP NOT NULL;
+ALTER TABLE annotations ALTER COLUMN bbox_y DROP NOT NULL;
+ALTER TABLE annotations ALTER COLUMN bbox_w DROP NOT NULL;
+ALTER TABLE annotations ALTER COLUMN bbox_h DROP NOT NULL;
+```
 
-**Files to modify:**
-| File | Change |
-|------|--------|
-| `app/config.py` | Add `auth_username: str = "admin"` and `auth_password: str` settings |
-| `app/main.py` | Add auth dependency to ALL router includes (single line each) |
-| `app/routers/*.py` | No changes -- auth applied at router level via `dependencies=[Depends(verify_auth)]` |
+**Risk note (MEDIUM confidence):** DuckDB's `ALTER COLUMN DROP NOT NULL` syntax needs verification against the DuckDB version in use. Fallback approach: change the `CREATE TABLE IF NOT EXISTS annotations` statement to remove `NOT NULL` from the bbox columns. Because the table already exists in current databases, this change alone does nothing -- it must be combined with a migration that creates a new table, copies the data, drops the old table, and renames the new one. Verify during implementation.
 
-### Implementation Pattern
+**Simpler fallback:** Change the `CREATE TABLE` statement to not have `NOT NULL` on bbox columns. For existing databases, store classification bbox as `0.0` instead of `NULL`. This avoids ALTER entirely but is semantically less clean. The code paths would check `bbox_w == 0 AND bbox_h == 0` as "no bbox" rather than `IS NULL`.
 
-```python
-# app/auth.py
-import secrets
-from fastapi import Depends, HTTPException, status
-from fastapi.security import HTTPBasic, HTTPBasicCredentials
-from app.config import get_settings
-
-security = HTTPBasic()
-
-def verify_auth(credentials: HTTPBasicCredentials = Depends(security)) -> str:
-    """Verify single-user basic auth credentials.
-
-    Returns the username on success. Raises 401 on failure.
-    Uses secrets.compare_digest to prevent timing attacks.
-    """
-    settings = get_settings()
-    correct_username = secrets.compare_digest(
-        credentials.username.encode("utf-8"),
-        settings.auth_username.encode("utf-8"),
-    )
-    correct_password = secrets.compare_digest(
-        credentials.password.encode("utf-8"),
-        settings.auth_password.encode("utf-8"),
-    )
-    if not (correct_username and correct_password):
-        raise HTTPException(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Invalid credentials",
-            headers={"WWW-Authenticate": "Basic"},
-        )
-    return credentials.username
-```
+### Decision 2: `dataset_type` on `datasets` Table
 
-### Router-Level Application
+**Recommendation:** Add a `dataset_type VARCHAR DEFAULT 'detection'` column to the `datasets` table.
 
-Apply auth at the router include level in `main.py` so every endpoint on every router requires auth, without modifying individual router files:
+**Why:**
+- Single source of truth for conditional behavior across all layers.
+- Default of `'detection'` means zero migration impact on existing datasets.
+- Frontend reads it once per dataset load and threads it through props.
+- Evaluation router uses it to select the metric strategy.
+- Future types (segmentation, keypoints) extend the same pattern.
 
-```python
-# app/main.py -- modified includes
-from app.auth import verify_auth
+**Not on annotations:** All annotations in a dataset share the same type. There is no mixed detection+classification dataset in DataVisor's model. The dataset-level discriminator is sufficient.
 
-app.include_router(datasets.router, dependencies=[Depends(verify_auth)])
-app.include_router(samples.router, dependencies=[Depends(verify_auth)])
-# ... repeat for all routers
+### Decision 3: Frontend Conditional Rendering
 
-# /health remains unprotected (no dependency)
-@app.get("/health")
-async def health_check() -> dict[str, str]:
-    return {"status": "ok"}
-```
+**Recommendation:** Thread `datasetType` through component props from the dataset query. Branch at component boundaries, not deep inside components.
 
-### Frontend Auth Integration
+**Where conditional rendering applies:**
 
-The frontend `api.ts` functions (`apiFetch`, `apiPost`, `apiPatch`, `apiDelete`) all call `fetch()` directly. For Basic Auth, add the `Authorization` header:
+| Component | Detection Behavior | Classification Behavior |
+|-----------|-------------------|------------------------|
+| `AnnotationOverlay` | SVG bbox rectangles with class labels | Class label pill/chip in top-left corner |
+| `GridCell` | Overlay shows boxes | Overlay shows label pill |
+| `SampleModal` image area | SVG bbox overlays | Class label overlay (no boxes) |
+| `SampleModal` annotation table | Columns: class, bbox, area, source | Columns: class, confidence, source (no bbox) |
+| `AnnotationEditor` (Konva) | Draggable/resizable bbox editing | Class picker dropdown (no Konva canvas) |
+| `DrawLayer` / `EditableRect` | Shown in edit mode | Hidden (no bbox to draw) |
+| `EvaluationPanel` header | mAP@50, mAP@75, mAP@50:95 cards | Accuracy, Macro-F1, Weighted-F1 cards |
+| `PRCurveChart` | Shown (per-class PR curves) | Hidden (not meaningful for classification) |
+| `PerClassTable` | Columns: AP50, AP75, AP50:95, P, R | Columns: Precision, Recall, F1, Support |
+| `ConfusionMatrix` | Has "background" row/col for FP/FN | No "background" -- pure NxN class matrix |
+| `ErrorAnalysis` panel | IoU-based error categories | Misclassification categories (simpler) |
+| `PredictionImportDialog` | Accepts COCO results JSON | Accepts classification CSV/JSON |
+| Filter sidebar | Bbox area filter shown | Bbox area filter hidden |
 
+**Implementation pattern:**
 ```typescript
-// frontend/src/lib/api.ts -- modified
-function authHeaders(): HeadersInit {
-  // Credentials stored in environment variables at build time
-  // or passed via cookie/session after initial login
-  const creds = btoa(`${process.env.NEXT_PUBLIC_AUTH_USER}:${process.env.NEXT_PUBLIC_AUTH_PASS}`);
-  return { Authorization: `Basic ${creds}` };
+// Dataset type flows from page -> components
+const { data: dataset } = useDataset(datasetId);
+const datasetType = dataset?.dataset_type ?? "detection";
+
+// AnnotationOverlay branches at the top
+export function AnnotationOverlay({ annotations, imageWidth, imageHeight, datasetType }) {
+  if (datasetType === "classification") {
+    return <ClassificationLabel annotations={annotations} />;
+  }
+  // Existing SVG bbox rendering unchanged
+  return <svg viewBox={...}>...</svg>;
 }
 ```
 
-**Alternative (better UX):** Add a login page that stores credentials in `sessionStorage`, then include them in all API calls. This avoids browser Basic Auth popup.
-
-### SSE Stream Auth
+### Decision 4: Separate Evaluation Function (NOT Shared)
 
-The existing SSE pattern uses `new EventSource(url)` which does NOT support custom headers. Two options:
-
-1. **Cookie-based session** (recommended): After initial Basic Auth, set an HTTP-only session cookie. EventSource sends cookies automatically.
-2. **Query parameter token**: Pass auth token as `?token=xxx` in SSE URLs. Less secure but simpler.
-
-**Recommendation:** Use cookie-based sessions via `fastapi-sessions` or a simple signed cookie. This is the only viable approach because the existing `useEmbeddingProgress` and `useVlmProgress` hooks use `EventSource` which cannot set `Authorization` headers.
-
-### Build Order Implication
-
-Auth must come AFTER Docker (needs HTTPS for secure credential transmission) but BEFORE smart ingestion UI (new endpoints need auth).
-
----
-
-## Feature 3: Smart Ingestion UI
-
-### Current Ingestion Flow
-
-The current ingestion is API-only (`POST /datasets/ingest` with `annotation_path` and `image_dir` as strings). There is no UI -- users must know exact file paths.
-
-### Smart Ingestion Architecture
-
-The smart ingestion feature adds three new components:
-
-```
-User points at folder
-        |
-        v
-POST /datasets/scan { root_path }           <-- NEW endpoint
-        |
-        v
-FolderScanner service                        <-- NEW service
-  - Walk directory tree
-  - Detect COCO annotation files (*.json with "images" key)
-  - Detect image directories (dirs with .jpg/.png files)
-  - Detect train/val/test splits by directory naming
-  - Return structured scan result
-        |
-        v
-Response: ScanResult {
-  annotation_files: [{ path, format, est_images }],
-  image_dirs: [{ path, image_count, split_guess }],
-  suggested_imports: [{ annotation, image_dir, split, name }]
-}
-        |
-        v
-Frontend: Ingestion wizard UI               <-- NEW page/component
-  - Shows detected files and directories
-  - User confirms/adjusts import configuration
-  - Clicks "Import" -> triggers SSE ingestion stream
-        |
-        v
-POST /datasets/ingest (existing endpoint)    <-- REUSE with minor extension
-  - SSE progress stream (existing pattern)
-  - New: accept optional split parameter
-```
+**Recommendation:** Add `compute_classification_evaluation()` as a separate function. Do NOT retrofit the detection evaluation to handle both.
 
-### Backend Integration Points
-
-**Files to create:**
-| File | Purpose |
-|------|---------|
-| `app/services/folder_scanner.py` | `FolderScanner` class: walk dir tree, detect formats, suggest imports |
-| `app/models/scan.py` | Pydantic models: `ScanRequest`, `ScanResult`, `DetectedFile`, `SuggestedImport` |
-| `app/routers/ingestion.py` | New router: `POST /ingestion/scan`, mounted under auth |
-
-**Files to modify:**
-| File | Change |
-|------|--------|
-| `app/main.py` | Add `app.include_router(ingestion.router)` |
-| `app/dependencies.py` | Add `get_folder_scanner()` dependency |
-| `app/models/dataset.py` | Add optional `split` field to `IngestRequest` |
-| `app/services/ingestion.py` | Pass `split` to image batch builder (set `split` column during parsing) |
-| `app/ingestion/coco_parser.py` | Accept optional `split` parameter in `build_image_batches()` |
-
-**Frontend files to create:**
-| File | Purpose |
-|------|---------|
-| `frontend/src/app/ingest/page.tsx` | Ingestion wizard page |
-| `frontend/src/components/ingest/scan-results.tsx` | Display detected files with checkboxes |
-| `frontend/src/components/ingest/import-progress.tsx` | SSE progress display (reuses existing pattern) |
-| `frontend/src/hooks/use-scan.ts` | TanStack Query mutation for scan endpoint |
-| `frontend/src/hooks/use-ingest.ts` | SSE hook for ingestion progress (similar to `use-embedding-progress.ts`) |
-| `frontend/src/types/scan.ts` | TypeScript types matching backend Pydantic models |
-
-### Folder Scanner Design
+**Why the detection code cannot be reused:**
+- Detection evaluation builds `sv.Detections` objects with xyxy bounding boxes -- classification has none.
+- Detection uses IoU matching to determine TP/FP/FN -- classification matches by sample_id.
+- Detection's confusion matrix includes a "background" class -- classification does not.
+- Detection computes AP (area under PR curve at multiple IoU thresholds) -- classification computes F1.
+- The `_load_detections()` helper queries bbox columns that are NULL for classification.
 
+**New response model:**
 ```python
-# app/services/folder_scanner.py
-
-class FolderScanner:
-    """Walk a directory tree and detect importable CV datasets.
-
-    Detection heuristics:
-    1. JSON files containing "images" key at top level -> COCO annotation file
-    2. Directories containing 10+ image files (.jpg/.jpeg/.png) -> image directory
-    3. Directory names matching train/val/test/validation -> split assignment
-    4. Paired annotation + image directory at same level -> suggested import
-    """
+class ClassificationEvaluationResponse(BaseModel):
+    """Evaluation payload for classification datasets."""
+    accuracy: float
+    macro_precision: float
+    macro_recall: float
+    macro_f1: float
+    weighted_f1: float
+    per_class_metrics: list["ClassificationPerClassMetrics"]  # forward ref; class is defined below
+    confusion_matrix: list[list[int]]
+    confusion_matrix_labels: list[str]
+    conf_threshold: float
+
+class ClassificationPerClassMetrics(BaseModel):
+    class_name: str
+    precision: float
+    recall: float
+    f1: float
+    support: int  # number of GT samples for this class
+```
+
+**Router branching:**
+```python
+@router.get("/{dataset_id}/evaluation")
+def get_evaluation(dataset_id, source, iou_threshold, conf_threshold, split, db):
+    cursor = db.connection.cursor()
+    dataset_type = cursor.execute(
+        "SELECT dataset_type FROM datasets WHERE id = ?", [dataset_id]
+    ).fetchone()[0]
 
-    def scan(self, root_path: str) -> ScanResult:
-        ...
+    if dataset_type == "classification":
+        return compute_classification_evaluation(
+            cursor, dataset_id, source, conf_threshold, split
+        )
+    else:
+        return compute_evaluation(
+            cursor, dataset_id, source, iou_threshold, conf_threshold, split
+        )
 ```
 
-### SSE Pattern Reuse
-
-The existing SSE pattern in `datasets.py:37-73` (wrapping `IngestionService.ingest_with_progress()` as a `StreamingResponse`) is directly reusable. The new ingestion UI will call the same `POST /datasets/ingest` endpoint and consume the same SSE event format.
-
-### Build Order Implication
-
-Smart ingestion depends on Docker (for deployment context) and auth (new endpoints need auth). Can be built independently of annotation editing and error triage.
-
----
-
-## Feature 4: Annotation Editing (Browser-Based BBox Editing)
-
-### Critical Observation: Current Overlay is SVG, NOT react-konva
-
-The milestone context mentions react-konva, but **react-konva is NOT in the project**. The current annotation rendering in `annotation-overlay.tsx` is pure SVG:
+**Frontend union type:**
+```typescript
+// The hook returns different shapes based on dataset_type
+// Use a discriminated union, or simply check for the presence of the `accuracy` field
+type EvaluationData = DetectionEvaluationResponse | ClassificationEvaluationResponse;
 
-```tsx
-// CURRENT: frontend/src/components/grid/annotation-overlay.tsx
-<svg viewBox={`0 0 ${imageWidth} ${imageHeight}`} ...>
-  {annotations.map((ann) => (
-    <rect x={ann.bbox_x} y={ann.bbox_y} width={ann.bbox_w} height={ann.bbox_h} ... />
-  ))}
-</svg>
+function isClassificationEval(data: EvaluationData): data is ClassificationEvaluationResponse {
+  return "accuracy" in data;
+}
 ```
 
-### Architecture Decision: Use Konva ONLY in Detail Modal, Keep SVG for Grid
+### Decision 5: Ingestion Auto-Detection via FolderScanner
 
-**Recommendation: Do NOT replace the grid overlay with react-konva.** Introduce react-konva ONLY in the sample detail modal for editing.
-
-Rationale:
-1. The SVG grid overlay works well for read-only display at scale (dozens of cells visible simultaneously). Replacing SVG with canvas per grid cell would multiply canvas contexts and hurt performance.
-2. Editing happens in the detail modal (`sample-modal.tsx`), where only ONE image is displayed at a time. This is the right place for an interactive canvas.
-3. Konva's `Transformer` component provides native drag/resize handles for bounding boxes.
-4. The grid overlay continues rendering the latest annotation data from the server (refetched after edits).
-
-### Component Architecture for Annotation Editing
+**Recommendation:** Extend `FolderScanner.scan()` to detect classification layouts BEFORE falling through to COCO detection. Classification layouts are cheaper to detect (structural directory patterns, no JSON parsing needed).
 
+**Classification layout: folder-of-folders (ImageNet-style)**
 ```
-sample-modal.tsx (MODIFIED)
-  |
-  +-- [Read-only mode] AnnotationOverlay (SVG, existing)
-  |
-  +-- [Edit mode] AnnotationEditor (NEW, react-konva)
-        |
-        +-- <Stage> with <Layer>
-        |     |
-        |     +-- <Image> (full-res image as Konva.Image)
-        |     +-- <Rect> per annotation (draggable)
-        |     +-- <Transformer> (attached to selected rect)
-        |
-        +-- EditToolbar (NEW)
-        |     |
-        |     +-- Select / Move / Delete buttons
-        |     +-- Save / Cancel buttons
-        |
-        +-- Zustand: useAnnotationEditStore (NEW store)
-              |
-              +-- editingAnnotations: Annotation[] (local copy during edit)
-              +-- selectedAnnotationId: string | null
-              +-- isDirty: boolean
-              +-- saveEdits() -> PATCH /annotations/batch
-              +-- discardEdits()
+dataset/
+  train/
+    cat/          # class label = directory name
+      img001.jpg
+      img002.jpg
+    dog/
+      img003.jpg
+  val/
+    cat/
+      img004.jpg
+    dog/
+      img005.jpg
 ```
 
-### Integration Points
+Detection heuristic:
+1. The root (or each split subdirectory such as `train/` or `val/`) contains subdirectories whose names are NOT known split names.
+2. Those subdirectories contain image files (no JSON files).
+3. Multiple sibling class directories exist (>= 2 classes).
 
-**New npm dependency:**
+**Classification layout: CSV labels**
 ```
-npm install react-konva konva
+dataset/
+  labels.csv       # columns: filename, label (or image, class)
+  images/
+    img001.jpg
 ```
 
-**Files to create:**
-| File | Purpose |
-|------|---------|
-| `frontend/src/components/detail/annotation-editor.tsx` | react-konva Stage with draggable/resizable Rects |
-| `frontend/src/components/detail/edit-toolbar.tsx` | Edit mode controls (select, delete, save, cancel) |
-| `frontend/src/stores/annotation-edit-store.ts` | Zustand store for edit-mode state (NEW 4th store) |
-| `frontend/src/types/annotation-edit.ts` | Types for annotation edit operations |
-
-**Backend files to create:**
-| File | Purpose |
-|------|---------|
-| `app/routers/annotations.py` | New router: `PATCH /annotations/batch`, `DELETE /annotations/{id}` |
-| `app/models/annotation.py` (modify) | Add `AnnotationUpdateRequest` model |
-
-**Files to modify:**
-| File | Change |
-|------|--------|
-| `frontend/src/components/detail/sample-modal.tsx` | Add toggle between read (SVG) and edit (Konva) modes |
-| `frontend/package.json` | Add `react-konva` and `konva` dependencies |
-| `app/main.py` | Add `app.include_router(annotations.router)` |
-
-### Backend API for Annotation Updates
+Detection heuristic:
+1. Root contains a CSV file.
+2. CSV has 2+ columns, first column values match filenames in an image directory.
 
+**Scanner modification (in `folder_scanner.py`):**
 ```python
-# app/routers/annotations.py (NEW)
-
-@router.patch("/annotations/batch")
-def update_annotations_batch(
-    request: AnnotationBatchUpdateRequest,
-    db: DuckDBRepo = Depends(get_db),
-) -> dict:
-    """Update bbox coordinates for multiple annotations.
-
-    Used by the frontend annotation editor to save moved/resized boxes.
-    Only ground_truth annotations are editable (predictions are immutable).
-    """
-    cursor = db.connection.cursor()
-    try:
-        for update in request.updates:
-            cursor.execute(
-                "UPDATE annotations SET bbox_x = ?, bbox_y = ?, bbox_w = ?, bbox_h = ?, "
-                "area = ? * ? WHERE id = ? AND source = 'ground_truth'",
-                [update.bbox_x, update.bbox_y, update.bbox_w, update.bbox_h,
-                 update.bbox_w, update.bbox_h, update.id],
-            )
-    finally:
-        cursor.close()
-    return {"updated": len(request.updates)}
-
-
-@router.delete("/annotations/{annotation_id}")
-def delete_annotation(
-    annotation_id: str,
-    db: DuckDBRepo = Depends(get_db),
-) -> None:
-    """Delete a single annotation. Only ground_truth annotations are deletable."""
-    cursor = db.connection.cursor()
-    try:
-        cursor.execute(
-            "DELETE FROM annotations WHERE id = ? AND source = 'ground_truth'",
-            [annotation_id],
-        )
-    finally:
-        cursor.close()
-```
+def scan(self, root_path: str) -> ScanResult:
+    # 1. Try classification folder-of-folders (cheapest check)
+    splits = self._try_classification_folders(root, warnings)
+    if splits:
+        return ScanResult(format="classification_folders", splits=splits, ...)
 
-### Konva Transformer Integration
+    # 2. Try classification CSV
+    splits = self._try_classification_csv(root, warnings)
+    if splits:
+        return ScanResult(format="classification_csv", splits=splits, ...)
 
-The key technical pattern from Konva docs: the Transformer changes `scaleX`/`scaleY`, not `width`/`height`. On `onTransformEnd`, compute the new bbox from the node's position and scale:
-
-```typescript
-// Pattern for annotation-editor.tsx
-const handleTransformEnd = (e: KonvaEventObject<Event>) => {
-  const node = e.target;
-  const scaleX = node.scaleX();
-  const scaleY = node.scaleY();
-
-  // Reset scale, apply to dimensions
-  node.scaleX(1);
-  node.scaleY(1);
-
-  const updated: AnnotationUpdate = {
-    id: node.id(),
-    bbox_x: node.x(),
-    bbox_y: node.y(),
-    bbox_w: Math.max(5, node.width() * scaleX),
-    bbox_h: Math.max(5, node.height() * scaleY),
-  };
-
-  editStore.updateAnnotation(updated);
-};
+    # 3. Fall through to existing COCO detection (unchanged)
+    splits = self._try_layout_b(root, warnings)
+    if not splits:
+        splits = self._try_layout_a(root, warnings)
+    if not splits:
+        splits = self._try_layout_c(root, warnings)
+    return ScanResult(format="coco", splits=splits, ...)
 ```
 
-### Data Flow for Annotation Edits
+**Important:** The `ScanResult.format` field is currently always `"coco"`. With this change it carries the actual detected format string (`"coco"`, `"classification_folders"`, or `"classification_csv"`), which drives parser dispatch in `IngestionService`.
 
-```
-User clicks "Edit" in sample modal
-  -> AnnotationEditStore.startEditing(annotations)  // copy current annotations
-  -> Modal switches from SVG AnnotationOverlay to Konva AnnotationEditor
-  -> User drags/resizes boxes (Konva handles visual updates in real-time)
-  -> User clicks "Save"
-  -> AnnotationEditStore.saveEdits()
-    -> PATCH /annotations/batch { updates: [...] }
-    -> On success: invalidate TanStack Query cache for this sample's annotations
-    -> Modal switches back to SVG AnnotationOverlay
-    -> Grid refetches batch annotations (sees updated boxes)
-```
-
-### Build Order Implication
+---
 
-Annotation editing depends on the sample modal existing (already built). It is independent of Docker, auth, and smart ingestion. Can be built in parallel with error triage.
+## New Components to Build
+
+### Backend
+
+| Component | File | Purpose |
+|-----------|------|---------|
+| `ClassificationFolderParser` | `app/ingestion/classification_folder_parser.py` | Parse ImageNet-style folder-of-folders into samples + annotations |
+| `ClassificationCSVParser` | `app/ingestion/classification_csv_parser.py` | Parse CSV label files into samples + annotations |
+| `ClassificationPredictionParser` | `app/ingestion/classification_prediction_parser.py` | Parse classification prediction CSV/JSON |
+| `compute_classification_evaluation` | `app/services/classification_evaluation.py` | Accuracy/F1/confusion matrix (pure numpy) |
+| `ClassificationEvaluationResponse` | `app/models/evaluation.py` | Response model for classification metrics |
+| Schema migration | `app/repositories/duckdb_repo.py` | `dataset_type` column, nullable bbox columns |
+| Scanner extensions | `app/services/folder_scanner.py` | `_try_classification_folders()`, `_try_classification_csv()` |
+
+### Frontend
+
+| Component | File | Purpose |
+|-----------|------|---------|
+| `ClassificationLabel` | `src/components/grid/classification-label.tsx` | Class label pill overlay for grid cells and modal |
+| `ClassificationEvaluationPanel` | `src/components/stats/classification-eval-panel.tsx` | Accuracy/F1 metrics display with confusion matrix |
+| `ClassificationPerClassTable` | `src/components/stats/classification-per-class-table.tsx` | Per-class P/R/F1/Support table |
+
+### Modified Components (Existing Files)
+
+| Component | File | What Changes |
+|-----------|------|-------------|
+| `DuckDBRepo.initialize_schema` | `duckdb_repo.py` | Add `dataset_type` column, make bbox nullable |
+| `FolderScanner` | `folder_scanner.py` | Add classification layout detection methods |
+| `IngestionService` | `services/ingestion.py` | Parser dispatch by format (registry pattern) |
+| `DatasetResponse` | `models/dataset.py` | Add `dataset_type: str = "detection"` field |
+| `AnnotationResponse` | `models/annotation.py` | Make bbox fields `Optional[float] = None` |
+| `AnnotationCreate` | `models/annotation.py` | Make bbox fields optional |
+| `BaseParser` | `ingestion/base_parser.py` | Relax bbox requirement in docstring |
+| `get_evaluation` router | `routers/statistics.py` | Branch on dataset_type |
+| `get_dataset_statistics` router | `routers/statistics.py` | Adjust summary labels for classification |
+| `AnnotationOverlay` | `annotation-overlay.tsx` | Conditional bbox vs label rendering |
+| `GridCell` | `grid-cell.tsx` | Pass `datasetType` prop to overlay |
+| `SampleModal` | `sample-modal.tsx` | Conditional annotation display, hide bbox editing for classification |
+| `StatsDashboard` | `stats-dashboard.tsx` | Route to correct evaluation panel |
+| `EvaluationPanel` | `evaluation-panel.tsx` | Branch on dataset type |
+| `AnnotationList` | `annotation-list.tsx` | Hide bbox columns for classification |
+| `ScanResults` UI | `scan-results.tsx` | Show correct format badge |
+| `PredictionImportDialog` | `prediction-import-dialog.tsx` | Support classification prediction format |
+| `Dataset` type | `types/dataset.ts` | Add `dataset_type` field |
+| `Annotation` type | `types/annotation.ts` | Make bbox fields optional (`number \| null`) |
+| `useEvaluation` hook | `hooks/use-evaluation.ts` | Handle union response type |
+| `useFilteredEvaluation` hook | `hooks/use-filtered-evaluation.ts` | Handle classification eval response |
 
 ---
 
-## Feature 5: Error Triage Workflow
+## Patterns to Follow
 
-### Current Error Analysis State
+### Pattern 1: Type Discriminator Threading
 
-The error analysis system already exists (`error_analysis.py` service, `error-analysis-panel.tsx` component). It categorizes detections into TP, Hard FP, Label Error, and FN. However, it is **read-only** -- there is no way to:
-- Tag individual errors (confirm/dismiss/flag for review)
-- Highlight errors while dimming non-errors in the grid
-- Rank "worst" images by error severity
+**What:** Pass `dataset_type` as a prop from the top-level dataset query down to components that need conditional behavior. Never re-fetch it inside child components.
 
-### Triage Workflow Architecture
+**When:** Any component that renders differently for detection vs classification.
 
-```
-Error Triage Flow
-=================
-
-error-analysis-panel.tsx (EXISTING -- add triage actions)
-  |
-  +-- ErrorSamplesGrid (EXISTING -- add "Tag as reviewed" button)
-  |
-  +-- TriageActionBar (NEW component)
-  |     |
-  |     +-- "Mark as FP" / "Mark as TP" / "Mark as Mistake" buttons
-  |     +-- "Highlight errors only" toggle
-  |     +-- "Rank worst images" button
-  |
-  +-- useTriageStore (NEW Zustand store -- 4th store)
-        |
-        +-- triageLabels: Map<string, TriageLabel>    // annotation_id -> label
-        +-- highlightMode: "all" | "errors_only"
-        +-- worstImagesRanking: ScoredSample[]
-        +-- setTriageLabel(annotationId, label)
-        +-- toggleHighlightMode()
+**Why:** Single fetch, single source of truth. Components remain pure.
+
+```typescript
+// Page level: fetch once
+const { data: dataset } = useDataset(datasetId);
+
+// Thread to children
+<ImageGrid datasetType={dataset.dataset_type} ... />
+<StatsDashboard datasetType={dataset.dataset_type} ... />
+<SampleModal datasetType={dataset.dataset_type} ... />
 ```
 
-### New DuckDB Table: `triage_labels`
+### Pattern 2: Parser Registry for Ingestion Dispatch
 
-The triage labels need to persist. Add a new table:
+**What:** Map format strings to parser classes instead of hardcoding `COCOParser()`.
 
-```sql
-CREATE TABLE IF NOT EXISTS triage_labels (
-    annotation_id   VARCHAR NOT NULL,
-    dataset_id      VARCHAR NOT NULL,
-    label           VARCHAR NOT NULL,     -- 'confirmed', 'dismissed', 'needs_review', 'mistake'
-    created_at      TIMESTAMP DEFAULT current_timestamp
-)
-```
+**When:** `IngestionService` creates a parser for ingestion.
 
-### Backend Integration Points
+```python
+PARSER_REGISTRY: dict[str, type[BaseParser]] = {
+    "coco": COCOParser,
+    "classification_folders": ClassificationFolderParser,
+    "classification_csv": ClassificationCSVParser,
+}
 
-**Files to create:**
-| File | Purpose |
-|------|---------|
-| `app/routers/triage.py` | New router: `POST /triage/label`, `GET /triage/labels`, `GET /triage/worst-images` |
-| `app/models/triage.py` | Pydantic models: `TriageLabelRequest`, `TriageLabelResponse`, `ScoredSample` |
-| `app/services/triage_service.py` | Triage label CRUD + "worst images" ranking algorithm |
+# In ingest_with_progress():
+parser_class = PARSER_REGISTRY.get(format)
+if parser_class is None:
+    raise ValueError(f"Unsupported format: {format}")
+parser = parser_class(batch_size=1000)
+```
 
-**Files to modify:**
-| File | Change |
-|------|--------|
-| `app/repositories/duckdb_repo.py` | Add `triage_labels` table creation in `initialize_schema()` |
-| `app/main.py` | Add `app.include_router(triage.router)` |
-| `app/dependencies.py` | Add `get_triage_service()` dependency |
+### Pattern 3: Evaluation Strategy Selection
 
-### "Worst Images" Ranking Algorithm
+**What:** The evaluation router reads `dataset_type` and dispatches to the correct evaluation function. Each function returns its own response model.
 
-The ranking combines multiple error signals into a single score:
+**When:** Evaluation endpoint is called.
 
 ```python
-# app/services/triage_service.py
-
-def rank_worst_images(
-    cursor: DuckDBPyConnection,
-    dataset_id: str,
-    source: str,
-    limit: int = 50,
-) -> list[ScoredSample]:
-    """Rank images by combined error severity score.
-
-    Score = (2 * hard_fp_count) + (3 * label_error_count) + (1 * fn_count)
-            + (0.5 * low_confidence_count) - (0.1 * tp_count)
-
-    Higher score = worse image (more problems).
-    """
-    # Use existing error_analysis.categorize_errors() to get per-sample breakdown
-    # Then aggregate and sort
-    ...
+if dataset_type == "classification":
+    return compute_classification_evaluation(cursor, dataset_id, source, conf_threshold, split)
+else:
+    return compute_evaluation(cursor, dataset_id, source, iou_threshold, conf_threshold, split)
 ```
 
-### Frontend Integration Points
-
-**Files to create:**
-| File | Purpose |
-|------|---------|
-| `frontend/src/stores/triage-store.ts` | Zustand store for triage state (5th store) |
-| `frontend/src/components/stats/triage-action-bar.tsx` | Triage controls and actions |
-| `frontend/src/components/stats/worst-images-panel.tsx` | Ranked worst images display |
-| `frontend/src/hooks/use-triage.ts` | TanStack Query hooks for triage API |
-| `frontend/src/types/triage.ts` | TypeScript types |
-
-**Files to modify:**
-| File | Change |
-|------|--------|
-| `frontend/src/components/stats/error-analysis-panel.tsx` | Add triage action buttons to error samples |
-| `frontend/src/components/stats/error-samples-grid.tsx` | Add per-sample triage label badges |
-| `frontend/src/components/grid/grid-cell.tsx` | Support highlight/dim mode from triage store |
-| `frontend/src/stores/ui-store.ts` | Add `highlightMode: "all" \| "errors_only"` state |
-
-### Highlight/Dim Mode in Grid
-
-When triage highlight mode is active, grid cells for non-error images get reduced opacity:
-
-```tsx
-// grid-cell.tsx modification
-const triageHighlight = useTriageStore((s) => s.highlightMode);
-const isError = useTriageStore((s) => s.errorSampleIds.has(sample.id));
-
-const opacity = triageHighlight === "errors_only" && !isError ? 0.2 : 1.0;
-
-return (
-  <div style={{ opacity }} className="...">
-    ...
-  </div>
-);
-```
+### Pattern 4: One Annotation Per Sample for Classification
+
+**What:** Classification datasets have exactly one ground-truth annotation per sample (the class label). Predictions also have one annotation per sample (the predicted class with confidence). This is enforced by the parser, not by the schema.
 
-### Build Order Implication
+**When:** Classification ingestion and evaluation.
 
-Error triage depends on the existing error analysis system (already built). It extends the stats dashboard. Independent of Docker, auth, smart ingestion, and annotation editing.
+**Why this matters:** The evaluation service can safely do `GROUP BY sample_id` and take the first row, rather than needing to handle multiple annotations per sample.
 
 ---
 
-## Feature 6: Keyboard Shortcuts
+## Anti-Patterns to Avoid
 
-### Architecture Decision: react-hotkeys-hook
+### Anti-Pattern 1: Separate Tables for Each Dataset Type
 
-**Recommendation: Use `react-hotkeys-hook` (v5.x) library** rather than building custom keyboard handling.
+**What:** Creating `classification_annotations`, `detection_annotations`, etc.
 
-Rationale:
-1. Actively maintained (last published 9 days ago as of research date)
-2. Lightweight (~4KB)
-3. Supports scoped shortcuts (component-level) and global shortcuts
-4. Works with React 19 and Next.js 16
-5. Handles modifier keys, key combinations, and key sequences
-6. Prevents shortcuts from firing when user is typing in inputs
+**Why bad:** Every query in the codebase touches the `annotations` table. Statistics (`GROUP BY category_name`), filtering, batch fetch, triage -- all would need parallel implementations. The codebase has ~15 queries against the annotations table across 6 services.
 
-### Integration Point: Global vs Component-Level Shortcuts
+**Instead:** Nullable bbox columns in the existing table. Classification rows have NULL bbox.
 
-```
-Shortcut Architecture
-=====================
-
-Global shortcuts (active everywhere):
-  ?           -> Show shortcut help modal
-  Escape      -> Close any open modal / exit edit mode / clear selection
-  /           -> Focus search input
-  g           -> Switch to Grid tab
-  s           -> Switch to Statistics tab
-  e           -> Switch to Embeddings tab
-
-Component-level shortcuts (active when component is focused):
-
-  Grid View:
-    j/k       -> Navigate samples (down/up)
-    Enter     -> Open detail modal for focused sample
-    x         -> Toggle selection mode
-    Shift+A   -> Select all visible
-
-  Detail Modal:
-    Left/Right arrow -> Previous/next sample
-    d          -> Delete annotation (in edit mode)
-    Ctrl+S     -> Save annotation edits
-    Escape     -> Close modal / cancel edit
-
-  Error Triage:
-    1          -> Mark as confirmed TP
-    2          -> Mark as needs review
-    3          -> Mark as mistake
-    h          -> Toggle highlight mode
-```
+### Anti-Pattern 2: Repurposing Detection Response Fields
 
-### Integration Points
+**What:** Stuffing accuracy into `map50`, precision into `map75`, etc. to avoid a new response model.
 
-**New npm dependency:**
-```
-npm install react-hotkeys-hook
-```
+**Why bad:** The frontend would need to know that `map50` really means "accuracy" when `dataset_type === "classification"`. Field names become lies. API consumers are confused.
 
-**Files to create:**
-| File | Purpose |
-|------|---------|
-| `frontend/src/hooks/use-keyboard-shortcuts.ts` | Central shortcut registration hook |
-| `frontend/src/components/shortcuts/shortcut-help-modal.tsx` | Help modal showing all available shortcuts |
-| `frontend/src/lib/shortcuts.ts` | Shortcut definitions map (key -> action -> description) |
-
-**Files to modify:**
-| File | Change |
-|------|--------|
-| `frontend/src/app/datasets/[datasetId]/page.tsx` | Register global shortcuts (tab switching, search focus) |
-| `frontend/src/components/grid/image-grid.tsx` | Register grid navigation shortcuts (j/k, Enter, x) |
-| `frontend/src/components/detail/sample-modal.tsx` | Register modal shortcuts (arrows, Escape, d, Ctrl+S) |
-| `frontend/src/components/stats/error-analysis-panel.tsx` | Register triage shortcuts (1/2/3, h) |
-| `frontend/src/stores/ui-store.ts` | Add `shortcutHelpOpen: boolean` state |
-| `frontend/package.json` | Add `react-hotkeys-hook` dependency |
-
-### Implementation Pattern
+**Instead:** Separate response models. The frontend discriminates on the response shape.
 
-```typescript
-// frontend/src/hooks/use-keyboard-shortcuts.ts
-import { useHotkeys } from 'react-hotkeys-hook';
-import { useUIStore } from '@/stores/ui-store';
-
-export function useGlobalShortcuts() {
-  const setActiveTab = useUIStore((s) => s.setActiveTab);
-  const openShortcutHelp = useUIStore((s) => s.setShortcutHelpOpen);
-
-  // ? -> show help
-  useHotkeys('shift+/', () => openShortcutHelp(true), { preventDefault: true });
-
-  // g/s/e -> tab switching
-  useHotkeys('g', () => setActiveTab('grid'), { preventDefault: true });
-  useHotkeys('s', () => setActiveTab('statistics'), { preventDefault: true });
-  useHotkeys('e', () => setActiveTab('embeddings'), { preventDefault: true });
-
-  // / -> focus search
-  useHotkeys('/', () => {
-    document.querySelector<HTMLInputElement>('[data-shortcut-target="search"]')?.focus();
-  }, { preventDefault: true });
-}
-```
+### Anti-Pattern 3: Making the Detection Evaluation Handle Both Types
 
-### Build Order Implication
+**What:** Adding `if dataset_type == "classification"` branches inside `compute_evaluation()`, `_load_detections()`, `_build_detections()`, etc.
 
-Keyboard shortcuts are the most independent feature. They layer on top of existing components without changing data flow or APIs. Can be built last or in parallel with any other feature.
+**Why bad:** The detection evaluation is deeply spatial -- every helper function deals with bounding boxes, xyxy conversion, IoU matrices. Grafting classification logic into this creates an unmaintainable chimera.
 
----
+**Instead:** A separate, clean `compute_classification_evaluation()` function. Classification evaluation is simple (array comparison, confusion matrix) -- it does not need the `supervision` library or any IoU machinery.
 
-## New Components Summary
-
-### Backend (New Files)
-
-| File | Feature | Type |
-|------|---------|------|
-| `Dockerfile.backend` | Docker | Build |
-| `Dockerfile.frontend` | Docker | Build |
-| `docker-compose.yml` | Docker | Config |
-| `nginx/default.conf` | Docker | Config |
-| `.env.docker` | Docker | Config |
-| `app/auth.py` | Auth | Module |
-| `app/services/folder_scanner.py` | Smart Ingestion | Service |
-| `app/models/scan.py` | Smart Ingestion | Model |
-| `app/routers/ingestion.py` | Smart Ingestion | Router |
-| `app/routers/annotations.py` | Annotation Editing | Router |
-| `app/routers/triage.py` | Error Triage | Router |
-| `app/models/triage.py` | Error Triage | Model |
-| `app/services/triage_service.py` | Error Triage | Service |
-
-### Frontend (New Files)
-
-| File | Feature | Type |
-|------|---------|------|
-| `src/app/ingest/page.tsx` | Smart Ingestion | Page |
-| `src/components/ingest/scan-results.tsx` | Smart Ingestion | Component |
-| `src/components/ingest/import-progress.tsx` | Smart Ingestion | Component |
-| `src/components/detail/annotation-editor.tsx` | Annotation Editing | Component |
-| `src/components/detail/edit-toolbar.tsx` | Annotation Editing | Component |
-| `src/stores/annotation-edit-store.ts` | Annotation Editing | Store |
-| `src/components/stats/triage-action-bar.tsx` | Error Triage | Component |
-| `src/components/stats/worst-images-panel.tsx` | Error Triage | Component |
-| `src/stores/triage-store.ts` | Error Triage | Store |
-| `src/hooks/use-scan.ts` | Smart Ingestion | Hook |
-| `src/hooks/use-ingest.ts` | Smart Ingestion | Hook |
-| `src/hooks/use-triage.ts` | Error Triage | Hook |
-| `src/components/shortcuts/shortcut-help-modal.tsx` | Shortcuts | Component |
-| `src/hooks/use-keyboard-shortcuts.ts` | Shortcuts | Hook |
-| `src/lib/shortcuts.ts` | Shortcuts | Lib |
-
-### Modified Files
-
-| File | Features Affecting It |
-|------|----------------------|
-| `app/config.py` | Docker (qdrant_url), Auth (credentials) |
-| `app/main.py` | Auth (router dependencies), New routers (ingestion, annotations, triage) |
-| `app/dependencies.py` | Smart Ingestion (folder_scanner), Error Triage (triage_service) |
-| `app/repositories/duckdb_repo.py` | Error Triage (triage_labels table) |
-| `app/services/similarity_service.py` | Docker (conditional Qdrant client mode) |
-| `app/services/ingestion.py` | Smart Ingestion (split parameter) |
-| `app/ingestion/coco_parser.py` | Smart Ingestion (split parameter) |
-| `app/models/dataset.py` | Smart Ingestion (split field on IngestRequest) |
-| `app/models/annotation.py` | Annotation Editing (update models) |
-| `frontend/next.config.ts` | Docker (standalone output) |
-| `frontend/package.json` | Annotation Editing (react-konva), Shortcuts (react-hotkeys-hook) |
-| `frontend/src/lib/api.ts` | Auth (credentials header) |
-| `frontend/src/stores/ui-store.ts` | Shortcuts (help modal), Triage (highlight mode) |
-| `frontend/src/components/detail/sample-modal.tsx` | Annotation Editing (edit mode toggle), Shortcuts |
-| `frontend/src/components/stats/error-analysis-panel.tsx` | Error Triage (action buttons) |
-| `frontend/src/components/stats/error-samples-grid.tsx` | Error Triage (label badges) |
-| `frontend/src/components/grid/grid-cell.tsx` | Error Triage (highlight/dim mode) |
-| `frontend/src/app/datasets/[datasetId]/page.tsx` | Shortcuts (global registration) |
-| `frontend/src/components/grid/image-grid.tsx` | Shortcuts (grid navigation) |
+### Anti-Pattern 4: Frontend Feature Detection Instead of Type Discrimination
 
----
-
-## Data Flow Changes
+**What:** Checking `if annotations[0]?.bbox_x === null` to determine rendering mode.
 
-### New DuckDB Tables
+**Why bad:** Fragile. Fails on samples with no annotations. Requires loading annotations before knowing how to render. Creates subtle bugs.
 
-| Table | Feature | Schema |
-|-------|---------|--------|
-| `triage_labels` | Error Triage | `annotation_id VARCHAR, dataset_id VARCHAR, label VARCHAR, created_at TIMESTAMP` |
+**Instead:** Use `dataset_type` from the dataset metadata (loaded once, always available). The type determines rendering, not the data shape.
 
-### New API Endpoints
-
-| Method | Path | Feature | SSE? |
-|--------|------|---------|------|
-| `POST` | `/ingestion/scan` | Smart Ingestion | No |
-| `PATCH` | `/annotations/batch` | Annotation Editing | No |
-| `DELETE` | `/annotations/{id}` | Annotation Editing | No |
-| `POST` | `/triage/label` | Error Triage | No |
-| `GET` | `/triage/labels?dataset_id=X` | Error Triage | No |
-| `GET` | `/triage/worst-images?dataset_id=X` | Error Triage | No |
+---
 
-### New Zustand Stores
+## Scalability Considerations
 
-| Store | Feature | Slices |
-|-------|---------|--------|
-| `annotation-edit-store.ts` | Annotation Editing | editingAnnotations, selectedId, isDirty, save/discard actions |
-| `triage-store.ts` | Error Triage | triageLabels map, highlightMode, worstImagesRanking |
+| Concern | At 1K images | At 100K images | At 1M images |
+|---------|-------------|---------------|-------------|
+| Classification annotations (1 per image) | 1K rows, trivial | 100K rows, fast | 1M rows, may want index on (dataset_id, sample_id) |
+| Confusion matrix computation | In-memory numpy, instant | In-memory numpy, <1s | In-memory numpy, ~2s (1M label comparisons) |
+| Folder-of-folders ingestion (many small files) | Fast | Moderate (100K filesystem stats) | Slow -- but same as image loading |
+| NULL bbox storage | None (DuckDB columnar compression) | None | None -- NULLs compress to near-zero in columnar |
+| Statistics queries on mixed tables | No impact | No impact | No impact -- DuckDB predicate pushdown handles it |
 
-Total stores: 3 existing + 2 new = **5 Zustand stores**
+Classification datasets are strictly simpler than detection: 1 annotation per image, no spatial matching, no IoU. The existing architecture handles the scale without modification.
 
 ---
 
 ## Suggested Build Order
 
-Based on dependency analysis:
+Build order follows data-flow dependencies: schema before parsers, parsers before frontend display, and evaluation only after the data it consumes is available.
 
-```
-Phase 1: Docker Deployment
-  - Dockerfile.backend + Dockerfile.frontend
-  - docker-compose.yml (backend, frontend, qdrant, nginx)
-  - Qdrant client mode switch (local vs server)
-  - Next.js standalone output
-  - Nginx reverse proxy with SSE support
-  DEPENDS ON: nothing
-  ENABLES: cloud deployment, auth
-
-Phase 2: Single-User Auth
-  - app/auth.py (HTTPBasic + verify_credentials)
-  - Router-level dependency injection
-  - Frontend credential handling
-  - SSE auth via cookies
-  DEPENDS ON: Docker (for HTTPS context)
-  ENABLES: secure cloud access
-
-Phase 3: Smart Ingestion UI
-  - FolderScanner service
-  - /ingestion/scan endpoint
-  - Ingestion wizard page + components
-  - Split detection in existing parser
-  DEPENDS ON: Auth (new endpoints need it)
-  ENABLES: no-code dataset import
-
-Phase 4: Error Triage Workflow
-  - triage_labels table
-  - Triage API endpoints
-  - Triage Zustand store
-  - Worst images ranking
-  - Grid highlight/dim mode
-  DEPENDS ON: existing error analysis (already built)
-  CAN PARALLEL WITH: Phase 3
-
-Phase 5: Annotation Editing
-  - react-konva integration in detail modal
-  - AnnotationEditor component with Transformer
-  - AnnotationEditStore (new Zustand store)
-  - PATCH /annotations/batch endpoint
-  DEPENDS ON: nothing new (builds on existing modal)
-  CAN PARALLEL WITH: Phases 3, 4
-
-Phase 6: Keyboard Shortcuts
-  - react-hotkeys-hook integration
-  - Global and component-level shortcuts
-  - Shortcut help modal
-  DEPENDS ON: all other UI features complete (shortcuts reference them)
-  BUILD LAST: shortcuts layer on top of everything
-```
+| Order | What | Dependencies | Rationale |
+|-------|------|--------------|-----------|
+| 1 | Schema migration + API model updates | None | Foundation: must exist before anything else |
+| 2 | Classification folder parser + scanner detection | Step 1 | End-to-end ingestion works |
+| 3 | Frontend conditional rendering (grid + modal) | Step 2 | Users can see classification datasets |
+| 4 | Classification evaluation service + frontend | Step 3 | Metrics for classification predictions |
+| 5 | Classification prediction import | Step 1 | Import predictions for evaluation |
+| 6 | CSV parser + additional format support | Step 1 | Secondary ingestion format, lower priority |
 
-### Dependency Graph
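+**Critical path:** Steps 1 -> 2 -> 3 -> 4 are sequential. Steps 5 and 6 can proceed in parallel after step 1. Note that step 4 can be built without step 5, but it cannot be exercised end-to-end until step 5 has landed, because evaluation compares imported predictions against ground truth.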
 
-```
-Phase 1 (Docker)
-    |
-    v
-Phase 2 (Auth)
-    |
-    v
-Phase 3 (Smart Ingestion)
-
-Phase 4 (Error Triage)     -- parallel, independent
-Phase 5 (Annotation Edit)  -- parallel, independent
-Phase 6 (Shortcuts)         -- last, references all UI
-```
+**What stays unchanged (no work needed):**
+- Embeddings + scatter plot (sample-level, no bbox dependency)
+- Similarity search (sample-level, no bbox dependency)
+- Saved views (filter state, no bbox dependency)
+- Tags / triage (annotation-level, uses category_name not bbox)
+- Thumbnail generation (image-level, no annotation dependency)
 
 ---
 
 ## Sources
 
-### HIGH Confidence (Official Documentation + Codebase Analysis)
-- DataVisor codebase: `app/main.py`, `app/dependencies.py`, `app/config.py`, `app/repositories/duckdb_repo.py`, `app/services/similarity_service.py`, `app/services/ingestion.py`, `app/services/error_analysis.py` -- verified existing architecture
-- DataVisor codebase: `frontend/src/stores/*.ts`, `frontend/src/lib/api.ts`, `frontend/src/components/**/*.tsx` -- verified frontend architecture
-- [FastAPI Dependency Injection vs Middleware (GitHub Discussion #8867)](https://github.com/fastapi/fastapi/discussions/8867) -- DI recommended for auth
-- [FastAPI Auth with Dependency Injection (PropelAuth)](https://www.propelauth.com/post/fastapi-auth-with-dependency-injection) -- DI pattern reference
-- [Konva Transformer for React](https://konvajs.org/docs/react/Transformer.html) -- select/resize/rotate shapes
-- [Konva Drag and Resize Limits](https://konvajs.org/docs/select_and_transform/Resize_Limits.html) -- boundBoxFunc for clamping
-- [Qdrant Installation Docker](https://qdrant.tech/documentation/guides/installation/) -- Docker service configuration
-- [Qdrant Python Client](https://python-client.qdrant.tech/qdrant_client.qdrant_client) -- local mode vs server mode
-- [react-hotkeys-hook (npm)](https://www.npmjs.com/package/react-hotkeys-hook) -- v5.2.4, actively maintained
-- [DuckDB Docker Container](https://duckdb.org/docs/stable/operations_manual/duckdb_docker) -- volume mount patterns
-- [Next.js Standalone Output](https://nextjs.org/docs/pages/api-reference/config/next-config-js/output) -- Docker-optimized builds
-
-### MEDIUM Confidence (WebSearch + Cross-Verification)
-- [FastAPI + Next.js Docker examples (GitHub)](https://github.com/YsrajSingh/nextjs-fastapi-docker) -- compose topology reference
-- [Qdrant Docker Compose configuration (DeepWiki)](https://deepwiki.com/qdrant/qdrant_demo/2.1-quick-start-with-docker-compose) -- service definition
-- [Building canvas-based editors with Konva](https://www.alikaraki.me/blog/canvas-editors-konva) -- production patterns for drag/resize
-- [Next.js Dockerization 2025 guide](https://medium.com/front-end-world/dockerizing-a-next-js-application-in-2025-bacdca4810fe) -- multi-stage build best practices
-
----
-*Architecture research for: DataVisor v1.1 Feature Integration*
-*Researched: 2026-02-12*
-*Grounded in: 12,720 LOC codebase analysis + official documentation*
+- **Direct codebase analysis:** `duckdb_repo.py` (schema), `evaluation.py` (metrics), `coco_parser.py` + `base_parser.py` (ingestion), `folder_scanner.py` (detection), `annotation-overlay.tsx` + `grid-cell.tsx` + `sample-modal.tsx` (frontend rendering), `statistics.py` (API), `evaluation-panel.tsx` (frontend metrics display)
+- **DuckDB ALTER TABLE:** Need to verify `ALTER COLUMN DROP NOT NULL` support in current version -- MEDIUM confidence on exact syntax
+- **ImageNet folder-of-folders convention:** Standard classification dataset layout -- HIGH confidence
+- **scikit-learn classification metrics patterns:** Standard accuracy/precision/recall/F1 computation -- HIGH confidence (though we use pure numpy, not sklearn)
diff --git a/.planning/research/FEATURES.md b/.planning/research/FEATURES.md
index ee1d75b..e69f2e2 100644
--- a/.planning/research/FEATURES.md
+++ b/.planning/research/FEATURES.md
@@ -1,1038 +1,428 @@
-# Feature Gap Analysis: DataVisor v1.1 vs FiftyOne & Encord
+# Feature Landscape: Classification Dataset Support
 
-**Domain:** Computer Vision Dataset Introspection / Exploration Tooling
-**Researched:** 2026-02-12
-**Mode:** Competitive analysis (FiftyOne + Encord vs DataVisor)
-**Overall Confidence:** HIGH (grounded in official documentation from both platforms)
+**Domain:** Single-label image classification dataset introspection
+**Researched:** 2026-02-18
+**Scope:** NEW features needed for classification support -- does NOT repeat existing detection features
 
 ---
 
-## How to Read This Document
+## How Classification Differs from Detection
 
-Each feature gap is categorized by:
-- **Priority:** Table Stakes (expected by CV engineers) / Differentiator (competitive edge) / Nice-to-Have (marginal value for v1.1)
-- **Complexity:** Low (< 1 day) / Medium (1-3 days) / High (3+ days)
-- **Depends On:** Existing DataVisor v1.0 features or new features needed first
-- **Competitor Reference:** Specific documentation or behavior observed
+Understanding these differences drives every feature decision below.
 
----
-
-## 1. Dataset Ingestion & Format Support
-
-### 1A. Multi-Format Import (YOLO, VOC, KITTI, TFRecords, BDD)
-
-**What competitors do:**
-
-FiftyOne's `fo.Dataset.from_dir()` supports 15+ formats out of the box:
-- COCO Detection, VOC Detection, YOLOv4, YOLOv5, KITTI Detection
-- TFRecords (classification + detection), BDD100K, CVAT (image + video)
-- OpenLABEL, DICOM, GeoJSON, GeoTIFF
-- Image/Video classification directory trees
-- FiftyOne native format
-
-The API requires explicit `dataset_type` specification -- there is no automatic format detection. Example:
-```python
-dataset = fo.Dataset.from_dir(
-    dataset_dir="/path/to/data",
-    dataset_type=fo.types.COCODetectionDataset,
-    label_field="ground_truth",
-)
-```
-
-Encord ingestion is cloud-native: users register files from AWS S3, GCS, Azure, or OTC OSS buckets. Local upload is supported but the primary workflow is cloud storage integration via SDK. Encord's SDK enables programmatic ETL pipelines for ingestion.
+| Aspect | Detection (current) | Classification (new) |
+|--------|---------------------|---------------------|
+| **Label granularity** | Per-annotation (many per image) | Per-image (one label per image) |
+| **Spatial info** | Bounding boxes (x, y, w, h) | None -- label applies to entire image |
+| **Matching logic** | IoU-based greedy matching | Direct string comparison (GT label vs predicted label) |
+| **Error types** | TP, Hard FP, Label Error, FN | Correct, Misclassified (with confused pair) |
+| **Key metrics** | mAP, AP@50/75, per-class AP | Accuracy, macro/micro F1, per-class precision/recall |
+| **Confusion matrix** | Background row/col for unmatched | No background -- every image has exactly one GT and one prediction |
+| **Display** | SVG bbox overlays on thumbnails | Text badge/label on thumbnail corner |
+| **Ingestion format** | COCO JSON (images + annotations arrays) | JSONL (one JSON object per line: image, prefix, suffix) |
 
-**What DataVisor has:** COCO format only via streaming ijson parser.
+### Classification Matching (the "IoU equivalent")
 
-**Gap:** DataVisor only supports COCO. CV engineers commonly have datasets in YOLO (especially YOLOv5/v8 from Ultralytics) and VOC (legacy Pascal datasets). Missing YOLO support is the most critical gap -- it is the most popular training format today.
+In classification, matching is trivially simple: compare the predicted label to the ground truth label for each image. There is no spatial matching, no IoU threshold. The evaluation reduces to a standard confusion matrix.
 
-**Priority:** TABLE STAKES -- YOLO and VOC are the two most common formats after COCO. Missing them means users must convert externally before loading.
+FiftyOne's `evaluate_classifications()` supports three methods:
+- **"simple"** (default): Direct GT label vs prediction label comparison. Each sample is marked correct/incorrect. This is what DataVisor needs.
+- **"top-k"**: Prediction is correct if GT label appears in top-k predicted classes. Requires multi-class probability output (not applicable to DataVisor's single-prediction format).
+- **"binary"**: Binary classification with configurable positive class. Missing labels treated as negative class.
 
-**Complexity:** MEDIUM per format. Each format needs: (a) parser that maps to DataVisor's internal schema, (b) path resolution for images/labels, (c) tests with real-world datasets.
-
-- YOLO: Parse `dataset.yaml` for class names, read `.txt` label files (class_id cx cy w h), resolve image paths from `images/` directory structure
-- VOC: Parse XML annotation files with `<object>` elements, resolve image paths from `JPEGImages/`
-- KITTI: Parse space-delimited `.txt` files with 15 columns per object
-
-**Depends on:** Existing ingestion pipeline. DataVisor's streaming parser architecture should be extended with a format-detection step before parsing begins.
-
-**Recommendation:** Add YOLO and VOC for v1.1. KITTI and others can wait for v1.2+. Design a `FormatDetector` class that inspects folder contents (presence of `*.yaml`, `*.xml`, `*.json`) and recommends the parser.
+For DataVisor's use case (single-label classification with one prediction per image), the "simple" method is the only one that matters.
 
 ---
 
-### 1B. Train/Val/Test Split Handling
-
-**What competitors do:**
-
-FiftyOne handles splits via the `split` parameter on `add_dir()`:
-```python
-dataset = fo.Dataset(name)
-for split in ["train", "val", "test"]:
-    dataset.add_dir(
-        dataset_dir=dataset_dir,
-        dataset_type=fo.types.YOLOv5Dataset,
-        split=split,
-        tags=split,  # Tags each sample with its split name
-    )
-```
-
-This means: (a) each split is loaded separately, (b) samples are tagged with their split, (c) users can filter by split tag in the App. FiftyOne also has `Brain.compute_leaky_splits()` to detect data leakage between train/test.
+## Table Stakes
 
-Encord handles splits at the project level -- datasets are created per split, and projects reference specific datasets. The platform does not auto-detect folder structure.
+Features users expect when inspecting classification datasets. Missing any of these would feel like a broken product.
 
-**What DataVisor has:** No split awareness. The ingestion UI takes a single annotations file and image directory.
+### TS-1: JSONL Ingestion Parser
 
-**Gap:** Most real-world datasets have train/val/test directories. Users must currently load each split separately and cannot filter by split. There is no detection of the common `train/`, `val/`, `test/` folder pattern.
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | Classification datasets from Roboflow export as JSONL. This is the target format. |
+| **Complexity** | Medium |
+| **Depends on** | `BaseParser` abstract class (existing), `DuckDBRepo` schema (needs extension) |
 
-**Priority:** TABLE STAKES -- every real dataset has splits. Without this, the first thing a user does after loading is wonder "where are my val images?"
+**What to build:** A `ClassificationParser` that reads JSONL files where each line is `{"image":"filename.jpg","prefix":"prompt","suffix":"class_label"}`. The parser maps `suffix` to `category_name` and stores one annotation per image with sentinel bbox values (0,0,width,height -- full image) or a new schema approach.
 
-**Complexity:** MEDIUM.
-- Auto-detect: Scan for `train/`, `val/`, `test/` subdirectories; check for YOLO's `dataset.yaml` split definitions
-- Tag on ingest: Add a `split` metadata field to each sample during ingestion
-- Filter: The existing sidebar filtering system handles this automatically once the field exists
+**Schema decision:** The current `annotations` table requires `bbox_x/y/w/h`. Two options:
+1. **Store with sentinel values** (bbox = full image dimensions). Simpler, avoids schema migration, but semantically wrong.
+2. **Add `task_type` column to datasets table** and make bbox columns nullable. Cleaner, enables task-aware rendering throughout the app.
 
-**Depends on:** Ingestion pipeline (existing). Sidebar filtering (existing -- works on any metadata field).
+Recommendation: Option 2 -- add `task_type VARCHAR DEFAULT 'detection'` to the `datasets` table. Keep the bbox columns as `DOUBLE` (they already are) but allow NULLs for classification rows. This is a small schema change (one new column plus relaxed nullability on the bbox columns) and pays forward for future task types (segmentation, etc.).
 
-**Recommendation:** During ingestion, scan the target directory for split subdirectories. If found, present them in the UI and let the user select which splits to load. Tag each sample with its split name. The existing metadata filtering will handle split-based browsing.
+**Folder scanner:** Extend `folder_scanner.py` to detect JSONL files alongside images. A JSONL file with `{"image":..., "suffix":...}` structure identifies a classification dataset. The existing split detection (`train/`, `valid/`, `test/` directories) works as-is since Roboflow classification exports use the same directory structure.
 
 ---
 
-### 1C. Smart Folder Detection UI
+### TS-2: Class Label Display on Thumbnails
 
-**What competitors do:**
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | Every classification tool shows the class label on the thumbnail. Without it, users see unlabeled images. |
+| **Complexity** | Low |
+| **Depends on** | Grid cell component (existing), dataset `task_type` field (TS-1) |
 
-FiftyOne requires Python code to load datasets -- there is no folder-detection UI. The user must know the format and write `fo.Dataset.from_dir(...)` with the correct type. This is a pain point evidenced by multiple GitHub issues about `from_dir()` failing on slightly non-standard folder structures (issues #1780, #1781, #1951).
+**What to build:** A `ClassificationBadge` component that replaces the `AnnotationOverlay` (SVG bboxes) when `task_type === 'classification'`. Renders as a text badge in the top-left corner of the thumbnail.
 
-Encord's workflow is: (1) register cloud storage, (2) create a dataset in the platform, (3) upload/sync files. It is guided but requires configuration.
+**How competitors do it:**
+- **Roboflow:** Classification label displayed as text in the top-left corner of the image with a semi-transparent colored background.
+- **FiftyOne:** Classification fields shown as text tags in the sample's sidebar panel, not overlaid on the image in grid view. Image-level labels appear as fields, not spatial overlays.
+- **Label Studio:** Text label below the image during annotation.
 
-Neither competitor has a "point at folder and auto-detect" experience.
+**Design decision:** Top-left corner badge with semi-transparent background, colored by class (using the existing `color-hash.ts`). Show GT label by default; when predictions exist, show both with GT solid and prediction as an outline/dashed badge below. This mirrors the existing convention where GT is solid stroke and predictions are dashed stroke.
 
-**What DataVisor has:** Manual file selection in the ingestion UI.
-
-**Gap:** There is an opportunity to leapfrog both competitors with a smart ingestion UI that: (a) accepts a root directory, (b) scans for annotation files and image directories, (c) infers the format, (d) detects splits, (e) shows a preview before import.
-
-**Priority:** DIFFERENTIATOR -- neither FiftyOne nor Encord does this well. FiftyOne forces Python; Encord forces cloud config. A "drag-and-drop folder" experience is genuinely better.
-
-**Complexity:** MEDIUM.
-- Directory scanner: Look for `*.json` (COCO), `*.yaml` + `*.txt` (YOLO), `*.xml` (VOC)
-- Preview: Show detected format, split count, image count, class count before import
-- Confirmation: Let user override detected format if wrong
-
-**Depends on:** Multi-format import (1A). Split handling (1B).
-
-**Recommendation:** Build a `DatasetDetector` service that returns a `DetectionResult` with: format type, annotation paths, image directories, splits found, sample counts. The frontend renders this as a confirmation dialog before ingestion begins.
+**When GT and prediction differ:** Show both badges stacked, with the prediction badge having a red tint or strikethrough to indicate misclassification. When they match, show a single green-tinted badge. This gives immediate visual signal without opening the detail modal.
 
 ---
 
-### 1D. Dataset Zoo / Pre-Built Datasets
-
-**What competitors do:**
-
-FiftyOne has a Dataset Zoo with one-line loading of 20+ benchmark datasets:
-```python
-import fiftyone.zoo as foz
-dataset = foz.load_zoo_dataset("coco-2017", split="validation")
-```
-Available datasets include COCO-2017, CIFAR-10/100, ImageNet, BDD100K, Open Images, Cityscapes, ActivityNet, KITTI, and a `quickstart` dataset with 200 samples for demos.
+### TS-3: Classification Evaluation Metrics
 
-Encord does not have a dataset zoo -- users bring their own data.
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | Accuracy, F1, precision, recall are the universal classification metrics. Every ML practitioner expects these. |
+| **Complexity** | Medium |
+| **Depends on** | Evaluation service (existing, needs classification branch), prediction import (existing) |
 
-**What DataVisor has:** No pre-built dataset loading.
+**What to build:** A `ClassificationEvaluationService` that computes:
 
-**Gap:** The quickstart experience matters. FiftyOne users can go from `pip install` to exploring a dataset in 30 seconds. DataVisor users must have their own COCO dataset ready.
+**Aggregate metrics:**
+- **Accuracy:** correct / total
+- **Macro F1:** unweighted average of per-class F1 scores (treats all classes equally)
+- **Micro F1:** equivalent to accuracy for single-label classification
+- **Weighted F1:** F1 weighted by class support (handles imbalance)
 
-**Priority:** NICE-TO-HAVE for v1.1 (a single demo dataset is sufficient). TABLE STAKES for onboarding/documentation purposes.
+**Per-class metrics:**
+- Precision, Recall, F1, Support (count of GT instances)
 
-**Complexity:** LOW. Bundle a small demo dataset (50-100 COCO images with annotations and predictions) for first-run experience. Not a full zoo.
+**Why these specific metrics:** For the jersey number dataset (43 classes, likely imbalanced), accuracy alone is misleading. Macro F1 exposes classes with poor performance regardless of their frequency. Weighted F1 gives the overall picture accounting for class sizes. Per-class precision/recall identifies which specific classes the model struggles with.
 
-**Depends on:** Nothing. Just needs a sample dataset bundled or downloadable.
+**Implementation:** Use `sklearn.metrics.classification_report()` and `sklearn.metrics.confusion_matrix()` rather than building from scratch. scikit-learn is believed to already be a transitive dependency (via supervision) -- confirm this against the lockfile before relying on it, and fall back to a small numpy implementation if it is not installed. Classification evaluation is dramatically simpler than detection evaluation -- no IoU, no confidence sweeping, no greedy matching. The entire evaluation is one confusion matrix computation.
 
-**Recommendation:** Ship a `quickstart` command or UI button that loads a bundled demo dataset. This is critical for documentation, demos, and first-time users. Defer a full dataset zoo indefinitely -- it is not core to the tool's value.
+**Response model:** New `ClassificationEvaluationResponse` alongside the existing detection `EvaluationResponse`. The router checks `task_type` and dispatches to the correct service.
 
 ---
 
-### 1E. Dataset Export
-
-**What competitors do:**
-
-FiftyOne exports to all the same formats it imports, via:
-```python
-dataset_or_view.export(
-    export_dir="/path/to/export",
-    dataset_type=fo.types.YOLOv5Dataset,
-    label_field="ground_truth",
-)
-```
-Key parameters: `export_media` (copy/move/symlink/omit), `abs_paths`, `classes` (explicit class list), `data_path`/`labels_path` (separate media and labels). Views can be exported -- so a filtered subset exports only matching samples.
+### TS-4: Classification Confusion Matrix
 
-Encord exports labels via SDK in JSON format and supports integration with training pipelines.
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | The confusion matrix is THE diagnostic tool for classification. It shows which classes get confused with which. |
+| **Complexity** | Low (existing confusion matrix component needs minor adaptation) |
+| **Depends on** | Confusion matrix component (existing), classification evaluation (TS-3) |
 
-**What DataVisor has:** No export functionality.
+**What to build:** Adapt the existing `ConfusionMatrix` component for classification.
 
-**Gap:** After users curate a dataset (filter, tag errors, exclude bad samples), they need to export the cleaned subset for training. Without export, the curation work is stranded inside DataVisor.
+**Key differences from detection confusion matrix:**
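+1. **No "background" row/column.** In classification, every evaluated image has exactly one GT label and one predicted label, so the matrix has no unmatched items. (Images that lack a prediction or a GT label are reported separately as "Missing prediction" / "Spurious prediction" in TS-5.)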
+2. **Simpler cell semantics.** Each cell (i, j) = count of images with GT class i predicted as class j. No IoU threshold.
+3. **No IoU threshold slider.** The existing evaluation panel has IoU/confidence threshold controls -- these should be hidden for classification datasets.
+4. **Confidence threshold still relevant.** If predictions have confidence scores, filtering by confidence can still be useful (exclude low-confidence predictions).
 
-**Priority:** TABLE STAKES -- the end-to-end workflow is: load -> explore -> curate -> export for training. Export completes the loop.
+**The existing `ConfusionMatrix` component and `use-confusion-cell.ts` hook already support click-to-filter** (clicking a cell shows the contributing samples). This works perfectly for classification -- clicking cell (i, j) shows all images where GT=class_i and prediction=class_j. The only change is the backend query: instead of IoU-based matching, do a simple SQL join on sample_id between GT and prediction annotations.
 
-**Complexity:** MEDIUM.
-- Export a DatasetView (filtered subset) to COCO, YOLO, or VOC format
-- Handle media: copy vs symlink vs manifest-only
-- Write annotation files in target format
-
-**Depends on:** Multi-format import (1A, for the format writers). Saved views / filtering (existing).
-
-**Recommendation:** Implement export for COCO and YOLO formats in v1.1. The current view (with all active filters) should be exportable. Support `copy` and `symlink` media modes. This closes the curation loop.
+**For 43 classes (jersey numbers):** The matrix will be 43x43. The existing component needs to handle this density well. Consider adding: (a) row/column sorting by error count, (b) a "most confused pairs" summary table showing the top-N off-diagonal cells. FiftyOne surfaces "most confused" pairs as a first-class concept and it is extremely useful.
 
 ---
 
-## 2. Annotation Management
-
-### 2A. In-App Annotation Editing (Move, Resize, Delete Bounding Boxes)
-
-**What competitors do:**
-
-FiftyOne does NOT have in-app annotation editing. It delegates to external tools (CVAT, Label Studio, Labelbox) via `dataset.annotate()`:
-```python
-anno_key = "corrections"
-view.annotate(
-    anno_key,
-    backend="cvat",
-    label_field="ground_truth",
-    allow_additions=True,
-    allow_deletions=True,
-    allow_spatial_edits=True,
-)
-# Later, after editing in CVAT:
-view.load_annotations(anno_key)
-```
-This is a roundtrip: FiftyOne -> CVAT -> FiftyOne. Annotations are not editable in the FiftyOne App itself.
+### TS-5: Classification Error Analysis
 
-Encord has a full-featured annotation editor with:
-- Bounding boxes, rotatable bounding boxes, polygons, polylines, keypoints, bitmasks, object primitives
-- Vertex management: add/remove/move vertices on polygons
-- Brush tool and eraser for freehand polygon refinement
-- Copy/paste labels across frames (Ctrl+C, Ctrl+V)
-- Undo/redo (Ctrl+Z, Ctrl+Shift+Z)
-- Merge and subtract polygons
-- SAM 2 model-assisted segmentation
-- Interpolation for frame-to-frame tracking
-- Bulk label operations: merge objects, mass-delete by class/confidence/frame range
-- Wacom tablet support
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | Users need to know not just aggregate metrics but which specific images are wrong and why. |
+| **Complexity** | Medium |
+| **Depends on** | Error analysis service (existing, needs classification branch), classification evaluation (TS-3) |
 
-**What DataVisor has:** Read-only annotation display. No editing.
+**What to build:** A `ClassificationErrorAnalysis` service that categorizes each image as:
 
-**Gap:** DataVisor's PROJECT.md scopes this as "quick corrections only, not CVAT replacement." This is the right call. The question is: what is the minimum viable annotation editing for an introspection tool?
+| Category | Detection Equivalent | Definition |
+|----------|---------------------|------------|
+| **Correct** | True Positive | GT label == predicted label |
+| **Misclassified** | Label Error | GT label != predicted label (the GT/predicted pair is recorded) |
+| **Missing prediction** | False Negative | Image has GT but no prediction |
+| **Spurious prediction** | False Positive | Image has prediction but no GT (rare for classification) |
 
-**Priority:** TABLE STAKES at the "quick correction" level. When a user spots a wrong bounding box during error triage, they should be able to fix it immediately without context-switching to CVAT.
+**No "Hard FP" category.** In detection, Hard FP means a prediction with no nearby GT box. In classification, there is no spatial component -- a wrong prediction is simply a misclassification. The error taxonomy is simpler.
 
-**Complexity:** HIGH for full editing. MEDIUM for the minimum viable set:
-- Delete a bounding box (click -> delete key)
-- Move a bounding box (drag)
-- Resize a bounding box (drag corners/edges)
-- Change class label (dropdown or hotkey)
-- Undo/redo (Ctrl+Z / Ctrl+Shift+Z)
+**Per-class error breakdown:** For each class, show: TP count, misclassified count (broken down by which class they were confused with), missed count. This is richer than the detection per-class table because we can show the confusion target.
 
-Full polygon editing, brush tools, interpolation, etc. are out of scope per PROJECT.md.
-
-**Depends on:** Sample detail modal (existing). Annotation overlay rendering (existing).
-
-**Recommendation:** Implement bbox-only editing in the sample detail modal: select -> move/resize/delete -> save. No polygon editing, no new annotation creation (that is CVAT territory). Add undo/redo with a simple command stack. This covers 90% of "quick correction" needs.
+**"Most confused pairs" summary:** Extract the top-N off-diagonal confusion matrix cells and present as a ranked list: "Class '3' confused with '8' (N=23 times)" etc. This is the single most actionable view for classification debugging.
 
 ---
 
-### 2B. Create New Annotations
-
-**What competitors do:**
-
-FiftyOne: Not possible in-app. Must use CVAT/Label Studio integration.
+### TS-6: Classification Prediction Import
 
-Encord: Full creation workflow -- select a tool (bbox, polygon, etc.), draw on the image, assign class from ontology, save. Ontology-driven: classes are defined upfront in a project ontology.
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | Users need to compare model predictions against ground truth. |
+| **Complexity** | Low |
+| **Depends on** | Prediction parser (existing), JSONL parser (TS-1) |
 
-**What DataVisor has:** No annotation creation.
+**What to build:** Extend the prediction import dialog to accept classification predictions. Two formats:
 
-**Gap:** Sometimes during triage, a user finds a missing annotation (false negative) and wants to add a bounding box. This is a natural part of the correction workflow.
+1. **JSONL format** (matching ingestion): `{"image":"filename.jpg","suffix":"predicted_class","confidence":0.95}`
+2. **CSV format** (simpler): `filename,predicted_class,confidence`
 
-**Priority:** NICE-TO-HAVE for v1.1. The primary workflow is editing existing annotations, not creating new ones. New annotation creation can be added in v1.2 if users request it.
+The existing prediction import flow stores predictions as annotations with `source != 'ground_truth'`. For classification, each prediction is one annotation per image (instead of potentially many bboxes).
 
-**Complexity:** MEDIUM. Requires: draw-to-create interaction, class assignment UI, persistence to DuckDB.
-
-**Depends on:** Annotation editing (2A).
-
-**Recommendation:** Defer to v1.2. Focus v1.1 on edit/delete of existing annotations. If added later, scope to bounding boxes only (click-drag to create, assign class from existing class list).
+**Confidence handling:** Classification models often output a probability distribution over all classes. For DataVisor's single-label scope, only the top-1 prediction and its confidence are imported. Top-k support is deferred to a future milestone (see "Top-K evaluation" under Anti-Features).
 
 ---
 
-### 2C. Annotation Backend Integration (CVAT, Label Studio)
-
-**What competitors do:**
+### TS-7: Sample Detail Modal Adaptation
 
-FiftyOne's annotation integration is a key feature:
-- `dataset.annotate()` uploads samples to CVAT/Label Studio/Labelbox
-- Configurable permissions: `allow_additions`, `allow_deletions`, `allow_label_edits`, `allow_spatial_edits`
-- Label schema defines task type, classes, and custom attributes
-- `dataset.load_annotations()` merges results back
-- Annotation runs are tracked: rename, inspect, delete
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | Clicking an image must show useful information. The detection modal shows bboxes; the classification modal should show class info. |
+| **Complexity** | Medium |
+| **Depends on** | Sample modal (existing), annotation editor (existing), dataset `task_type` |
 
-Encord IS an annotation platform, so this is not "integration" but rather native capability.
+**What to build:** Conditional rendering in `sample-modal.tsx` based on `task_type`:
 
-**What DataVisor has:** No integration with external annotation tools.
+**For classification datasets:**
+- Remove bbox overlay, editable-rect, draw-layer components
+- Show GT class label prominently (large text above image)
+- Show predicted class label (if exists) with confidence score
+- Show correct/incorrect status with color coding (green/red)
+- Show class change dropdown (for editing the GT label -- replaces bbox editing)
+- Retain: similarity panel, tags, triage overlay, keyboard navigation
 
-**Gap:** For heavy annotation tasks (re-labeling hundreds of samples), an integration with CVAT would be valuable. But DataVisor is a personal tool, and setting up CVAT is non-trivial.
-
-**Priority:** NICE-TO-HAVE for v1.1. Most users of a personal introspection tool will make quick fixes in-app, not set up a separate CVAT instance. Defer until there is demonstrated need.
-
-**Complexity:** HIGH. Requires CVAT API integration, task creation, status tracking, result merging.
-
-**Depends on:** Nothing, but is only useful if the user has CVAT/Label Studio running.
-
-**Recommendation:** Defer indefinitely. Instead, support exporting flagged samples to COCO format (from 1E), which can be imported into any annotation tool. This achieves the same goal without tight coupling.
+**The annotation editor** currently supports bbox move/resize/delete and class change. For classification, only class change is relevant. The `class-picker.tsx` component (dropdown to change category) works as-is.
 
 ---
 
-## 3. Error Triage & Quality Analysis
-
-### 3A. Interactive Evaluation Dashboard (Confusion Matrix, PR Curves, Per-Class AP)
-
-**What competitors do:**
-
-FiftyOne's Model Evaluation panel is a standout feature:
-- Interactive confusion matrix: click any cell to filter the grid to those specific GT/prediction pairs
-- PR curves with adjustable confidence thresholds
-- Per-class metrics: precision, recall, F1, AP
-- All metrics are linked to the dataset view -- changing filters updates the evaluation metrics
-- Subset evaluation: `use_subset()` to evaluate on specific conditions (e.g., only nighttime images)
+### TS-8: Statistics Dashboard Adaptation
 
-Encord Active provides model quality metrics focused on active learning:
-- Entropy, Least Confidence, Margin, Variance, Mean Object Confidence
-- These rank samples by uncertainty for prioritized re-annotation
+| Attribute | Detail |
+|-----------|--------|
+| **Why expected** | The overview tab shows annotation counts, class distribution, split breakdown. These need to reflect classification semantics. |
+| **Complexity** | Low |
+| **Depends on** | Stats dashboard (existing), statistics hooks (existing) |
 
-**What DataVisor has:** Error categorization (TP/FP/FN/Label Error) and dataset statistics dashboard (class distribution, annotation counts). No confusion matrix, no PR curves, no per-class AP.
+**What to build:** Adapt dashboard text and metrics for classification context:
 
-**Gap:** The confusion matrix with click-to-filter is FiftyOne's killer evaluation feature. A CV engineer evaluating a model needs to see "my model confuses 'car' with 'truck' 40% of the time" and then immediately see those misclassified samples. DataVisor has the error categorization but lacks the statistical visualization layer.
+| Detection term | Classification term |
+|----------------|-------------------|
+| "Annotations" | "Labeled images" |
+| "Annotations per image" histogram | "Class distribution" (same chart, simpler) |
+| "Bounding box area" histogram | Remove (not applicable) |
+| mAP/AP metrics cards | Accuracy/F1 metrics cards |
+| IoU threshold slider | Remove |
 
-**Priority:** TABLE STAKES for a model evaluation tool. DataVisor already has GT vs Predictions comparison, but without aggregate metrics (confusion matrix, mAP, per-class AP), the evaluation is sample-by-sample rather than systematic.
-
-**Complexity:** HIGH.
-- Confusion matrix: Aggregate TP/FP/FN by class pair, render interactive heatmap, click-to-filter
-- PR curves: Sweep confidence thresholds, compute precision/recall per class, Recharts line chart
-- Per-class AP: Standard COCO-style AP computation
-- All must link to the grid view for click-to-filter
-
-**Depends on:** Evaluation pipeline (existing -- TP/FP/FN matching). Statistics dashboard (existing -- extend it). Recharts (existing in stack).
-
-**Recommendation:** Build the confusion matrix with click-to-filter as the centerpiece. This single feature closes the biggest evaluation gap. PR curves and per-class AP can follow. Use Recharts (already in the stack) for visualization.
+The underlying data is the same (annotations table rows), but the presentation changes. The class distribution chart works identically -- it counts annotations per category, which for classification is images per class.
 
 ---
 
-### 3B. Quality Scoring Metrics (Uniqueness, Hardness, Mistakenness)
-
-**What competitors do:**
-
-FiftyOne Brain provides four computed quality scores:
-- **Uniqueness:** Non-duplicate detection, comparing image content across the dataset. Useful for deduplication and early-stage data selection.
-- **Hardness:** Per-sample difficulty during training, computed from model logits. Helps identify which unlabeled examples deserve annotation budget.
-- **Mistakenness:** Annotation error probability, computed from model logits. Identifies likely mislabeled samples. Works on classification and detection.
-- **Representativeness:** How typical a sample is, revealing common data modes vs outliers.
-
-Additionally:
-- **Exact duplicate detection:** Identifies identical files with different names.
-- **Near-duplicate detection:** Finds visually similar images that may cause data quality issues.
-- **Leaky splits detection:** Finds potential data leakage between train/test/val splits.
-
-Encord Active provides 25+ quality metrics in three categories:
-- **Data quality:** Brightness (0-1), Sharpness (0-1), Uniqueness, Area, Diversity
-- **Label quality:** Border Proximity, Broken Object Tracks, Classification Quality, Label Duplicates, Object Classification Quality, Annotation Quality Score, Relative Area, Aspect Ratio
-- **Issue shortcuts:** Pre-configured filters for common problems -- Duplicates (uniqueness < 0.00001), Blur (sharpness < 0.005), Dark (brightness < 0.1), Bright (brightness > 0.7), Low Annotation Quality (quality < 0.02)
+## Differentiators
 
-**What DataVisor has:** Error categorization (Hard FP, Label Error, FN) and Pydantic AI agent for pattern detection. No per-sample quality scores. No deduplication. No hardness/mistakenness scoring.
+Features that set DataVisor apart. Not expected, but valuable. Build these after table stakes.
 
-**Gap:** DataVisor categorizes errors but does not score individual samples on quality dimensions. FiftyOne's mistakenness score and Encord's issue shortcuts are the most actionable features -- they surface the "worst" samples automatically.
+### D-1: Misclassification Drill-Down View
 
-**Priority:** TABLE STAKES for the "worst images ranking" feature planned in v1.1. A combined quality score requires component metrics.
+| Attribute | Detail |
+|-----------|--------|
+| **Value proposition** | Click a confused pair in the confusion matrix and see side-by-side examples of "predicted 8, actually 3" with the images. No other lightweight tool does this well. |
+| **Complexity** | Medium |
+| **Depends on** | Confusion matrix click-to-filter (TS-4), classification error analysis (TS-5) |
 
-**Complexity:** MEDIUM per metric.
-- Image uniqueness: Compute from existing DINOv2 embeddings + Qdrant similarity search (infrastructure exists)
-- Image brightness/sharpness: Simple image processing metrics (OpenCV)
-- Near-duplicate detection: Cosine similarity threshold on existing embeddings
-- Mistakenness: Requires model logits (not just predictions), which DataVisor does not currently import
+**What to build:** When a user clicks a confusion matrix cell (i, j), show a dedicated panel with:
+1. All images where GT=class_i and prediction=class_j
+2. Thumbnails with both labels visible (badge: "GT: 3 / Pred: 8")
+3. Sort by confidence (most confident mistakes first -- these are the most concerning)
+4. One-click ability to correct the GT label if it is actually wrong (label error)
 
-**Depends on:** DINOv2 embeddings (existing). Qdrant (existing). Evaluation pipeline (existing).
-
-**Recommendation:** For v1.1, implement:
-1. **Near-duplicate detection** using existing embeddings (low effort, high value)
-2. **Image quality metrics** (brightness, sharpness, contrast) for the AI agent
-3. **Composite "worst sample" score** combining: error count + low confidence + low uniqueness
-
-Defer hardness and mistakenness -- they require model logits, which would need a new import schema.
+This extends the existing click-to-filter behavior to be richer for classification. FiftyOne shows the filtered sample list, but DataVisor can show a purpose-built comparison view.
 
 ---
 
-### 3C. Error Triage Workflow (Review, Tag, Resolve)
-
-**What competitors do:**
-
-FiftyOne's triage workflow is programmatic:
-1. Run `evaluate_detections()` to tag TP/FP/FN
-2. Create a view filtering to FP or FN samples
-3. Browse in the App, optionally clicking confusion matrix cells
-4. Batch-tag samples via the App's tag icon
-5. Programmatically process tagged samples
-
-FiftyOne App batch operations include: select samples in grid -> tag selected -> clone selected -> delete selected -> delete selected labels. Selection works via checkbox on each sample.
-
-Encord's triage workflow is more structured:
-1. Encord Active surfaces issues via quality metrics and shortcuts (Blur, Dark, Low Quality, etc.)
-2. Users create "Collections" -- saved groups of problematic data units
-3. Issues can be tagged and tracked in Project Analytics
-4. Workflows route flagged samples back to annotation stages (Annotate -> Review -> Approve pipeline)
-5. Review mode: approve (N key), reject (B key), toggle review edit mode (Ctrl+E)
-6. "Data Agents" automate triage by integrating foundation models into workflows
-
-**What DataVisor has:** Error categorization (Hard FP, Label Error, FN), bulk tagging, saved views, AI agent recommendations. No structured review-approve workflow. No issue tracking.
+### D-2: Class-Level Performance Sparklines
 
-**Gap:** The gap is not in error detection (DataVisor's categorization + AI agent is strong) but in the triage workflow UX:
-- No dedicated "error review" mode that dims non-error samples
-- No approve/reject/skip workflow for reviewing flagged items
-- No progress tracking (reviewed 45/120 flagged samples)
+| Attribute | Detail |
+|-----------|--------|
+| **Value proposition** | At-a-glance view of which classes perform well and which are disasters, without reading a table of numbers. |
+| **Complexity** | Low |
+| **Depends on** | Per-class metrics (TS-3), Recharts (existing) |
 
-**Priority:** DIFFERENTIATOR -- a focused error triage mode with keyboard-driven review (approve/reject/skip) would be faster than both FiftyOne's programmatic approach and Encord's multi-platform workflow.
-
-**Complexity:** MEDIUM.
-- Review mode: filter to error samples, highlight current, dim others
-- Keyboard: N = correct (remove error tag), B = confirmed error, Space = skip
-- Progress: "Reviewed 45/120 -- 23 confirmed errors, 22 false alarms"
-- Persistence: Track review status per sample
-
-**Depends on:** Error categorization (existing). Tagging (existing). Keyboard shortcuts (new, see section 5).
-
-**Recommendation:** Build a dedicated "Triage Mode" that enters a focused review workflow: shows one error sample at a time, keyboard-driven approve/reject/skip, progress tracking, auto-advances to next sample. This is the kind of opinionated UX that makes DataVisor better than FiftyOne for error review, where FiftyOne forces users to manually browse and tag.
+**What to build:** In the per-class metrics table, add inline sparkline-style bars for precision, recall, and F1. Color-code: green (>0.9), yellow (0.7-0.9), red (<0.7). By default, sort the table so the worst-performing classes appear first, surfacing problems immediately.
 
 ---
 
-### 3D. Worst Images Ranking (Combined Quality Score)
-
-**What competitors do:**
-
-FiftyOne Brain's `compute_hardness()` and `compute_mistakenness()` each produce a per-sample float score that can be sorted to find the worst samples. Users combine multiple scores by creating computed fields:
-```python
-dataset.set_field("quality_score", F("mistakenness") + F("hardness"))
-```
-
-Encord Active's issue shortcuts pre-define thresholds (blur < 0.005, dark < 0.1, etc.) and surface samples that fail multiple checks.
-
-Neither platform has a single "worst images" composite ranking out of the box.
+### D-3: Top-K Confidence Distribution
 
-**What DataVisor has:** Error categorization but no numeric ranking. The AI agent detects patterns but does not rank individual samples.
+| Attribute | Detail |
+|-----------|--------|
+| **Value proposition** | Shows where the model is uncertain vs confident, separated by correct/incorrect predictions. Reveals overconfident mistakes. |
+| **Complexity** | Medium |
+| **Depends on** | Classification predictions with confidence (TS-6), Recharts (existing) |
 
-**Gap:** A composite "data quality score" that ranks every sample by how problematic it is. This would power the "Smart worst images ranking" feature planned for v1.1.
+**What to build:** Two overlaid histograms:
+1. Confidence distribution of **correct** predictions (expect: mass concentrated near 1.0, i.e. left-skewed with a thin tail toward low confidence)
+2. Confidence distribution of **incorrect** predictions (expect: more spread out)
 
-**Priority:** DIFFERENTIATOR -- neither competitor does this as a first-class feature. A "Problems" tab showing samples ranked by composite badness score is novel.
+If the incorrect predictions have high confidence, the model is dangerously overconfident. If they cluster at low confidence, a simple threshold can filter them.
 
-**Complexity:** MEDIUM.
-- Define component metrics: error count, confidence variance, brightness, sharpness, near-duplicate distance, annotation density
-- Normalize each to 0-1
-- Weighted combination into single score
-- Store in DuckDB, expose as sortable field
-- UI: "Worst Images" view sorted by composite score
-
-**Depends on:** Quality metrics (3B). Error categorization (existing).
-
-**Recommendation:** Define a `quality_score` field computed from: (a) number of errors on sample, (b) mean prediction confidence (low = uncertain), (c) near-duplicate distance (high = unusual), (d) image quality metrics. Surface as a sortable column and as a dedicated "Worst Images" view.
+Also: **confidence calibration plot** (reliability diagram) showing predicted confidence vs actual accuracy. Most models are poorly calibrated, and this visualization makes it obvious.
 
 ---
 
-## 4. Deployment & Infrastructure
-
-### 4A. Docker Deployment
-
-**What competitors do:**
-
-FiftyOne (open-source) provides a Dockerfile for building custom images:
-- Configurable Python version
-- Persistent `/fiftyone` directory for databases and datasets
-- Docker Compose not officially provided for OSS, but community examples exist
-
-FiftyOne Enterprise provides:
-- Helm chart for Kubernetes deployment (helm.fiftyone.ai)
-- Docker Compose for smaller deployments
-- Central Authentication Service (CAS)
-- Multi-container architecture: app, API, database, CAS
-
-Encord is SaaS-only -- no self-hosted Docker deployment. Data stays in user's cloud storage; the platform is hosted by Encord.
+### D-4: Per-Split Evaluation Comparison
 
-**What DataVisor has:** No Docker support. Runs locally with `uvicorn` + `npm run dev`.
+| Attribute | Detail |
+|-----------|--------|
+| **Value proposition** | Compare model performance across train/val/test splits. Large gap between train and val accuracy immediately reveals overfitting. |
+| **Complexity** | Low |
+| **Depends on** | Split handling (existing), classification evaluation (TS-3) |
 
-**Gap:** DataVisor needs Docker for cloud VM deployment (per PROJECT.md). This is the most basic deployment gap.
-
-**Priority:** TABLE STAKES for v1.1 (explicitly in scope per milestone definition).
-
-**Complexity:** MEDIUM.
-- Dockerfile: Multi-stage build (Python backend + Node frontend build)
-- Docker Compose: Backend, frontend, Qdrant services
-- Volume mounts: Dataset storage, DuckDB database, Qdrant data
-- Environment configuration: Image source paths, GPU support (optional)
-
-**Depends on:** Nothing. Can be built in parallel with features.
-
-**Recommendation:** Multi-stage Dockerfile: (1) Node build stage for frontend, (2) Python runtime with bundled frontend. Docker Compose with three services: app, qdrant, and an init container for setup. Map volumes for `/data` (datasets), `/db` (DuckDB), `/qdrant` (vectors).
+**What to build:** A comparison table/chart showing accuracy, macro F1, and per-class F1 side-by-side for each split. Highlight cells where test performance is significantly worse than train (a drop of more than 5 percentage points). This is trivial to compute (run classification evaluation per split) but extremely informative.
 
 ---
 
-### 4B. Authentication
-
-**What competitors do:**
-
-FiftyOne OSS: No authentication. Anyone with access to the port can use it.
-
-FiftyOne Enterprise: Full auth via Central Authentication Service (CAS), supporting OIDC/OAuth2, Auth0, and air-gapped deployments. Role-based access control with user groups and permissions.
+### D-5: Embedding Scatter with Classification Coloring
 
-Encord: Cloud-hosted with SSO, SAML, team management, SOC-2/HIPAA/GDPR compliance.
+| Attribute | Detail |
+|-----------|--------|
+| **Value proposition** | The existing t-SNE scatter plot colored by class label instantly shows cluster quality. Misclassifications visible as dots in the wrong cluster. |
+| **Complexity** | Low |
+| **Depends on** | Embedding scatter (existing), classification labels |
 
-**What DataVisor has:** No authentication. Open port.
+**What to build:** The existing embedding scatter already supports coloring by class. For classification datasets, default to coloring by GT class label. Add a toggle to color by:
+1. **GT class** (default) -- shows natural cluster structure
+2. **Predicted class** -- shows model's view of the data
+3. **Correct/incorrect** -- highlights all misclassifications as red dots
 
-**Gap:** When deployed on a cloud VM, the app is exposed to the internet. Basic auth is the minimum security requirement. This is explicitly in scope for v1.1.
-
-**Priority:** TABLE STAKES for cloud deployment. Without auth, anyone who discovers the URL can access your dataset.
-
-**Complexity:** LOW.
-- Single-user basic auth (username/password from environment variable)
-- Applied as middleware on all API routes and frontend routes
-- No user management, no RBAC, no SSO -- just a password gate
-
-**Depends on:** Docker deployment (4A).
-
-**Recommendation:** Implement as FastAPI middleware: check `Authorization: Basic ...` header against `DATAVISOR_USERNAME` / `DATAVISOR_PASSWORD` env vars. Frontend: show login form, store token in session. This is explicitly scoped as single-user in PROJECT.md -- do not over-engineer.
+Option 3 is the killer feature: overlay misclassification status on the embedding plot. Misclassified samples that are near the decision boundary (cluster edge) are expected. Misclassified samples deep inside a correct cluster suggest label errors.
 
 ---
 
-### 4C. Cloud Deployment Scripts
-
-**What competitors do:**
-
-FiftyOne Enterprise: Helm chart for Kubernetes with detailed docs (helm.fiftyone.ai). Community Docker deployment guides.
-
-FiftyOne OSS: Remote sessions via SSH port forwarding (`fiftyone app connect --destination user@host`). This is the simplest cloud access pattern.
+## Anti-Features
 
-Encord: No deployment needed (SaaS).
+Features to explicitly NOT build for this milestone.
 
-**What DataVisor has:** No deployment scripts.
-
-**Gap:** PROJECT.md specifies "GCP deployment script + local run script with setup instructions."
-
-**Priority:** TABLE STAKES for v1.1 (explicitly in scope).
-
-**Complexity:** LOW-MEDIUM.
-- `scripts/deploy-gcp.sh`: Create GCE instance, install Docker, pull/build image, start compose
-- `scripts/run-local.sh`: Docker compose up with sensible defaults
-- Documentation: Setup instructions, port configuration, data mounting
-
-**Depends on:** Docker deployment (4A). Auth (4B).
-
-**Recommendation:** Provide two scripts: (1) `run-local.sh` for Docker Compose on local machine, (2) `deploy-gcp.sh` for GCE VM provisioning with startup script. Both use Docker Compose. The GCP script should configure firewall rules for port 443 only with auth required.
+| Anti-Feature | Why Avoid | What to Do Instead |
+|--------------|-----------|-------------------|
+| **Multi-label classification** | Different data model (multiple labels per image), different metrics (hamming loss, subset accuracy), different UI (checkbox lists instead of single badge). Scope explosion. | Scope to single-label only. Add multi-label in a future milestone if needed. |
+| **Top-K evaluation** | Requires importing full probability distributions (one probability per class for every image). Complicates prediction import schema significantly. | Import only top-1 prediction with confidence. Note: the confidence score captures some of this info already. |
+| **PR curves for classification** | PR curves are less informative for multi-class classification than for detection. The confusion matrix and per-class precision/recall table are better tools. Confidence-based filtering (existing) handles the threshold sweep use case. | Show per-class precision/recall in a table. Use the confidence histogram (D-3) for threshold analysis. |
+| **mAP for classification** | mAP is a detection metric (requires IoU). Accuracy and macro F1 are the standard classification metrics. Showing mAP would confuse users. | Show accuracy, macro F1, weighted F1. |
+| **Bbox editing for classification** | No bounding boxes. The editable-rect, draw-layer components are irrelevant. | Show class label editor (dropdown) instead. |
+| **IoU threshold controls** | No spatial matching, no IoU. Showing an IoU slider would confuse users. | Hide IoU controls when `task_type === 'classification'`. |
+| **Detection-specific error categories** | "Hard FP" (no nearby GT box) has no meaning in classification. "Label Error" (correct box, wrong class) conflates with misclassification. | Use simpler categories: Correct, Misclassified, Missing Prediction, Spurious Prediction. |
 
 ---
 
-### 4D. Remote Sessions / Tunnel Access
-
-**What competitors do:**
-
-FiftyOne supports remote sessions natively:
-```bash
-# On remote machine
-fiftyone app launch --remote --port 5151
+## Feature Dependencies
 
-# On local machine
-fiftyone app connect --destination user@remote --port 5151
 ```
-This sets up SSH port forwarding automatically. Users can also manually forward: `ssh -N -L 5151:localhost:5151 user@remote`.
-
-**What DataVisor has:** No remote session support.
-
-**Gap:** Minor gap if Docker + auth is implemented (users just hit the URL). SSH tunneling is a nice developer convenience but not essential when basic auth exists.
-
-**Priority:** NICE-TO-HAVE. Docker + auth covers the primary use case.
-
-**Complexity:** LOW. Document the SSH tunnel approach: `ssh -N -L 8080:localhost:8080 user@vm`.
-
-**Depends on:** Docker deployment (4A).
-
-**Recommendation:** Document SSH tunneling as an alternative to basic auth for security-conscious users. No code needed -- just docs.
-
----
-
-## 5. Keyboard Shortcuts & Power-User UX
-
-### 5A. Core Navigation Shortcuts
-
-**What competitors do:**
-
-FiftyOne App shortcuts (accessed via `?` key):
-- `?` -- Show all shortcuts
-- `z` -- Crop/zoom to visible labels
-- `ESC` -- Reset view
-- Arrow keys (up/down) -- Rotate z-order of overlapping labels
-- Spacebar -- Play/pause video
-- `<` / `>` -- Frame-by-frame navigation (video, when paused)
-- `0-9` -- Seek to 0%-90% of video
-- Grid filtering and sorting via sidebar (no keyboard shortcuts for grid navigation)
-
-FiftyOne notably does NOT have keyboard shortcuts for: navigating between samples in the grid, toggling label visibility by keyboard, or sample selection by keyboard. These are open feature requests (GitHub issues #2120, #1761).
-
-Encord annotation editor shortcuts (comprehensive):
-- **Navigation:** Arrow keys (next/previous sample, frame navigation), Space (play/pause)
-- **Editing:** Ctrl+Z/Ctrl+Shift+Z (undo/redo), Backspace (delete), Ctrl+C/V (copy/paste)
-- **Review:** N (approve), B (reject), Ctrl+E (toggle review edit)
-- **Tools:** D (freehand drawing), G (brush), H (eraser), `[`/`]` (brush size)
-- **Annotation:** A (add vertex), S (remove vertex), F (edit vertex), Enter (complete), Esc (cancel)
-- **Display:** Shift+H (hide all labels), Shift+N (show object names)
-- **Bulk:** Ctrl+A (select all), Shift+D (remove from frame)
-- **Meta:** Ctrl+Shift+K (open shortcuts menu), Ctrl+S (save), Shift+Enter (submit task)
-
-**What DataVisor has:** No keyboard shortcuts.
-
-**Gap:** Keyboard navigation is expected by power users. Both competitors support it, though FiftyOne's implementation is incomplete (no grid navigation shortcuts).
-
-**Priority:** TABLE STAKES for power-user adoption. CV engineers reviewing hundreds of samples expect keyboard navigation. The triage workflow (3C) depends on this.
-
-**Complexity:** MEDIUM.
-
-**Depends on:** Sample detail modal (existing). Grid view (existing). Triage mode (new, 3C).
-
-**Recommendation:** Implement in two tiers:
-
-**Tier 1 (v1.1 must-have):**
-| Shortcut | Action |
-|----------|--------|
-| `?` | Show shortcuts help overlay |
-| `ArrowLeft` / `ArrowRight` | Previous/next sample in modal |
-| `ESC` | Close modal / cancel action |
-| `Space` | Toggle label visibility |
-| `G` | Toggle GT labels |
-| `P` | Toggle prediction labels |
-| `T` | Tag current sample |
-| `Delete` / `Backspace` | Delete selected annotation (when editing) |
-| `Ctrl+Z` / `Cmd+Z` | Undo (when editing) |
-| `1-9` | Quick-assign class by index (when editing) |
-
-**Tier 2 (v1.1 nice-to-have):**
-| Shortcut | Action |
-|----------|--------|
-| `J` / `K` | Navigate grid (previous/next row) |
-| `Enter` | Open selected sample in modal |
-| `E` | Enter edit mode on selected annotation |
-| `F` | Toggle fullscreen on modal |
-| `/` | Focus search bar |
-| `N` / `B` | Approve / Reject in triage mode |
-
----
-
-### 5B. Customizable Hotkeys
-
-**What competitors do:**
-
-Encord allows customizable hotkeys: users can remap keyboard shortcuts to match their workflow preferences. Shortcuts menu via Ctrl+Shift+K.
-
-FiftyOne does not support customizable hotkeys.
-
-**What DataVisor has:** No shortcuts at all.
-
-**Gap:** Minor. Fixed shortcuts with good defaults cover 95% of needs.
-
-**Priority:** NICE-TO-HAVE. Not worth the complexity for v1.1.
-
-**Complexity:** MEDIUM. Requires a settings UI and keymap storage.
-
-**Recommendation:** Defer. Ship with sensible fixed defaults. Revisit if users request remapping.
-
----
-
-## 6. View & Workspace Management
-
-### 6A. Custom Workspaces / Panel Layouts
-
-**What competitors do:**
-
-FiftyOne Spaces (since v0.19) allow:
-- Multiple panels open simultaneously (Grid, Embeddings, Histograms, Map, Model Evaluation)
-- Split panels horizontally or vertically
-- Drag tabs between panels
-- Save workspace layouts with name, description, and color
-- Load saved workspaces programmatically or via UI
-- Workspace state includes panel types, sizes, positions, and internal panel state
-
-Encord does not have customizable workspace layouts -- it uses a fixed editor interface.
-
-**What DataVisor has:** Fixed layout with grid view and side-by-side embedding panel.
-
-**Gap:** FiftyOne's workspace system is mature and powerful. However, DataVisor's fixed layout already shows grid + embeddings + sidebar, which covers the primary workflow. Multi-panel workspaces are a power feature with diminishing returns for a personal tool.
-
-**Priority:** NICE-TO-HAVE for v1.1. The current layout works.
-
-**Complexity:** HIGH. Requires a panel framework, drag-and-drop layout, persistence.
-
-**Depends on:** Nothing, but affects all existing UI components.
-
-**Recommendation:** Defer to v1.2+. Focus v1.1 on the single-layout experience with the planned new features (triage mode, evaluation dashboard). If workspaces are ever added, start with a simple tab system rather than full drag-and-drop panels.
-
----
-
-### 6B. Histograms / Distribution Panels
-
-**What competitors do:**
-
-FiftyOne has a Histograms panel that shows:
-- Distribution of any field (class labels, confidence scores, metadata values)
-- Interactive: click histogram bars to filter the grid
-- Updates automatically as the view changes
-
-Encord Active shows metric distributions for each quality metric.
-
-**What DataVisor has:** Dataset statistics dashboard with class distribution (bar chart) and annotation counts. Not interactive (clicking does not filter).
-
-**Gap:** Interactive histograms that filter the grid are a natural extension of the existing statistics dashboard.
-
-**Priority:** DIFFERENTIATOR -- interactive histograms (click bar to filter) would connect the statistics dashboard to the grid view, enabling quick data exploration by distribution.
-
-**Complexity:** MEDIUM.
-- Render histograms for any numeric/categorical field (Recharts, already in stack)
-- Click handler: clicking a bar adds a filter to the sidebar
-- Bidirectional: changing sidebar filters updates histogram highlighting
-
-**Depends on:** Statistics dashboard (existing). Sidebar filtering (existing). Recharts (existing).
-
-**Recommendation:** Make the existing statistics dashboard interactive. When a user clicks on a class in the distribution chart, filter the grid to that class. When they click a confidence range bar, filter to that range. This requires minimal new UI -- just adding click handlers to existing Recharts components and dispatching filter actions to the Zustand store.
-
----
-
-### 6C. Map / Geolocation Panel
-
-**What competitors do:**
-
-FiftyOne has a Map panel (Mapbox GL JS) for datasets with GeoLocation fields:
-- Scatterplot of sample locations on a map
-- Lasso selection on the map filters the grid
-- Multiple map types
-
-Encord does not have a map panel.
-
-**What DataVisor has:** No geolocation support.
-
-**Gap:** Only relevant for datasets with GPS metadata (autonomous driving, satellite imagery, drone footage).
-
-**Priority:** NICE-TO-HAVE. Out of scope for v1.1 unless the user's datasets include geolocation.
-
-**Complexity:** MEDIUM. Mapbox GL JS integration, GeoJSON field handling.
-
-**Recommendation:** Defer. Only build if there is a specific need for geolocation-aware datasets.
-
----
-
-## 7. Advanced Features
-
-### 7A. Model Zoo (Run Inference In-App)
-
-**What competitors do:**
-
-FiftyOne Model Zoo provides:
-```python
-import fiftyone.zoo as foz
-model = foz.load_zoo_model("faster-rcnn-resnet50-fpn-coco-torch")
-dataset.apply_model(model, label_field="predictions")
+[TS-1: JSONL Parser + Schema]
+    |
+    +---> [TS-2: Class Label Badge]
+    |         |
+    |         +---> [D-5: Embedding Coloring] (uses class labels; correct/incorrect coloring also needs TS-6)
+    |
+    +---> [TS-6: Prediction Import]
+    |         |
+    |         +---> [TS-3: Classification Eval Metrics]
+    |         |         |
+    |         |         +---> [TS-4: Confusion Matrix Adaptation]
+    |         |         |         |
+    |         |         |         +---> [D-1: Misclassification Drill-Down]
+    |         |         |
+    |         |         +---> [TS-5: Error Analysis]
+    |         |         |         |
+    |         |         |         +---> [D-3: Confidence Distribution]
+    |         |         |
+    |         |         +---> [D-2: Per-Class Sparklines]
+    |         |         |
+    |         |         +---> [D-4: Per-Split Comparison]
+    |         |
+    +---> [TS-7: Detail Modal Adaptation]
+    |
+    +---> [TS-8: Stats Dashboard Adaptation]
 ```
-- 70+ pre-trained models from PyTorch and TensorFlow
-- `apply_model()` runs inference and stores predictions as label fields
-- `compute_embeddings()` generates embeddings from any model
-- Custom model support via `TorchImageModel` class
-
-Encord integrates models via "Data Agents" for pre-labeling and automated review.
-
-**What DataVisor has:** Import pre-computed predictions (JSON). VLM auto-tagging (Moondream2). No general model inference.
 
-**Gap:** DataVisor imports predictions but does not run inference. Users must run models externally and import results.
+**Critical path:** TS-1 (parser/schema) unblocks everything. TS-6 (prediction import) unblocks all evaluation features. TS-3 (metrics) unblocks all downstream analysis.
 
-**Priority:** NICE-TO-HAVE for v1.1. The import-predictions workflow is sufficient for a personal tool. Running inference adds GPU management complexity.
-
-**Complexity:** HIGH. Model download, GPU scheduling, inference pipeline, result storage.
-
-**Depends on:** Prediction import (existing).
-
-**Recommendation:** Defer. The existing "import predictions" workflow is pragmatic. Running inference is a different product surface. If added later, start with a single model (e.g., YOLOv8) rather than a full zoo.
+**Parallelizable:** TS-2 (badge display), TS-7 (detail modal), and TS-8 (stats dashboard) can be built in parallel once TS-1 is complete. They all depend on having classification data in the database but not on each other.
 
 ---
 
-### 7B. Similarity Search UX
-
-**What competitors do:**
+## MVP Recommendation
 
-FiftyOne supports multiple similarity backends:
-- scikit-learn, Qdrant, Redis, Pinecone, MongoDB, Elasticsearch, Milvus, LanceDB
-- "Find similar" from any sample: `dataset.sort_by_similarity(sample_id, k=25)`
-- Image-level and patch-level (object crop) similarity
-- Text-to-image similarity via CLIP embeddings
+**Phase 1 (Core Ingestion + Display):**
+1. TS-1: JSONL ingestion parser + schema extension
+2. TS-2: Class label badge on thumbnails
+3. TS-7: Sample detail modal adaptation
+4. TS-8: Statistics dashboard adaptation
 
-Encord Active provides similarity search, natural language search, and image-based search.
+This gets a classification dataset loaded, browsable, and visually meaningful. Users can explore the dataset, see class distribution, filter by class, and use the embedding scatter.
 
-**What DataVisor has:** Qdrant vector storage for similarity search. The infrastructure exists but there is no "find similar" UI interaction.
+**Phase 2 (Evaluation + Error Analysis):**
+5. TS-6: Classification prediction import
+6. TS-3: Classification evaluation metrics
+7. TS-4: Confusion matrix adaptation
+8. TS-5: Classification error analysis
 
-**Gap:** The backend capability exists but the UX is missing. Users cannot right-click a sample and say "find similar images."
+This enables the full GT-vs-predictions workflow: import predictions, see accuracy/F1, explore the confusion matrix, identify misclassified samples.
 
-**Priority:** TABLE STAKES -- the infrastructure is already built. Exposing it via UI is low-hanging fruit with high value.
+**Phase 3 (Differentiators):**
+9. D-5: Embedding coloring by correct/incorrect (low effort, high impact)
+10. D-1: Misclassification drill-down view
+11. D-2: Per-class sparklines
+12. D-3: Confidence distribution histogram
+13. D-4: Per-split comparison
 
-**Complexity:** LOW. Add a "Find Similar" button/context menu item on each sample that queries Qdrant and updates the grid view.
-
-**Depends on:** Qdrant similarity search (existing). Grid view (existing).
-
-**Recommendation:** Add a "Find Similar" action to the sample detail modal and grid context menu. Query Qdrant for the k nearest neighbors by embedding, display results in the grid. This is one of the highest value-to-effort features available.
+**Defer:** Multi-label classification, top-k evaluation, PR curves, mAP.
 
 ---
 
-### 7C. Plugin System Enhancement (Python Panels, Operators)
-
-**What competitors do:**
+## Existing Features That Work As-Is for Classification
 
-FiftyOne's plugin system (mature, since v0.17+):
-- **Panels:** Full React components embedded in the App, with Python backend logic
-- **Python Panels (since v0.25):** Write panels entirely in Python (no JS needed)
-- **Operators:** User-facing actions (simple to complex) that can be composed
-- Configuration via `fiftyone.yml` manifest
-- Plugin marketplace and curated plugin list
+These features require no architectural changes; the few minor adaptations needed are noted per row:
 
-**What DataVisor has:** `BasePlugin` class with ingestion/UI/transformation hooks.
-
-**Gap:** DataVisor's plugin system is simpler by design (Python-only). The gap is not in architecture but in ecosystem -- there are no third-party plugins yet.
-
-**Priority:** NICE-TO-HAVE for v1.1. The plugin system exists and works. Enhancements are not urgent.
-
-**Complexity:** Varies by enhancement.
-
-**Recommendation:** No plugin system changes for v1.1. Focus on core features. The existing `BasePlugin` is sufficient for extensibility.
+| Feature | Why It Works |
+|---------|-------------|
+| **Image grid browser** | Renders thumbnails. Classification just needs a different overlay (badge instead of bbox). |
+| **t-SNE embedding scatter** | DINOv2 embeddings are computed from images, not annotations. Works identically. |
+| **Lasso filtering** | Selects by sample ID. Task-agnostic. |
+| **Find similar** | Qdrant similarity search uses image embeddings. Task-agnostic. |
+| **Near-duplicates** | Embedding distance. Task-agnostic. |
+| **Saved views** | Filter state persistence. Task-agnostic. |
+| **Tags / triage workflow** | Sample-level operations. Task-agnostic. |
+| **Keyboard shortcuts** | Sample navigation. Task-agnostic. |
+| **Split filtering** | Filters by split field. Task-agnostic. |
+| **Search by filename** | Text search. Task-agnostic. |
+| **VLM auto-tagging** | Uses image content, not annotations. Task-agnostic. |
+| **AI agent analysis** | Operates on statistics and error data. Needs updated prompts for classification context, but the architecture is the same. |
 
 ---
 
-### 7D. 3D Visualization (Point Clouds, Meshes)
-
-**What competitors do:**
-
-FiftyOne (since v0.17/0.24): 3D point cloud visualization, 3D bounding boxes, 3D polylines, mesh rendering, orthographic projection in grid view, dedicated 3D visualizer with configurable lights and materials.
-
-Encord (2025): LiDAR point cloud support (.pcd, .ply, .las, .laz, .mcap), sensor fusion visualization.
-
-**What DataVisor has:** 2D images only.
-
-**Gap:** Only relevant for 3D CV datasets (autonomous driving, robotics).
-
-**Priority:** OUT OF SCOPE per PROJECT.md. "3D point cloud visualization -- different rendering pipeline entirely."
-
-**Recommendation:** Defer indefinitely per project constraints.
-
----
-
-### 7E. Video Support
-
-**What competitors do:**
-
-FiftyOne: Video datasets with frame-by-frame browsing, temporal detection, playback controls (spacebar play/pause, `<`/`>` frame navigation, `0-9` seek).
-
-Encord: Full video annotation with keyframe interpolation, object tracking, temporal ranges.
-
-**What DataVisor has:** Image-only.
-
-**Gap:** Out of scope per PROJECT.md.
-
-**Priority:** OUT OF SCOPE. "Video annotation support -- image-only for now."
-
-**Recommendation:** Defer per project constraints.
-
----
-
-## 8. Data Operations
-
-### 8A. View Expressions / Advanced Filtering
-
-**What competitors do:**
-
-FiftyOne provides a rich Python API for dataset views:
-```python
-from fiftyone import ViewField as F
-
-# Chain view stages
-view = (
-    dataset
-    .match_tags("validation")
-    .match(F("metadata.size_bytes") >= 48 * 1024)
-    .filter_labels("predictions", F("confidence") > 0.8)
-    .sort_by("filepath")
-    .limit(100)
-)
-```
-
-View stages include: `match()`, `filter_labels()`, `filter_field()`, `exists()`, `select()`, `exclude()`, `select_fields()`, `exclude_fields()`, `sort_by()`, `limit()`, `skip()`, `take()`, `shuffle()`, `match_tags()`, plus array operations (`.length()`, `.filter()`, `.map()`).
-
-Saved views store the filter rules, not the data -- storage efficient.
-
-**What DataVisor has:** Sidebar metadata filtering (dynamic on any field), search by filename, sort by metadata, saved views. No programmatic view API.
-
-**Gap:** DataVisor's UI-based filtering covers the common cases. The gap is the lack of a programmatic API for complex multi-stage filter chains. This matters for power users who want reproducible, scriptable data exploration.
-
-**Priority:** NICE-TO-HAVE for v1.1. The UI-based filtering covers 90% of use cases. A Python API is a v2 feature.
-
-**Complexity:** HIGH for a full view expression system. LOW for extending the existing filter system.
-
-**Depends on:** Sidebar filtering (existing). DuckDB (existing -- already supports complex SQL).
-
-**Recommendation:** Defer the Python view API. For v1.1, extend the sidebar to support: (a) filter by annotation count, (b) filter by prediction confidence range, (c) filter by error type. These cover the most common advanced filtering needs without a programmatic API.
-
----
-
-### 8B. Computed / Derived Fields
-
-**What competitors do:**
-
-FiftyOne allows adding computed fields:
-```python
-dataset.add_sample_field("num_objects", fo.IntField)
-dataset.set_values("num_objects", [len(s.ground_truth.detections) for s in dataset])
-```
-And ViewExpressions for on-the-fly computation:
-```python
-view = dataset.set_field("quality", F("mistakenness") + F("hardness"))
-```
-
-**What DataVisor has:** Metadata fields from ingestion. No user-defined computed fields.
-
-**Gap:** Computed fields are useful for combining multiple metrics into composite scores (like the quality score from 3D).
-
-**Priority:** NICE-TO-HAVE for v1.1. Can be implemented server-side with DuckDB computed columns.
-
-**Complexity:** LOW for server-side computed fields in DuckDB. MEDIUM for exposing in UI.
-
-**Depends on:** DuckDB (existing).
-
-**Recommendation:** Implement the quality score (3D) as a computed field in DuckDB. Do not build a general user-defined field system for v1.1 -- just pre-compute the fields DataVisor needs.
-
----
-
-## Feature Priority Summary
-
-### Must Build for v1.1 (Table Stakes + High-Value Differentiators)
-
-| # | Feature | Priority | Complexity | Section |
-|---|---------|----------|------------|---------|
-| 1 | YOLO + VOC format import | Table Stakes | Medium | 1A |
-| 2 | Train/val/test split handling | Table Stakes | Medium | 1B |
-| 3 | Smart folder detection UI | Differentiator | Medium | 1C |
-| 4 | Dataset export (COCO, YOLO) | Table Stakes | Medium | 1E |
-| 5 | Bbox editing (move/resize/delete) | Table Stakes | High | 2A |
-| 6 | Interactive confusion matrix + click-to-filter | Table Stakes | High | 3A |
-| 7 | Near-duplicate detection | Table Stakes | Low | 3B |
-| 8 | Image quality metrics (brightness, sharpness) | Table Stakes | Low | 3B |
-| 9 | Error triage mode (keyboard review workflow) | Differentiator | Medium | 3C |
-| 10 | Worst images composite ranking | Differentiator | Medium | 3D |
-| 11 | Docker deployment | Table Stakes | Medium | 4A |
-| 12 | Basic auth | Table Stakes | Low | 4B |
-| 13 | Deployment scripts (local + GCP) | Table Stakes | Low-Medium | 4C |
-| 14 | Keyboard shortcuts (Tier 1) | Table Stakes | Medium | 5A |
-| 15 | "Find Similar" UI button | Table Stakes | Low | 7B |
-| 16 | Interactive histograms (click-to-filter) | Differentiator | Medium | 6B |
-
-### Defer to v1.2+
-
-| # | Feature | Why Defer | Section |
-|---|---------|-----------|---------|
-| 17 | Create new annotations | Quick corrections (edit/delete) are sufficient for v1.1 | 2B |
-| 18 | CVAT/Label Studio integration | Export to COCO format achieves same goal | 2C |
-| 19 | PR curves + per-class AP | Confusion matrix is the priority; curves follow naturally | 3A |
-| 20 | Mistakenness / hardness scoring | Requires model logits import schema | 3B |
-| 21 | Custom workspaces | Current layout works; panels are a large refactor | 6A |
-| 22 | Customizable hotkeys | Fixed defaults are sufficient | 5B |
-| 23 | Model zoo / in-app inference | Import predictions workflow is pragmatic | 7A |
-| 24 | View expression Python API | UI filtering covers 90% of use cases | 8A |
-| 25 | Demo / quickstart dataset | Low effort but not core to v1.1 delivery | 1D |
-
-### Explicitly Out of Scope
-
-| Feature | Reason | Section |
-|---------|--------|---------|
-| 3D point cloud visualization | Different rendering pipeline, per PROJECT.md | 7D |
-| Video support | Image-only, per PROJECT.md | 7E |
-| Map / geolocation panel | No current need for geo datasets | 6C |
-| Multi-user auth / RBAC | Personal tool, per PROJECT.md | 4B |
-| Plugin system overhaul | Existing BasePlugin is sufficient | 7C |
-
----
-
-## Feature Dependencies (v1.1 Build Order)
+## Sources
 
-```
-[Docker + Auth + Deploy Scripts]  (parallel with everything)
-     |
-     v
-[YOLO + VOC Parsers] ──> [Smart Folder Detection UI] ──> [Split Handling]
-     |
-     v
-[Dataset Export]  (requires format writers from parsers)
-     |
-     v
-[Image Quality Metrics] ──> [Near-Duplicate Detection] ──> [Composite Score]
-     |                                                          |
-     v                                                          v
-[Bbox Editing in Modal] ──> [Keyboard Shortcuts] ──> [Error Triage Mode]
-     |                                                          |
-     v                                                          v
-[Interactive Confusion Matrix] ──────────────────────> [Click-to-Filter]
-     |
-     v
-[Interactive Histograms]
-     |
-     v
-["Find Similar" Button]  (uses existing Qdrant infrastructure)
-```
+### FiftyOne (HIGH confidence -- official documentation)
+- [FiftyOne Classification Evaluation API](https://docs.voxel51.com/api/fiftyone.utils.eval.classification.html)
+- [FiftyOne Evaluating Models](https://docs.voxel51.com/user_guide/evaluation.html)
+- [FiftyOne Evaluate Classifications Tutorial](https://docs.voxel51.com/tutorials/evaluate_classifications.html)
+- [FiftyOne Drawing Labels](https://docs.voxel51.com/user_guide/draw_labels.html)
 
-**Critical path:** Docker/Auth and Format Parsers can start simultaneously. Most features build on existing infrastructure (DuckDB, Qdrant, Zustand stores). The confusion matrix and triage mode are the two highest-complexity features and should be prioritized early in development.
+### Cleanlab (HIGH confidence -- official documentation)
+- [Cleanlab Image Classification Tutorial](https://docs.cleanlab.ai/master/tutorials/image.html)
+- [Cleanlab Datalab Image Issues](https://docs.cleanlab.ai/master/tutorials/datalab/image.html)
+- [Cleanlab GitHub](https://github.com/cleanlab/cleanlab)
 
----
+### Roboflow (MEDIUM confidence -- product documentation)
+- [Roboflow Classification Label Visualization](https://docs.roboflow.com/workflow-blocks/visualize-predictions/classification-label-visualization)
 
-## Sources
+### Classification Metrics (HIGH confidence -- authoritative references)
+- [Google ML Classification Metrics](https://developers.google.com/machine-learning/crash-course/classification/accuracy-precision-recall)
+- [Evidently AI Multi-class Metrics](https://www.evidentlyai.com/classification-metrics/multi-class-metrics)
+- [scikit-learn confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)
 
-### FiftyOne (HIGH confidence -- official documentation)
-- [FiftyOne Import Datasets (v1.12.0)](https://docs.voxel51.com/user_guide/import_datasets.html)
-- [FiftyOne Export Datasets (v1.11.1)](https://docs.voxel51.com/user_guide/export_datasets.html)
-- [FiftyOne Using Datasets (v1.12.0)](https://docs.voxel51.com/user_guide/using_datasets.html)
-- [FiftyOne Dataset Views (v1.12.0)](https://docs.voxel51.com/user_guide/using_views.html)
-- [FiftyOne App (v1.12.0)](https://docs.voxel51.com/user_guide/app.html)
-- [FiftyOne Evaluation (v1.11.1)](https://docs.voxel51.com/user_guide/evaluation.html)
-- [FiftyOne Brain](https://docs.voxel51.com/brain.html)
-- [FiftyOne Annotation (v1.11.0)](https://docs.voxel51.com/user_guide/annotation.html)
-- [FiftyOne Environments](https://docs.voxel51.com/installation/environments.html)
-- [FiftyOne Model Zoo (v1.11.1)](https://docs.voxel51.com/model_zoo/index.html)
-- [FiftyOne Dataset Zoo (v1.11.1)](https://docs.voxel51.com/dataset_zoo/datasets.html)
-- [FiftyOne Plugins Development (v1.11.1)](https://docs.voxel51.com/plugins/developing_plugins.html)
-- [FiftyOne Interactive Plots (v1.12.0)](https://docs.voxel51.com/user_guide/plots.html)
-- [FiftyOne Enterprise Helm Chart](https://helm.fiftyone.ai/)
-- [FiftyOne Teams Deployment (GitHub)](https://github.com/voxel51/fiftyone-teams-app-deploy)
-
-### FiftyOne (MEDIUM confidence -- blog posts, GitHub issues)
-- [FiftyOne v0.24 Announcement (3D, Workspaces)](https://voxel51.com/blog/announcing-fiftyone-0-24-with-3d-meshes-and-custom-workspaces)
-- [FiftyOne v0.25 Announcement (Python Panels, SAM 2)](https://voxel51.com/blog/announcing-fiftyone-0-25)
-- [FiftyOne GitHub Issue #2120 (Selection shortcut FR)](https://github.com/voxel51/fiftyone/issues/2120)
-- [FiftyOne GitHub Issue #1761 (Hide labels shortcut FR)](https://github.com/voxel51/fiftyone/issues/1761)
-- [FiftyOne GitHub Issue #1780 (from_dir failure bug)](https://github.com/voxel51/fiftyone/issues/1780)
-- [FiftyOne GitHub Issue #1781 (VOC same-directory bug)](https://github.com/voxel51/fiftyone/issues/1781)
-- [FiftyOne Model Evaluation Blog](https://voxel51.com/blog/unified-model-insights-with-fiftyone-model-evaluation-workflows)
-
-### Encord (HIGH confidence -- official documentation)
-- [Encord Getting Started](https://docs.encord.com/platform-documentation/GettingStarted/gettingstarted-welcome)
-- [Encord Annotate Overview](https://docs.encord.com/platform-documentation/Annotate/annotate-overview)
-- [Encord Label Editor](https://docs.encord.com/platform-documentation/Annotate/annotate-label-editor)
-- [Encord Editor Shortcuts](https://docs.encord.com/platform-documentation/Annotate/annotate-label-editor/annotate-label-editor-settings-shortcuts)
-- [Encord Active Overview](https://docs.encord.com/platform-documentation/Active/active-overview)
-- [Encord Active Issue Shortcuts](https://docs.encord.com/platform-documentation/Active/active-basics/active-issue-shortcuts-prediction-types)
-- [Encord Active Model Quality Metrics](https://docs.encord.com/platform-documentation/Active/active-quality-metrics/active-model-quality-metrics)
-- [Encord 2025 Release Notes](https://docs.encord.com/release-notes/releasenotes-2025)
-
-### Encord (MEDIUM confidence -- marketing/blog)
-- [Encord Product Updates Feb 2025](https://encord.com/blog/encord-product-updates-february-2025/)
-- [Encord Data Quality Metrics Blog](https://encord.com/blog/data-quality-metrics/)
-- [Encord Annotate Product Page](https://encord.com/annotate/)
+### Label Studio (MEDIUM confidence -- official documentation)
+- [Label Studio Image Classification Template](https://labelstud.io/templates/image_classification)
 
 ---
-*Competitive feature analysis for: DataVisor v1.1 vs FiftyOne (Voxel51) + Encord*
-*Researched: 2026-02-12*
+*Classification feature landscape for: DataVisor classification support milestone*
+*Researched: 2026-02-18*
diff --git a/.planning/research/PITFALLS.md b/.planning/research/PITFALLS.md
index 34ea13f..57a27d6 100644
--- a/.planning/research/PITFALLS.md
+++ b/.planning/research/PITFALLS.md
@@ -1,767 +1,392 @@
-# Domain Pitfalls: DataVisor v1.1
+# Domain Pitfalls
 
-**Domain:** Adding Docker deployment, auth, annotation editing, smart ingestion, and error triage to an existing FastAPI + DuckDB + Next.js CV dataset introspection tool
-**Researched:** 2026-02-12
-**Scope:** Pitfalls specific to v1.1 features on the existing v1.0 codebase (12,720 LOC, 59 tests)
-**Overall confidence:** MEDIUM-HIGH
+**Domain:** Adding single-label classification support to an existing detection-focused CV dataset tool
+**Researched:** 2026-02-18
+**Confidence:** HIGH (all findings grounded in actual codebase analysis)
 
 ---
 
 ## Critical Pitfalls
 
-Mistakes that cause rewrites, data loss, or deployment failures.
+Mistakes that cause rewrites, data corruption, or broken existing workflows.
 
-### Pitfall 1: DuckDB WAL and Lock Files Not Surviving Docker Container Restarts
+### Pitfall 1: Schema Pollution -- Nullable BBox Columns Infect Every Query
 
-**Severity:** CRITICAL
-**Affects:** Docker containerization, data persistence
+**What goes wrong:** The `annotations` table has `bbox_x`, `bbox_y`, `bbox_w`, `bbox_h` as `DOUBLE NOT NULL`. Classification annotations have no bounding boxes. The naive fix is making these columns nullable or stuffing sentinel values (0,0,0,0), but then every existing query that touches bbox columns -- `_load_detections()`, `_compute_iou_matrix()`, `AnnotationOverlay`, `EditableRect`, area calculations, `AnnotationUpdate` -- must guard against null/sentinel bboxes. Miss one query and you get silent wrong results or crashes.
 
-**What goes wrong:**
-DuckDB creates three filesystem artifacts alongside the database file: `datavisor.duckdb`, `datavisor.duckdb.wal` (write-ahead log), and a `datavisor.duckdb.tmp/` directory for intermediate processing. The WAL file is deleted on clean shutdown but persists if the container is killed (SIGKILL from `docker stop` after the 10s grace period, OOM kill, or crash). On next container start, DuckDB replays the WAL to recover uncommitted data. If the WAL file is missing (because the volume mount was only for the `.duckdb` file, not the directory), data loss occurs silently -- DuckDB opens without error but the last transactions are gone.
+**Why it happens:** The annotations table was designed as a detection-first schema. Every column assumes spatial data exists. The `area` column is computed as `bbox_w * bbox_h`. The `AnnotationCreate` model requires all four bbox fields. The `AnnotationUpdate` model only has bbox fields -- it literally cannot update a classification annotation.
 
-The existing `DuckDBRepo.__init__` in `app/repositories/duckdb_repo.py` creates the parent directory via `db_path.parent.mkdir(parents=True, exist_ok=True)` and connects to a file at `data/datavisor.duckdb` (from `config.py`). In Docker, this `data/` directory must be a volume mount, not just the `.duckdb` file.
+**Concrete code locations affected:**
+- `duckdb_repo.py:57-72` -- annotations table DDL with `NOT NULL` bbox columns
+- `app/models/annotation.py:6-57` -- all three Pydantic models hardcode bbox fields
+- `app/routers/annotations.py:42` -- `area = body.bbox_w * body.bbox_h`
+- `app/services/evaluation.py:225` -- `_BoxRow` type alias includes bbox coordinates
+- `frontend/src/types/annotation.ts:6-19` -- `Annotation` interface requires bbox fields
+- `frontend/src/components/grid/annotation-overlay.tsx:63-72` -- renders `<rect>` from bbox
 
-**Why it happens:**
-Developers volume-mount only the database file (`-v ./data/datavisor.duckdb:/app/data/datavisor.duckdb`) instead of the entire directory. The WAL and tmp files are created as siblings on the container filesystem (ephemeral layer) and vanish when the container restarts. DuckDB's official documentation states: "If DuckDB exits normally, the WAL file is deleted upon exit. If DuckDB crashes, the WAL file is required to recover data."
+**Consequences:**
+- Classification annotations with NULL bboxes break `NOT NULL` constraints on insert
+- Sentinel values (0,0,0,0) produce zero-area rectangles in SVG overlays and zero-area entries in stats
+- Every SQL query selecting `bbox_*` columns returns meaningless data for classification
+- IoU computation on zero-sized boxes produces NaN or 0, silently breaking evaluation
 
-Additionally, Docker's default stop signal is SIGTERM with a 10-second timeout before SIGKILL. If FastAPI's shutdown handler (the `lifespan` context manager's cleanup in `app/main.py`) takes longer than 10 seconds -- possible during a large ingestion with thumbnail generation -- the container is killed before `db.close()` runs, leaving the WAL behind.
+**Prevention:** Add a `task_type` discriminator column to the `datasets` table (not `annotations`). Classification datasets never create meaningful bbox data. Use a separate code path for classification annotations that maps to a simpler schema view. Step 1 applies in either case; options 2 and 3 are alternative storage strategies:
+1. Add `task_type VARCHAR DEFAULT 'detection'` to `datasets` table
+2. For classification, the `annotations` table still has bbox columns, but they store 0.0 (not NULL) to satisfy the `NOT NULL` constraint, and a `task_type`-aware query layer skips them
+3. Better: create a `classifications` table with just `(id, dataset_id, sample_id, category_name, source, confidence, metadata)` -- one row per image, no bbox columns at all. This is cleaner but requires more code changes.
 
-**Prevention:**
-1. Volume-mount the entire `data/` directory, never individual files: `volumes: ["./data:/app/data"]`
-2. Add a `STOPSIGNAL SIGTERM` to the Dockerfile and set `stop_grace_period: 30s` in docker-compose.yml to give the lifespan handler time to close DuckDB cleanly
-3. Add an explicit `CHECKPOINT` call in the lifespan shutdown before `db.close()` to flush the WAL to the database file: `self.connection.execute("CHECKPOINT")`
-4. Ensure the container user has write permission to the entire mounted directory, not just the `.duckdb` file
-5. Set `checkpoint_threshold` via `PRAGMA checkpoint_threshold='8MB'` to checkpoint more frequently (default is 16MB), reducing WAL size and recovery window
-
-**Warning signs:**
-- Data disappears after `docker-compose restart` but not after `docker-compose down && docker-compose up`
-- A `.wal` file appears in the data directory after `docker stop` but is missing after `docker start`
-- `docker logs` shows DuckDB opening successfully but with fewer rows than expected
+**Recommendation:** Separate `classifications` table. The bbox columns are not "optional detection data" -- they are structurally meaningless for classification. Trying to reuse the annotations table forces every consumer to handle two shapes of data from one table. A separate table with shared query interfaces (via a service abstraction) is cleaner.
 
-**Phase to address:** Docker containerization (Phase 1 of v1.1)
+**Detection:** If you go the shared-table route, grep for `bbox_` across the codebase -- every hit is a location that needs a conditional. Currently 30+ references.
 
-**Confidence:** HIGH -- verified against DuckDB official documentation on [files created by DuckDB](https://duckdb.org/docs/stable/operations_manual/footprint_of_duckdb/files_created_by_duckdb) and [WAL recovery behavior](https://duckdb.org/docs/stable/connect/concurrency). WAL lock file issue confirmed in [DuckDB Issue #10002](https://github.com/duckdb/duckdb/issues/10002).
+**Phase to address:** Phase 1 (schema design). Get this wrong and everything downstream is a rewrite.
 
 ---
 
-### Pitfall 2: Qdrant Local Mode Cannot Run in Docker -- Must Migrate to Server Mode
+### Pitfall 2: Metric Confusion -- mAP/IoU Leaking into Classification Evaluation
 
-**Severity:** CRITICAL
-**Affects:** Docker containerization, Qdrant integration
+**What goes wrong:** The entire evaluation pipeline is built on IoU matching. `compute_evaluation()` uses `supervision.MeanAveragePrecision` and `supervision.ConfusionMatrix.from_detections()` which expect `sv.Detections` objects with `xyxy` bounding boxes. Classification evaluation needs accuracy, precision, recall, F1, and per-class metrics computed by exact label matching (no spatial component). If you try to reuse the detection evaluation with dummy bboxes, you get meaningless mAP scores.
 
-**What goes wrong:**
-The current codebase uses Qdrant in **local embedded mode**: `QdrantClient(path=str(path))` in `app/services/similarity_service.py`. This runs Qdrant as an in-process Python library with on-disk persistence at `data/qdrant/`. In Docker, you need Qdrant as a separate container service (server mode) because: (a) the embedded Qdrant client does not support concurrent access, which matters when multiple uvicorn workers run, (b) it adds ~500MB to the FastAPI container image, and (c) Qdrant's Docker image (`qdrant/qdrant`) is the canonical deployment path and provides proper health checks, metrics, and persistence.
+**Why it happens:** The evaluation service (`app/services/evaluation.py`) is 560 lines of detection-specific logic: IoU matrix computation, greedy matching, COCO-style interpolated AP. The API response model (`EvaluationResponse`) returns `map50`, `map75`, `map50_95`, `iou_threshold` -- all detection-specific fields. The frontend `evaluation-panel.tsx` renders PR curves and the confusion matrix with IoU/confidence sliders.
 
-Switching from `QdrantClient(path=...)` to `QdrantClient(host="qdrant", port=6333)` is a one-line code change, but the data migration is not. The local-mode on-disk format is not compatible with the server-mode storage. All existing embeddings synced to Qdrant must be re-synced from DuckDB after the migration.
+**Concrete code locations affected:**
+- `app/services/evaluation.py` -- entire file assumes detection
+- `app/services/error_analysis.py` -- `categorize_errors()` uses IoU matching
+- `app/services/annotation_matching.py` -- `match_sample_annotations()` is IoU-based
+- `app/models/evaluation.py` -- `APMetrics` has mAP fields, `EvaluationResponse` has `iou_threshold`
+- `frontend/src/types/evaluation.ts` -- TypeScript mirrors backend detection-specific types
+- `frontend/src/components/stats/evaluation-panel.tsx` -- IoU slider, PR curves
+- `frontend/src/components/stats/metrics-cards.tsx` -- likely shows mAP
 
-**Why it happens:**
-Local mode is the recommended development path ("useful for development, prototyping and testing") and the existing code was designed for single-process local execution. Developers assume the migration is just changing the constructor, but forget about: (a) data format incompatibility, (b) network connectivity in docker-compose, (c) the need for an API key for security, and (d) health check dependencies (FastAPI should wait for Qdrant to be healthy before starting).
+**Consequences:**
+- Showing mAP for a classification dataset is nonsensical and misleading
+- IoU slider has no meaning -- users will be confused
+- PR curves per class are meaningful for classification but computed differently (no spatial matching)
+- Error analysis categories (Hard FP, Label Error based on IoU) do not apply
 
-**Prevention:**
-1. In `docker-compose.yml`, add Qdrant as a service with a volume for persistence:
-   ```yaml
-   qdrant:
-     image: qdrant/qdrant:latest
-     volumes: ["./data/qdrant_server:/qdrant/storage"]
-     ports: ["6333:6333"]
-   ```
-2. Update `SimilarityService.__init__` to accept either `path` (local) or `url` (server) based on environment:
-   ```python
-   if qdrant_url:
-       self.client = QdrantClient(url=qdrant_url)
-   else:
-       self.client = QdrantClient(path=str(path))
-   ```
-3. Add `DATAVISOR_QDRANT_URL` environment variable to `config.py` Settings class (default None for local dev)
-4. Add a `depends_on` with health check in docker-compose so FastAPI waits for Qdrant:
-   ```yaml
-   depends_on:
-     qdrant:
-       condition: service_healthy
-   ```
-5. On first Docker startup, the `ensure_collection` + `_sync_from_duckdb` flow in `SimilarityService` already handles syncing -- but verify it works when the collection is empty in a fresh Qdrant server
-
-**Warning signs:**
-- `ConnectionRefusedError` on FastAPI startup because Qdrant container is not yet ready
-- Similarity search returns empty results in Docker but works locally
-- FastAPI container image is 8GB+ because it bundles the Qdrant Rust binaries via qdrant-client's local mode
-
-**Phase to address:** Docker containerization (Phase 1 of v1.1)
-
-**Confidence:** HIGH -- verified against [Qdrant quickstart docs](https://qdrant.tech/documentation/quickstart/) and [qdrant-client README](https://github.com/qdrant/qdrant-client) which explicitly states "If you require concurrent access to local mode, you should use Qdrant server instead."
+**Prevention:** Build a separate `compute_classification_evaluation()` function and a `ClassificationEvaluationResponse` model. Route based on `dataset.task_type`. Classification evaluation is actually simpler: compare `gt_category` to `pred_category` per sample. Metrics: accuracy, macro/micro precision/recall/F1, per-class precision/recall/F1, confusion matrix (still works, but simpler -- no "background" row/column from unmatched detections).
 
----
+**Detection:** If the evaluation endpoint returns `iou_threshold` for a classification dataset, something went wrong.
 
-### Pitfall 3: NEXT_PUBLIC_API_URL Baked at Build Time, Not Configurable at Runtime
+**Phase to address:** Phase 2 (evaluation logic). Must come after schema but before UI work.
 
-**Severity:** CRITICAL
-**Affects:** Docker containerization, deployment flexibility
+---
 
-**What goes wrong:**
-The frontend's API base URL is set in `frontend/src/lib/constants.ts`:
+### Pitfall 3: UI Conditional Spaghetti -- `if detection else classification` Everywhere
+
+**What goes wrong:** Instead of polymorphic components, developers scatter `if (taskType === 'detection')` checks throughout the frontend. Components like `AnnotationOverlay`, `SampleModal`, `EvaluationPanel`, `ErrorAnalysisPanel`, `TriageOverlay`, `FilterSidebar`, `StatsDashboard` all need different rendering for classification vs detection. With 10+ components each having 2-3 conditionals, you get 30+ branching points that are easy to miss and hard to test.
+
+**Why it happens:** The fastest way to add classification support is to add conditionals to existing components. Each one is small and "just one more if-statement." But they compound:
+- `AnnotationOverlay`: render bbox rect vs class label badge
+- `SampleModal`: bbox editor vs class label display
+- `EvaluationPanel`: IoU slider vs no IoU slider
+- `MetricsCards`: mAP vs accuracy
+- `ErrorAnalysisPanel`: spatial error types vs correct/incorrect
+- `PerClassTable`: AP columns vs precision/recall/F1 columns
+- `ConfusionMatrix`: background row vs no background row
+- `AnnotationList`: bbox coordinates vs class label
+- `DrawLayer`: bbox drawing vs class assignment
+- `TriageOverlay`: per-bbox triage vs per-image triage
+
+**Consequences:**
+- Adding a third task type (segmentation, keypoint) requires touching every component again
+- Testing suffers a combinatorial explosion: every component must be tested against every task type
+- Easy to miss one conditional, producing a detection UI for classification data
+- Code reviews become "did you check all 30 places?"
+
+**Prevention:** Use a strategy/adapter pattern at the component boundary. Create a `TaskAdapter` that provides task-specific sub-components:
 ```typescript
-export const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? "http://localhost:8000";
+// Instead of 30 if-statements:
+const adapter = useTaskAdapter(dataset.task_type);
+// adapter.AnnotationOverlay -- renders bboxes or class badges
+// adapter.EvaluationPanel -- detection or classification metrics
+// adapter.getMetricLabel() -- "mAP@50" or "Accuracy"
 ```
+Alternatively, create parallel component trees: `detection/EvaluationPanel` and `classification/EvaluationPanel` with shared layout components. The dataset page picks the right tree once.
 
-`NEXT_PUBLIC_` environment variables are **inlined into the JavaScript bundle at `next build` time**. They are string-replaced in the compiled JS -- there is no runtime resolution. If you build the Docker image with `NEXT_PUBLIC_API_URL=http://localhost:8000` (or leave it unset), the compiled JS will contain the literal string `"http://localhost:8000"`. When you deploy to a GCP VM at `http://35.202.x.x:8000`, the frontend still calls `localhost:8000`, which fails because the browser is on the user's machine, not the VM.
-
-**Why it happens:**
-Next.js explicitly documents this: "Public environment variables will be inlined into the JavaScript bundle during `next build`." Developers either: (a) hardcode the URL and rebuild per environment, (b) set it at build time and forget it cannot change, or (c) try to set it in `docker run -e` and discover it has no effect.
-
-**Prevention:**
-1. **Option A (simplest for this project):** Use a reverse proxy (nginx/caddy) that serves both frontend and API from the same origin, eliminating the need for a separate API URL. Frontend calls `/api/...` which the proxy routes to the FastAPI backend. No CORS issues, no URL configuration.
-2. **Option B:** Use Next.js `publicRuntimeConfig` with `getServerSideProps` to inject the API URL at request time. But this forces SSR for every page.
-3. **Option C:** Use the `next-runtime-env` library to read environment variables at runtime via a thin server-side injection.
-4. **Option D:** Pass the API URL via a `<script>` tag injected into `_document.tsx` at container startup (entrypoint script replaces a placeholder in the built HTML).
-
-**Recommendation:** Option A is strongly preferred. A single-origin setup via reverse proxy eliminates CORS entirely and makes basic auth work seamlessly (see Pitfall 4). The existing `allow_origins=["*"]` in `app/main.py` can then be tightened.
+**Detection:** Count `if.*detection` or `if.*classification` or `taskType` in the frontend. If > 10, you have spaghetti.
 
-**Warning signs:**
-- Frontend works in local dev but shows "Failed to fetch" errors when deployed to GCP VM
-- Browser console shows requests to `http://localhost:8000` even though the app is accessed via a public IP
-- Setting `NEXT_PUBLIC_API_URL` in `docker run -e` has no effect
-
-**Phase to address:** Docker containerization (Phase 1 of v1.1)
-
-**Confidence:** HIGH -- verified against [Next.js environment variables documentation](https://nextjs.org/docs/pages/guides/environment-variables) and [multiple GitHub discussions](https://github.com/vercel/next.js/discussions/17641) confirming this is a build-time-only mechanism.
+**Phase to address:** Phase 3 (UI). Design the abstraction before writing any UI code.
 
 ---
 
-### Pitfall 4: Basic Auth Over HTTP Sends Credentials in Cleartext
+### Pitfall 4: Breaking Existing Detection Workflows via Shared Schema Migration
 
-**Severity:** CRITICAL
-**Affects:** Authentication, GCP VM deployment
+**What goes wrong:** A schema migration that alters the `annotations` table (making bbox columns nullable, adding columns, changing types) breaks existing detection datasets. DuckDB does not support transactional DDL in the same way PostgreSQL does. If the migration fails midway, you can end up with a partially altered schema.
 
-**What goes wrong:**
-HTTP Basic Authentication encodes credentials as `base64(username:password)` in the `Authorization` header. Base64 is encoding, not encryption. Without HTTPS, every request sends the password in cleartext over the network. On a GCP VM accessed over the public internet, anyone on the network path (ISP, coffee shop WiFi, GCP internal routing) can intercept the credentials. This is not a theoretical risk -- it is trivially exploitable with tools like Wireshark or `tcpdump`.
+**Why it happens:** The temptation to "just" ALTER TABLE is strong. The current schema uses `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` for backward-compatible additions (see `duckdb_repo.py:84-103`). But making `bbox_x DOUBLE NOT NULL` into `bbox_x DOUBLE` (nullable) is a destructive change that affects all existing data.
 
-Additionally, the existing SSE streams (ingestion progress, embedding progress, VLM progress) use the browser's native `EventSource` API, which **cannot set custom HTTP headers**. The `EventSource` constructor only supports `withCredentials: true` (for cookies) -- not `Authorization` headers. This means SSE endpoints either: (a) must use cookie-based auth instead of header-based auth, (b) must accept a token in the URL query string, or (c) must use a polyfill like `event-source-plus` or `@microsoft/fetch-event-source` that uses `fetch` under the hood.
+**Concrete risk:** DuckDB's `ALTER TABLE ... ALTER COLUMN` support is limited. Changing NOT NULL to nullable may require recreating the table. If classification data gets inserted into the same table with sentinel bbox values, existing queries that do `WHERE bbox_w > 0` or area-based filtering will include/exclude rows incorrectly.
 
-**Why it happens:**
-"Single-user basic auth" sounds simple, but the interaction between HTTP Basic Auth, HTTPS requirements, SSE limitations, and CORS creates a surprisingly complex surface. Developers implement basic auth on the API, test in the browser (which shows a native auth dialog), confirm it works, then deploy to HTTP and do not realize the credentials are exposed. The SSE issue is discovered only when progress streams break after adding auth middleware.
+**Consequences:**
+- Existing detection datasets produce different query results after migration
+- Users who re-open DataVisor after update find their detection workflows broken
+- Area-based filters (if any exist in saved views) return wrong results
+- No rollback path if migration corrupts data
 
 **Prevention:**
-1. **HTTPS is mandatory.** Use one of:
-   - Caddy as a reverse proxy (automatic HTTPS via Let's Encrypt, zero configuration for a domain)
-   - nginx with certbot
-   - GCP load balancer with managed SSL certificate (overkill for single-user)
-2. **For SSE + auth:** Use a session cookie set by a login endpoint rather than per-request Basic Auth headers. The flow: POST `/auth/login` with credentials -> server sets `HttpOnly, Secure, SameSite=Strict` cookie -> all subsequent requests (including `EventSource`) include the cookie automatically
-3. **If using reverse proxy (recommended from Pitfall 3):** Caddy/nginx can handle basic auth at the proxy layer, before requests reach FastAPI. This means zero auth code in FastAPI and SSE streams work without modification.
-4. **Never put credentials in URL query strings** -- they end up in server logs, browser history, and proxy logs
-
-**Warning signs:**
-- SSE streams break after adding `Depends(verify_auth)` to endpoints because `EventSource` does not send the `Authorization` header
-- Browser shows the native Basic Auth dialog on every page load (no session persistence)
-- Penetration test flags "credentials transmitted over unencrypted channel"
+1. Never modify existing column constraints -- add new tables/columns only
+2. Separate `classifications` table means zero changes to existing `annotations` table
+3. If you must use the shared table, add a `task_type` column (default 'detection') and filter by it -- never change existing column nullability
+4. Run migration on a copy of the database first as a smoke test
+5. Bump a schema version in the `datasets.metadata` JSON so old clients can warn
 
-**Phase to address:** Docker containerization / deployment (Phase 1 of v1.1)
+**Detection:** After migration, run `SELECT COUNT(*) FROM annotations WHERE bbox_x IS NULL` -- the result should be 0, since no existing detection annotation may end up with a NULL bbox.
 
-**Confidence:** HIGH -- EventSource header limitation verified against [MDN EventSource docs](https://developer.mozilla.org/en-US/docs/Web/API/EventSource/withCredentials) and [WHATWG HTML spec issue #2177](https://github.com/whatwg/html/issues/2177).
+**Phase to address:** Phase 1 (schema). Must be designed correctly before any data enters the system.
 
 ---
 
-### Pitfall 5: SVG-to-Canvas Coordinate System Mismatch When Adding Interactive Annotation Editing
+### Pitfall 5: Confusion Matrix Scaling with 43+ Classes
 
-**Severity:** CRITICAL
-**Affects:** Annotation editing feature
+**What goes wrong:** The current confusion matrix renders as an HTML table with `min-w-[32px]` per cell. With 10 classes + background = 11x11 = 121 cells, this is manageable. With 43 jersey numbers + background = 44x44 = 1,936 cells, the matrix becomes an unreadable table at least 1,408px wide (44 * 32px) with tiny rotated labels that overlap. (Classification matrices have no background row/column, but 43x43 = 1,849 cells is just as unreadable.)
 
-**What goes wrong:**
-The current annotation overlay in `frontend/src/components/grid/annotation-overlay.tsx` uses SVG with a `viewBox` matching the original image dimensions (`viewBox="0 0 ${imageWidth} ${imageHeight}"`). Annotation coordinates are in **original pixel space** and the SVG `preserveAspectRatio` handles all scaling automatically. This is elegant and correct for read-only display.
+**Why it happens:** The `ConfusionMatrix` component was designed for ~10 classes. It uses:
+- Rotated column headers with `maxHeight: 80` -- with 43 labels, they crowd
+- `min-w-[32px]` cells -- fine at 10, but 44 * 32 = 1,408px minimum width
+- Row-normalized values displayed as `norm.toFixed(2)` -- most cells will be "0.00" noise
+- Click handler for each cell -- 1,936 click targets, most empty
 
-For interactive editing (move, resize, delete bounding boxes), you need to switch to react-konva (Canvas-based) because SVG does not have built-in drag handles, transform controls, or efficient hit testing. But react-konva's coordinate system works differently:
+**Concrete code:** `frontend/src/components/stats/confusion-matrix.tsx:26-138`
 
-1. **Konva uses Stage/Layer coordinates**, not viewBox. There is no equivalent of SVG's `preserveAspectRatio="xMidYMid meet"`. You must manually compute the scale factor between the displayed image size and the original image dimensions.
-2. **Konva's Transformer modifies `scaleX`/`scaleY`, not `width`/`height`**. After a resize, the shape's `width()` is unchanged but `scaleX()` is 2.0. If you save `width()` to the database without multiplying by `scaleX()`, the annotation silently shrinks back to its original size.
-3. **Zoom and pan change the coordinate space.** If the user zooms in on an image, pointer events return coordinates in the zoomed space. Converting back to original pixel space requires `stage.getPointerPosition()` -> divide by stage scale -> subtract stage offset. Getting this wrong means annotations drift from their intended positions when zoomed.
-4. **The current system stores absolute pixel coordinates** (`bbox_x`, `bbox_y`, `bbox_w`, `bbox_h` in `annotations` table). Mutations must write back in the same coordinate space, not in display-space or stage-space.
-
-**Why it happens:**
-SVG handles coordinate transforms transparently; Canvas does not. Developers who have only worked with SVG overlays underestimate the manual coordinate math required by Canvas. The Transformer tool's scale-vs-dimension behavior is a [well-documented source of confusion](https://longviewcoder.com/2022/04/28/what-the-hell-did-the-transformer-actually-do-to-my-shape/) in the Konva community.
+**Consequences:**
+- Matrix is wider than any screen, requires horizontal scrolling
+- Labels overlap and become unreadable
+- Most cells are zero/near-zero, making the meaningful cells hard to find
+- Performance degrades with 1,936 DOM nodes with event handlers
+- Row-normalization spreads probability mass so thin that all off-diagonal cells look identical
 
 **Prevention:**
-1. **Compute a single scale factor** when the image loads:
-   ```typescript
-   const scale = Math.min(
-     containerWidth / imageWidth,
-     containerHeight / imageHeight
-   );
-   ```
-   Store this in component state. All coordinate conversions go through it.
-2. **In `onTransformEnd`, always normalize scale back to 1:**
-   ```typescript
-   const node = shapeRef.current;
-   const sx = node.scaleX(), sy = node.scaleY();
-   node.scaleX(1); node.scaleY(1);
-   const newW = node.width() * sx;
-   const newH = node.height() * sy;
-   // Convert display coords to original pixel space
-   const bboxX = node.x() / scale;
-   const bboxW = newW / scale;
-   ```
-3. **Set `boundBoxFunc` on the Transformer** to prevent annotations from being dragged outside the image bounds
-4. **In `onDragEnd`, convert position back to pixel space** before persisting
-5. **Keep the SVG overlay for read-only contexts** (grid thumbnails, non-edit modal). Only use Konva in the edit modal. This limits the migration surface.
-6. **Write a `toPixelSpace(displayCoords, scale)` and `toDisplaySpace(pixelCoords, scale)` utility** and use it everywhere. Never do ad-hoc coordinate math.
-
-**Warning signs:**
-- Annotations appear in the correct position but after save-and-reload they are offset by a fixed amount
-- Annotations "jump" when the user starts dragging (because initial position was in wrong coordinate space)
-- Resizing an annotation and saving causes it to shrink or grow unexpectedly
-- Annotations drift when the user zooms in/out during editing
-
-**Phase to address:** Annotation editing (Phase 3 of v1.1)
-
-**Confidence:** HIGH -- Transformer scale behavior verified against [Konva official Transformer docs](https://konvajs.org/docs/react/Transformer.html) and [Konva Issue #830](https://github.com/konvajs/konva/issues/830) on coordinate changes with zoom. The [Konva Issue #1296](https://github.com/konvajs/konva/issues/1296) confirms bounding box calculation issues with stroke and scale.
-
----
-
-## Major Pitfalls
-
-Mistakes that cause significant rework, broken features, or deployment delays.
-
-### Pitfall 6: Docker Image Bloat from PyTorch + Transformers (8-12GB)
+1. Add a "top-K confused classes" view that only shows the K most confused pairs (e.g., top 20)
+2. Support class grouping/collapsing for hierarchical class sets
+3. Use a heatmap renderer (SVG or `<canvas>`) instead of an HTML table for large matrices
+4. Add a threshold filter: hide cells below a count threshold
+5. For 43+ classes, default to a "top confusions" bar chart instead of the full matrix
+6. Make the full matrix available as a downloadable CSV for detailed analysis
 
-**Severity:** MAJOR
-**Affects:** Docker containerization, deployment speed
+**Detection:** If `confusion_matrix_labels.length > 20`, switch to the compact view automatically.
 
-**What goes wrong:**
-The current `pyproject.toml` includes `torch>=2.10.0` and `transformers>=5.1.0` as direct dependencies. A naive `pip install` of these in a Docker image results in:
-- PyTorch with CUDA support: ~2.5GB
-- Transformers library: ~500MB
-- Combined with Python, DuckDB, Pillow, scikit-learn, etc.: **8-12GB total image**
-
-This makes `docker pull` take 10+ minutes on a GCP VM, `docker build` takes 20+ minutes, and disk usage on the VM is excessive.
-
-Python 3.14 adds a complication: as of the project's `requires-python = ">=3.14"`, the official `python:3.14-slim` images are available on Docker Hub, but some ML packages may not have pre-built wheels for 3.14 yet, forcing source compilation and further increasing build time.
-
-**Why it happens:**
-ML dependencies are massive. PyTorch bundles CUDA libraries by default even if you only need CPU inference. The `transformers` library pulls in many transitive dependencies. Developers build the image once, accept the size, and only discover the problem when CI/CD pipelines time out or GCP VM disk fills up.
-
-**Prevention:**
-1. **Use CPU-only PyTorch** for the Docker image unless GPU inference is needed in Docker:
-   ```dockerfile
-   RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
-   ```
-   This reduces PyTorch from ~2.5GB to ~200MB.
-2. **Multi-stage build:** Build dependencies in a `builder` stage, copy only site-packages and the app to a slim runtime stage:
-   ```dockerfile
-   FROM python:3.14-slim AS builder
-   RUN pip install --no-cache-dir --target=/deps ...
-   FROM python:3.14-slim AS runtime
-   COPY --from=builder /deps /usr/local/lib/python3.14/site-packages
-   ```
-3. **Use `--no-cache-dir` everywhere** to avoid pip cache bloating the image
-4. **Pin exact versions** to avoid pulling unnecessary updates during builds
-5. **For GPU support on GCP VMs:** Use NVIDIA Container Toolkit and mount the host GPU at runtime rather than bundling CUDA in the image
-6. **Consider MPS is NOT available in Docker on macOS** -- the DINOv2 embedding and Moondream2 VLM services will fall back to CPU. The `_detect_device()` function in `config.py` will return "cpu" in Docker.
-
-**Warning signs:**
-- `docker build` takes 30+ minutes
-- GCP VM disk fills up after a few image pulls
-- `docker push` to registry takes 20+ minutes
-
-**Phase to address:** Docker containerization (Phase 1 of v1.1)
-
-**Confidence:** HIGH -- verified against [PyTorch Docker optimization guide](https://mveg.es/posts/optimizing-pytorch-docker-images-cut-size-by-60percent/) and [Docker Hub Python 3.14 images](https://hub.docker.com/_/python).
+**Phase to address:** Phase 3 (UI). Can be done in parallel with evaluation logic since the component just needs to handle the existing data shape differently.
 
 ---
 
-### Pitfall 7: DuckDB Annotation Mutations Without Transactions Cause Inconsistent State
+## Moderate Pitfalls
 
-**Severity:** MAJOR
-**Affects:** Annotation editing, error triage workflow
+### Pitfall 6: Format Detection False Positives -- JSONL/CSV/Folder Misidentified or Rejected
 
-**What goes wrong:**
-The annotation editing feature will introduce **write mutations** to the `annotations` table (UPDATE for move/resize, DELETE for remove). The error triage workflow will also mutate data (adding tags, changing error classifications). The current codebase is **read-heavy with append-only writes** -- ingestion inserts in bulk, predictions insert in bulk, and all reads use cursors. There are no UPDATE operations anywhere in the existing code.
+**What goes wrong:** The `FolderScanner._is_coco_annotation()` checks whether a JSON file has an `"images"` top-level key. A classification dataset's JSONL file (one JSON object per line) will fail this check because a JSONL file is not a single valid JSON document -- it is a sequence of newline-delimited JSON objects. The scanner will report "Found JSON but not valid COCO" for `.jsonl` files, and `_BROWSE_EXTENSIONS` only shows `.json` files in the browser. Additionally, classification datasets commonly use a folder-based structure (class_name/image.jpg), which has no annotation file at all -- the scanner finds zero JSON files and returns no splits.
 
-Adding per-annotation UPDATEs introduces new failure modes:
-1. **No primary key enforcement.** The DuckDB schema in `duckdb_repo.py` explicitly avoids PRIMARY KEY constraints ("No PRIMARY KEY or FOREIGN KEY constraints are used -- this yields ~3.8x faster bulk inserts"). This means UPDATE must use composite WHERE clauses (`WHERE id = ? AND dataset_id = ?`), and there is no unique constraint to prevent duplicate annotation IDs.
-2. **Cursor-per-request writes may conflict.** The existing `get_cursor` dependency yields a cursor from the single connection. Two concurrent annotation edits from the same user (e.g., rapid-fire drag operations) create two cursors both attempting writes. DuckDB uses optimistic concurrency control -- the second write may fail with a transaction conflict if both touch the same row.
-3. **Annotation count denormalization.** The `datasets` table stores `annotation_count`. Deleting an annotation must update this counter. If the DELETE succeeds but the UPDATE to `datasets` fails (or the user's connection drops mid-request), the count drifts.
+**Why it happens:** The scanner was built exclusively for COCO format. It looks for `.json` files and validates their structure. Classification datasets commonly use:
+- JSONL: `{"image": "path.jpg", "label": "cat"}` per line
+- CSV: `image_path,label` columns
+- Folder structure: `class_name/image.jpg` (ImageNet-style)
+- JSON mapping: `{"images": {"path.jpg": "cat", ...}}`
 
-**Why it happens:**
-Append-only systems do not need transactions or unique constraints. The v1.0 architecture was correctly designed for its workload (bulk ingestion + reads). v1.1 changes the workload to include interactive single-row mutations, which is a fundamentally different access pattern.
+**Concrete code:**
+- `app/services/folder_scanner.py:467-488` -- `_is_coco_annotation()` only recognizes COCO structure
+- `app/routers/ingestion.py:111` -- `_BROWSE_EXTENSIONS = {".json"}` excludes `.jsonl`, `.csv`
+- `app/models/dataset.py:15` -- `format: str = "coco"` defaults to COCO
+- `app/services/folder_scanner.py:126` -- `ScanResult` hardcodes `format="coco"`
 
 **Prevention:**
-1. **Add annotation IDs as unique identifiers.** While not enforcing a PRIMARY KEY (to preserve bulk insert performance), verify annotation ID uniqueness in application code before UPDATE.
-2. **Wrap mutation operations in explicit transactions:**
-   ```python
-   cursor = db.connection.cursor()
-   try:
-       cursor.begin()
-       cursor.execute("UPDATE annotations SET bbox_x=?, bbox_y=?, bbox_w=?, bbox_h=? WHERE id=? AND dataset_id=?", [...])
-       cursor.execute("UPDATE datasets SET annotation_count = (SELECT COUNT(*) FROM annotations WHERE dataset_id=?) WHERE id=?", [...])
-       cursor.commit()
-   except Exception:
-       cursor.rollback()
-       raise
-   finally:
-       cursor.close()
-   ```
-3. **Debounce annotation mutations on the frontend.** Do not send a PATCH request on every mouse move during drag. Send one PATCH on `onDragEnd` / `onTransformEnd`.
-4. **Consider adding a `modified_at` timestamp column** to annotations for conflict detection (optimistic locking).
-5. **Recompute denormalized counts** from source tables rather than incrementing/decrementing (avoids drift).
-
-**Warning signs:**
-- Annotation count in the sidebar does not match the actual number of annotations after edits
-- Rapid annotation edits occasionally fail with "Transaction conflict" errors
-- Deleted annotations reappear after page refresh (DELETE executed on cursor but transaction not committed)
-
-**Phase to address:** Annotation editing (Phase 3 of v1.1)
-
-**Confidence:** HIGH -- verified against existing schema in `duckdb_repo.py` and DuckDB's [concurrency documentation](https://duckdb.org/docs/stable/connect/concurrency) on optimistic concurrency control.
+1. Add format-specific scanner methods: `_is_classification_jsonl()`, `_is_classification_csv()`, `_is_classification_folder()`
+2. Expand `_BROWSE_EXTENSIONS` to include `.jsonl`, `.csv`
+3. Add a `format` field to `DetectedSplit` that can be `"coco"`, `"classification_jsonl"`, `"classification_folder"`, etc.
+4. The scanner should return the detected format, not assume COCO
+5. For folder-based classification, detect by checking if immediate subdirectories contain images and no annotation files exist
 
----
+**Detection:** Try scanning a folder with a `.jsonl` classification file or an ImageNet-style folder -- if it returns 0 splits with a warning, the scanner needs updating.
 
-### Pitfall 8: Smart Folder Structure Detection Has Unbounded Edge Cases
+**Phase to address:** Phase 1 (ingestion). Must detect the format before importing.
 
-**Severity:** MAJOR
-**Affects:** Smart dataset ingestion UI
+---
 
-**What goes wrong:**
-The smart ingestion feature must auto-detect dataset folder structures. Real-world CV datasets use dozens of conventions:
+### Pitfall 7: Annotation Triage Assumes Spatial Matching -- Meaningless for Classification
 
-**Standard COCO:**
-```
-dataset/
-  annotations/
-    instances_train2017.json
-    instances_val2017.json
-  train2017/
-  val2017/
-```
+**What goes wrong:** The entire annotation triage system (`annotation_matching.py`, `annotation_triage.py`, `triage-overlay.tsx`) is built around per-bbox IoU matching. For classification, there is one "annotation" per image (the class label). The triage categories (TP, FP, FN, Label Error) based on spatial overlap make no sense. A classification prediction is simply "correct" or "incorrect" -- there is no spatial localization to evaluate.
 
-**Standard YOLO:**
-```
-dataset/
-  images/
-    train/
-    val/
-  labels/
-    train/
-    val/
-  data.yaml
-```
+**Why it happens:** The triage system was built in Phase 14, deep into the detection workflow. It assumes:
+- Multiple annotations per image (detection has many boxes per image)
+- IoU-based matching to pair predictions with GT
+- Per-annotation granularity (each box can be independently triaged)
 
-**Roboflow exports:**
-```
-dataset/
-  train/
-    images/
-    labels/
-  valid/
-    images/
-    labels/
-  test/
-    images/
-    labels/
-  data.yaml
-```
+For classification:
+- One annotation per image (the class label)
+- Matching is by sample_id only (no spatial component)
+- Triage is per-image, not per-annotation
 
-**FiftyOne exports, CVAT exports, custom layouts** all differ further. The detection heuristic must handle: (a) split names in folder names (`train`, `training`, `trn`, `val`, `valid`, `validation`, `test`, `testing`), (b) splits at different directory levels (top-level vs. inside images/), (c) missing splits (no test set), (d) annotation files at different levels, (e) symlinks to shared image directories, (f) datasets with NO split structure (flat single folder).
+**Concrete code:**
+- `app/services/annotation_matching.py:18-135` -- 100% IoU-based
+- `app/routers/annotation_triage.py:46` -- calls `match_sample_annotations()` which does IoU
+- `frontend/src/components/detail/triage-overlay.tsx` -- renders per-bbox triage badges
 
-The dangerous edge case: a dataset with folder names that coincidentally match split names (e.g., a `train/` directory that contains images of trains, not training data).
+**Prevention:**
+1. Classification triage is trivially computed: `gt_label == pred_label ? "correct" : "incorrect"`
+2. No need for IoU matching, confidence-ordered greedy assignment, or matched_id tracking
+3. Create a `match_classification_annotations()` that returns per-sample correct/incorrect
+4. The triage overlay for classification should show the predicted label and whether it matches GT, not per-bbox badges
 
-**Why it happens:**
-There is no standard. Every annotation tool exports differently. Every ML team has their own conventions. Developers implement detection for the 3 formats they have seen and discover the other 20 in user bug reports.
+**Detection:** If the triage endpoint is called with `iou_threshold` for a classification dataset, the routing is wrong.
 
-**Prevention:**
-1. **Detection is a suggestion, not an action.** Show the detected structure to the user and let them confirm/correct before ingestion. Never auto-ingest without confirmation.
-2. **Use a scoring/confidence system.** Score each candidate split detection by:
-   - Presence of known annotation files (`.json`, `.yaml`, `.xml`, `.txt`)
-   - Image file ratio (a real split directory has mostly images)
-   - Naming conventions (weighted: `train` > `trn`)
-   - Sibling directory patterns (if `train/` and `val/` exist as siblings, confidence is higher)
-3. **Support manual override.** If detection fails, let the user manually specify: "This folder is train, this folder is val, this file is the annotation file."
-4. **Start with COCO only (since v1.0 only parses COCO).** Detect COCO-style structures first. The existing `COCOParser` expects a single annotation JSON and a single image directory. Smart ingestion for v1.1 should: find `.json` files that look like COCO, find image directories, let the user map them.
-5. **Ignore symlinks on first pass.** Following symlinks can cause infinite loops and unexpected cross-filesystem traversal. Use `os.walk(followlinks=False)` or `Path.iterdir()` (which does not follow symlinks by default).
-6. **Set a directory scan depth limit** (e.g., max 3 levels deep) to avoid accidentally scanning a mounted filesystem root.
-
-**Warning signs:**
-- Auto-detection picks the wrong directory as "training images"
-- A dataset with subdirectories organized by class (ImageNet-style) is misinterpreted as split-based
-- User reports that "the ingestion imported my test images as training images"
-
-**Phase to address:** Smart ingestion (Phase 2 of v1.1)
-
-**Confidence:** MEDIUM -- based on analysis of common dataset formats from [YOLO dataset structure](https://github.com/ultralytics/ultralytics/blob/main/docs/en/datasets/detect/index.md) and COCO convention documentation. Edge cases are experiential knowledge.
+**Phase to address:** Phase 2 (evaluation/triage). Simpler than detection triage -- should be quick to implement.
 
 ---
 
-### Pitfall 9: GCP Firewall Rules Block All Ports by Default
+### Pitfall 8: Error Analysis Categories Don't Map to Classification
 
-**Severity:** MAJOR
-**Affects:** GCP VM deployment
+**What goes wrong:** The error analysis service (`error_analysis.py`) categorizes detections as: True Positive, Hard False Positive, Label Error, False Negative. These categories are detection-specific:
+- "Hard FP" means a prediction box that does not overlap any GT box -- no spatial equivalent in classification
+- "Label Error" means a prediction box that overlaps a GT box of a different class -- in classification, this is just "incorrect prediction"
+- "False Negative" means a GT box with no matching prediction -- in classification, this means no prediction was made for an image
 
-**What goes wrong:**
-GCP Compute Engine has a **default-deny inbound** firewall policy. When you create a VM and run `docker-compose up`, the services bind to their ports inside the VM, but no external traffic can reach them. The developer SSHs into the VM, runs `curl localhost:3000` and sees the frontend. They open `http://35.202.x.x:3000` in their browser -- connection timeout. They spend 30 minutes debugging Docker port mapping before realizing it is a GCP firewall issue.
+**Concrete code:** `app/services/error_analysis.py:30-208` and `app/models/error_analysis.py`
 
-Even after creating a firewall rule for port 3000 (frontend) and 8000 (API), developers forget: (a) Qdrant port 6333 should NOT be exposed publicly (internal only), (b) the DuckDB file is accessible via the API, so exposing port 8000 without auth is equivalent to exposing the database, (c) firewall rules apply to all VMs with the matching network tag -- accidentally broad rules expose other VMs.
+**Prevention:** Classification error analysis categories should be:
+- **Correct**: predicted class matches GT class
+- **Misclassified**: predicted class differs from GT class (with the confused pair noted)
+- **No prediction**: GT exists but no prediction (if applicable)
+- **Confident wrong**: high-confidence incorrect predictions (most actionable for model improvement)
 
-**Why it happens:**
-AWS opens ports 22/80/443 in common security groups. GCP's default is more restrictive -- only SSH (port 22), ICMP, and RDP (port 3389) are allowed by default via the `default-allow-ssh` and `default-allow-icmp` rules. Developers familiar with AWS muscle-memory expect ports to be open.
+Create a `categorize_classification_errors()` function that returns these categories.
 
-**Prevention:**
-1. **Deployment script must create firewall rules automatically:**
-   ```bash
-   gcloud compute firewall-rules create datavisor-web \
-     --allow tcp:80,tcp:443 \
-     --target-tags datavisor \
-     --source-ranges 0.0.0.0/0
-   ```
-2. **Use a reverse proxy (Caddy/nginx) on port 80/443 only.** Never expose port 8000 (FastAPI) or 3000 (Next.js dev) directly. All traffic goes through the proxy.
-3. **Do NOT expose Qdrant port 6333 externally.** It should only be accessible within the docker-compose network. In docker-compose.yml, do not publish the port:
-   ```yaml
-   qdrant:
-     expose: ["6333"]  # internal only, no 'ports:' mapping
-   ```
-4. **Tag the VM** with a specific network tag and scope firewall rules to that tag
-5. **Document the firewall rules** in the deployment script README -- this is the #1 support question for GCP deployments
-
-**Warning signs:**
-- "Connection timed out" when accessing the VM's public IP
-- Services work fine via SSH tunnel but not via direct access
-- Qdrant dashboard is accidentally accessible from the internet
-
-**Phase to address:** GCP deployment (Phase 1 of v1.1)
-
-**Confidence:** HIGH -- verified against [GCP firewall documentation](https://cloud.google.com/compute/docs/networking/firewalls) and common deployment patterns.
+**Phase to address:** Phase 2 (evaluation).
 
 ---
 
-### Pitfall 10: Error Triage State Not Persisted -- Lost on Page Refresh
-
-**Severity:** MAJOR
-**Affects:** Error triage workflow
+### Pitfall 9: One-Annotation-Per-Image Assumption vs Multi-Label
 
-**What goes wrong:**
-The error triage workflow involves the user reviewing error samples and tagging them (FP, TP, FN, confirmed mistake, etc.). If this triage state lives only in the frontend Zustand store (like the current `filter-store.ts` and `ui-store.ts`), all triage progress is lost when the user refreshes the page, navigates away, or the browser crashes.
+**What goes wrong:** Single-label classification means exactly one class per image, but that invariant only holds if the schema and ingestion pipeline enforce it. If someone imports a classification dataset where some images have two labels (multi-label), the system should either reject it or handle it explicitly. Silently accepting multi-label data into a single-label workflow produces wrong accuracy numbers (which label counts as "correct"?).
 
-A 100K-image dataset with 5000 error detections requires significant manual review time. Losing 30 minutes of triage work because of a page refresh makes the feature unusable.
-
-**Why it happens:**
-The v1.0 architecture stores transient UI state in Zustand and persistent data in DuckDB. Developers add triage state to Zustand for speed (no API call on each tag) and plan to "persist later," but the persistence never gets built or gets deferred because it requires a new API endpoint + DuckDB schema change.
+**Why it happens:** The annotations table has no constraint preventing multiple annotations per sample. For detection, that is correct -- one image has many boxes. For single-label classification, it is a data integrity violation. Without enforcement, a malformed CSV with duplicate rows silently corrupts the dataset.
 
 **Prevention:**
-1. **Persist triage decisions to DuckDB immediately.** Add a `triage_status` column to the `annotations` table (or a separate `triage_decisions` table) and PATCH on each tag action.
-2. **Debounce but persist.** Debounce the API call by 500ms to batch rapid changes, but always persist before the user moves to the next image.
-3. **Use optimistic updates:** Update the Zustand store immediately (for snappy UI), then persist to DuckDB in the background. If the persist fails, show a non-blocking error and retry.
-4. **Add the `tags` column on `samples` table (already exists in the schema as `VARCHAR[]`)** for sample-level triage tags. For annotation-level triage, add a new column or use the existing `metadata` JSON column on annotations.
-5. **Consider using the existing saved views system** to persist triage filter state (which errors are visible, what filters are applied).
-
-**Warning signs:**
-- User tags 50 error samples, refreshes, all tags are gone
-- Triage progress is not visible to the same user in a different browser tab
-- "Save triage" button exists but is easy to forget
-
-**Phase to address:** Error triage workflow (Phase 4 of v1.1)
+1. During ingestion, validate that each sample_id has exactly one annotation (per source)
+2. If duplicates are found, warn the user and either keep the first annotation or reject the import
+3. Add `task_type` validation: classification datasets must have max 1 annotation per sample per source
+4. Consider a `multi_label` flag on the dataset for future extensibility, but enforce `single_label` for this milestone
 
-**Confidence:** HIGH -- based on analysis of existing Zustand stores in the codebase, which are all transient. The `samples.tags` column exists for persistence.
+**Phase to address:** Phase 1 (ingestion validation).
 
 ---
 
-## Moderate Pitfalls
-
-Mistakes that cause delays, degraded UX, or technical debt.
+### Pitfall 10: API Response Models Leaking Detection Fields to Classification Clients
 
-### Pitfall 11: Docker Compose File Mounts Break Image Path Resolution
+**What goes wrong:** The `AnnotationResponse` model returns `bbox_x`, `bbox_y`, `bbox_w`, `bbox_h`, `area`, `is_crowd` for every annotation. For classification, these fields are meaningless (0.0 or sentinel values). The `EvaluationResponse` returns `iou_threshold`, `map50`, `map75`, `map50_95`, which have no meaning for classification. Sending these fields wastes bandwidth, and frontend code that consumes them creates confusion. The `BatchAnnotationsResponse` groups by `sample_id` and returns lists -- for classification, each list has exactly one item, which is a different UX pattern from detection's variable-length lists.
 
-**Severity:** MODERATE
-**Affects:** Docker containerization, image serving
-
-**What goes wrong:**
-The current `StorageBackend` in `app/repositories/storage.py` resolves local image paths with `Path(path).resolve()`. During ingestion, the user provides an image directory like `/Users/ortizeg/datasets/coco/images/`. This absolute host path is stored in the `datasets.image_dir` column and used to serve images.
-
-In Docker, this host path does not exist inside the container. The container filesystem has a different root. Even with a volume mount (`-v /Users/ortizeg/datasets:/data/datasets`), the stored path (`/Users/ortizeg/datasets/coco/images/`) does not match the container path (`/data/datasets/coco/images/`).
-
-**Why it happens:**
-The v1.0 system was designed for local execution where host paths and process paths are identical. Docker introduces a path namespace boundary. The DuckDB database remembers absolute host paths from ingestion, which become invalid inside the container.
+**Why it happens:** Pydantic models were designed for detection. Adding classification means the same endpoint returns structurally different data depending on dataset type.
 
 **Prevention:**
-1. **Store relative paths in DuckDB, not absolute paths.** During ingestion, strip the base dataset directory and store only the relative portion. The base directory is configured at runtime via environment variable.
-2. **Alternatively, use a canonical mount point.** Require datasets to be mounted at a fixed container path (e.g., `/data/datasets/`) and store paths relative to that root.
-3. **For existing datasets (v1.0 migration):** Provide a path remapping configuration:
-   ```yaml
-   # docker-compose.yml
-   environment:
-     DATAVISOR_PATH_REMAP: "/Users/ortizeg/datasets:/data/datasets"
-   ```
-4. **Update `StorageBackend.resolve_image_path()`** to apply the path remap before resolution.
-5. **Test image serving in Docker immediately** after the first successful build -- this will break early.
-
-**Warning signs:**
-- Thumbnails show broken image icons in Docker but work locally
-- `FileNotFoundError` in logs for paths that exist on the host but not in the container
-- Ingesting a dataset inside Docker works, but datasets ingested before dockerization cannot serve images
-
-**Phase to address:** Docker containerization (Phase 1 of v1.1)
-
-**Confidence:** HIGH -- verified by reading the existing `storage.py` code which uses `Path(path).resolve()` and the `datasets.image_dir` column which stores absolute paths.
+1. Create a `ClassificationAnnotationResponse` with just `id`, `dataset_id`, `sample_id`, `category_name`, `source`, `confidence`
+2. Create a `ClassificationEvaluationResponse` with `accuracy`, `precision`, `recall`, `f1`, `per_class_metrics`, `confusion_matrix` (no mAP, no IoU threshold)
+3. Or: use a discriminated union response that includes bbox/mAP fields only for detection
+4. The API should return the task-appropriate response model based on dataset type
+5. Frontend types should reflect this: `DetectionAnnotation | ClassificationAnnotation`
 
----
+**Phase to address:** Phase 1 (API design). Affects both backend and frontend types.
 
-### Pitfall 12: Keyboard Shortcuts Conflict with Browser and Input Field Defaults
+---
 
-**Severity:** MODERATE
-**Affects:** Keyboard shortcuts feature
+## Minor Pitfalls
 
-**What goes wrong:**
-Common keyboard shortcut choices conflict with browser defaults or text input:
-- `Delete` / `Backspace`: Navigates back in Firefox; deletes text in input fields
-- `Space`: Scrolls the page; toggles checkboxes
-- Arrow keys: Scroll the page; move cursor in text inputs
-- `Ctrl+A`: Select all (browser default)
-- `Ctrl+Z`: Browser undo in text fields
-- `Escape`: Closes the detail modal (already implemented); also closes browser dialogs
+### Pitfall 11: Class Imbalance Statistics Need Different Visualization for 43+ Classes
 
-If shortcuts are registered globally (`document.addEventListener('keydown', ...)`), they fire even when the user is typing in the search input (`search-input.tsx`), the saved view name input, or the annotation label field (if editing annotations). Pressing `Delete` to clear a search term instead deletes the selected annotation.
+**What goes wrong:** The class distribution chart shows annotation count per class. For detection, classes are relatively balanced (~10 classes). For classification with 43 jersey numbers, you get a bar chart with 43 bars where some numbers appear 1,000 times and others only 3 times. The chart becomes unreadable and the bars for rare classes are invisible.
 
-**Why it happens:**
-Developers test shortcuts with no focus on input elements. The shortcut handler does not check `document.activeElement` or `event.target.tagName`. Global event listeners are the easiest to implement but the hardest to get right.
+**Prevention:** For high-cardinality classification, add:
+- A sortable table view (already partially exists via `class-distribution.tsx`)
+- A "long tail" indicator showing how many classes have < N samples
+- A log-scale option for the bar chart
+- Top-K / Bottom-K filtering
 
-**Prevention:**
-1. **Check focus before handling.** In the keydown handler:
-   ```typescript
-   const tag = (e.target as HTMLElement).tagName;
-   if (tag === 'INPUT' || tag === 'TEXTAREA' || tag === 'SELECT') return;
-   if ((e.target as HTMLElement).isContentEditable) return;
-   ```
-2. **Use a shortcut library** like `react-hotkeys-hook` that handles focus scoping automatically.
-3. **Scope shortcuts to specific components.** Navigation shortcuts (arrow keys for next/prev image) should only work when the grid or modal is focused, not globally.
-4. **Avoid single-key shortcuts** that conflict with browser defaults. Use modifier keys for destructive actions: `Shift+Delete` to delete annotation, not just `Delete`.
-5. **Show a shortcut overlay** (triggered by `?` key) that lists available shortcuts -- this also serves as documentation.
-
-**Warning signs:**
-- User cannot type the letter "d" in the search box because it triggers "delete annotation"
-- Arrow keys scroll the page instead of navigating images
-- `Escape` closes both the annotation editor and the modal simultaneously
-
-**Phase to address:** Keyboard shortcuts (Phase 5 of v1.1)
-
-**Confidence:** HIGH -- standard web development pattern, verified against current codebase which has the `search-input.tsx` component and `<dialog>` elements that consume keyboard events.
+**Phase to address:** Phase 3 (UI polish).
 
 ---
 
-### Pitfall 13: CORS Configuration Must Change from Wildcard to Specific Origin in Production
-
-**Severity:** MODERATE
-**Affects:** Authentication, deployment
-
-**What goes wrong:**
-The current `app/main.py` has:
-```python
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins for dev -- will restrict later
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-```
-
-Per the CORS specification, `allow_origins=["*"]` and `allow_credentials=True` is **invalid**. Browsers reject this combination -- you cannot use wildcards with credentials. When basic auth is added (which sends credentials), the browser will block all cross-origin requests.
-
-Even if you switch to `allow_origins=["http://35.202.x.x:3000"]`, you must update this value for every deployment. This is fragile.
-
-**Why it happens:**
-The wildcard was intentional for development (the comment says "will restrict later"). But the interaction between `allow_credentials=True` and `allow_origins=["*"]` is a spec violation that browsers enforce silently -- the request fails with a cryptic CORS error in the console.
-
-**Prevention:**
-1. **Use a reverse proxy (repeated from Pitfalls 3 and 4).** If frontend and API share the same origin, CORS is not needed at all. Remove the CORS middleware entirely.
-2. **If CORS is needed:** Set `allow_origins` from an environment variable:
-   ```python
-   origins = settings.allowed_origins.split(",") if settings.allowed_origins else ["http://localhost:3000"]
-   ```
-3. **Never combine `allow_origins=["*"]` with `allow_credentials=True`.** The spec forbids it.
-4. **Test CORS from a real browser** (not curl/httpx which do not enforce CORS). Open the browser console and check for CORS errors.
+### Pitfall 12: Embedding/Similarity Features Work Unchanged -- Don't Over-Adapt
 
-**Warning signs:**
-- API calls work from curl but fail from the browser with "CORS policy" errors
-- Adding basic auth breaks all frontend requests
-- The error says `Access-Control-Allow-Origin` must not be `*` when credentials are included
+**What goes wrong:** The embeddings pipeline (`embeddings.py`, `embedding-scatter.tsx`) is image-level, not annotation-level. Developers may waste time trying to "adapt" the embedding scatter for classification when it already works correctly. The scatter plot colored by class label is actually more meaningful for classification than detection.
 
-**Phase to address:** Docker containerization / auth (Phase 1 of v1.1)
+**Prevention:** Leave the embedding pipeline alone. It already operates at the image level. The only change needed is coloring scatter points by the classification label instead of (or in addition to) detection class names. This is a minor frontend change, not a pipeline change.
 
-**Confidence:** HIGH -- verified against [FastAPI CORS documentation](https://fastapi.tiangolo.com/tutorial/cors/) and the CORS specification.
+**Phase to address:** Phase 3 (UI). Minimal effort.
 
 ---
 
-### Pitfall 14: GCP Persistent Disk Not Mounted on VM Restart
+### Pitfall 13: The "Second System Effect" -- Over-Generalizing the Architecture
 
-**Severity:** MODERATE
-**Affects:** GCP VM deployment
+**What goes wrong:** After seeing the pattern of "detection vs classification," developers try to build a fully generic task-agnostic framework that handles detection, classification, segmentation, keypoint detection, and object tracking. This leads to:
+- Abstract base classes with 20 methods each
+- A plugin system for task types
+- Configuration-driven UI rendering
+- Generic evaluation frameworks
 
-**What goes wrong:**
-The deployment script creates a GCP VM and attaches a persistent disk for data storage. Docker volumes point to a directory on this disk. But if the VM restarts (maintenance event, preemptible VM, manual restart), the persistent disk may not auto-mount. The VM comes back up, Docker starts, but the volume mount points to an empty directory. DuckDB creates a new empty database, Qdrant creates empty collections, and the user thinks their data is gone.
+All of this for what should be: add classification support as a second, simpler task type.
 
-**Why it happens:**
-Attaching a disk to a GCP VM does not auto-mount it. You must: (a) format the disk (first time), (b) create a mount point, (c) mount it, and (d) add an entry to `/etc/fstab` for persistence across reboots. Developers do steps (a)-(c) manually during setup and forget step (d). The VM restarts fine -- but the disk is attached, not mounted.
+**Why it happens:** The existing codebase already has a `plugins/` directory with `base_plugin.py`, `hooks.py`, and `registry.py`. The temptation to make task types a plugin is strong. But classification is fundamentally simpler than detection, not a peer abstraction.
 
 **Prevention:**
-1. **Deployment script must add an fstab entry:**
-   ```bash
-   echo "UUID=$(blkid -s UUID -o value /dev/sdb) /mnt/data ext4 defaults,nofail 0 2" >> /etc/fstab
-   ```
-2. **Use `nofail` option** so the VM boots even if the disk mount fails (prevents boot loops)
-3. **Add a health check** in the startup script that verifies the data directory is mounted before starting Docker:
-   ```bash
-   if ! mountpoint -q /mnt/data; then
-     mount /dev/sdb /mnt/data || echo "FATAL: Data disk not mounted"
-   fi
-   ```
-4. **Use GCP startup scripts** that run on every boot, not just first boot
-5. **Prefer standard persistent disks** over local SSDs (which do NOT survive VM stop/start)
-
-**Warning signs:**
-- Data disappears after VM restart but returns after manual `mount` command
-- `df -h` shows the data directory is on the root filesystem (small) instead of the persistent disk (large)
-- Docker logs show DuckDB initializing a fresh schema on startup (because it created a new empty database file)
-
-**Phase to address:** GCP deployment (Phase 1 of v1.1)
-
-**Confidence:** HIGH -- standard GCP operational knowledge, verified against [GCP persistent disk documentation](https://cloud.google.com/compute/docs/disks/add-persistent-disk).
+1. Support exactly two task types: detection and classification
+2. Use simple if/else at routing boundaries, not inheritance hierarchies
+3. Classification-specific code should be straightforward functions, not subclasses
+4. Resist adding segmentation support "while we're at it"
+5. The plugin system should stay for its current purpose, not be repurposed for task types
 
----
-
-## Minor Pitfalls
+**Detection:** If you find yourself writing `class TaskType(ABC)` with 10+ abstract methods, stop.
 
-Mistakes that cause annoyance or minor rework.
+**Phase to address:** Phase 0 (design). Set explicit scope boundaries before coding.
 
-### Pitfall 15: Annotation Delete Without Undo Causes Data Loss Anxiety
+---
 
-**Severity:** MINOR
-**Affects:** Annotation editing
+### Pitfall 14: Prediction Import Format Mismatch
 
-**What goes wrong:**
-The user accidentally deletes an annotation. There is no undo. The annotation is gone from DuckDB. The user must re-create it from scratch or re-ingest the dataset (losing all other edits). This makes users hesitant to use the editing feature.
+**What goes wrong:** The `DetectionAnnotationParser` expects per-image JSON files with bbox annotations. Classification predictions are typically a single file: CSV, JSONL, or JSON mapping `image -> predicted_class`. Using the detection prediction parser for classification predictions will fail silently (no bboxes found, all predictions skipped).
 
-**Prevention:**
-1. **Soft delete first.** Add a `deleted_at` timestamp column. Mark annotations as deleted rather than removing them. Purge after 30 days or on explicit "purge deleted" action.
-2. **Undo buffer.** Keep the last N deleted annotations in Zustand state. Show an "Undo" toast for 10 seconds after deletion. On undo, re-insert the annotation.
-3. **At minimum:** Show a confirmation dialog for delete actions. "Delete this 'person' annotation? This cannot be undone."
+**Concrete code:** `app/ingestion/detection_annotation_parser.py` looks for `ann.get("bbox", {})` in each annotation -- classification predictions have no bboxes.
 
-**Phase to address:** Annotation editing (Phase 3 of v1.1)
+**Prevention:** Create a `ClassificationPredictionParser` that accepts:
+- CSV: `image_filename,predicted_class,confidence`
+- JSONL: `{"filename": "img.jpg", "class": "cat", "confidence": 0.95}`
+- JSON mapping: `{"img.jpg": {"class": "cat", "confidence": 0.95}}`
 
-**Confidence:** HIGH -- standard UX pattern.
+**Phase to address:** Phase 1 (ingestion).
 
 ---
 
-### Pitfall 16: `docker-compose up` OOMs on Small GCP VMs with Embedding Model
+### Pitfall 15: Statistics Summary Counts Diverge Between Detection and Classification
 
-**Severity:** MINOR
-**Affects:** GCP deployment
-
-**What goes wrong:**
-The `EmbeddingService.load_model()` in `app/main.py`'s lifespan loads the DINOv2-base model at startup. On a GCP `e2-standard-2` (2 vCPU, 8GB RAM) running Docker (overhead: ~200MB), Qdrant (overhead: ~500MB), Next.js (overhead: ~200MB), and DINOv2 (overhead: ~1.5GB in CPU mode), total memory pressure approaches 4-5GB before any data is loaded. Loading a 100K-sample dataset into DuckDB can add another 1-2GB.
+**What goes wrong:** The `get_dataset_statistics()` endpoint in `statistics.py` computes `gt_annotations` and `pred_annotations` by counting rows in the `annotations` table. For detection, annotation count != image count (many boxes per image). For classification, annotation count == image count (one label per image). If both task types share the same endpoint without context, the "500 annotations" label is confusing for classification -- users expect "500 images classified" or "500 labels."
 
 **Prevention:**
-1. **Lazy model loading is partially implemented** (VLM is lazy, but EmbeddingService loads at startup). Make embedding model loading lazy too -- only load when the user triggers embedding generation.
-2. **Document minimum VM specs:** Recommend `e2-standard-4` (4 vCPU, 16GB RAM) for comfortable operation, `e2-standard-2` (8GB) as absolute minimum.
-3. **Add memory limits to docker-compose services** to prevent one service from OOM-killing others:
-   ```yaml
-   services:
-     api:
-       deploy:
-         resources:
-           limits:
-             memory: 6G
-     qdrant:
-       deploy:
-         resources:
-           limits:
-             memory: 2G
-   ```
-
-**Phase to address:** GCP deployment (Phase 1 of v1.1)
-
-**Confidence:** MEDIUM -- memory estimates based on typical model sizes; actual numbers depend on the specific DINOv2 variant and batch size.
-
----
+1. For classification, relabel "annotations" as "labels" in the summary
+2. Add `images_with_labels` and `images_without_labels` to classification statistics
+3. Add class balance metrics: min/max/median samples per class, number of classes with < 5 samples
+4. These changes are frontend-only as long as the backend returns the raw counts they are derived from (per-image label presence and per-class sample counts)
 
-## Integration Pitfalls Matrix
-
-How new v1.1 features interact with the existing v1.0 system.
-
-| New Feature | Existing Component | Integration Risk | Specific Pitfall |
-|---|---|---|---|
-| Docker | DuckDB file | WAL file loss on unclean shutdown | P1: Must mount entire `data/` directory |
-| Docker | Qdrant local mode | Cannot run embedded in multi-container setup | P2: Must migrate to server mode |
-| Docker | Image path storage | Host absolute paths invalid in container | P11: Must use relative paths or path remapping |
-| Docker | Next.js env vars | `NEXT_PUBLIC_API_URL` baked at build time | P3: Must use reverse proxy or runtime injection |
-| Docker | PyTorch/Transformers | 8-12GB image size | P6: Use CPU-only torch, multi-stage build |
-| Basic Auth | SSE streams | EventSource cannot set auth headers | P4: Must use cookie-based auth or fetch polyfill |
-| Basic Auth | CORS middleware | Wildcard + credentials is spec-invalid | P13: Remove CORS via reverse proxy |
-| Annotation Edit | SVG overlay | SVG coord system differs from Canvas | P5: Separate read-only (SVG) from edit (Konva) |
-| Annotation Edit | DuckDB writes | No existing UPDATE pattern, no transactions | P7: Add explicit transactions for mutations |
-| Annotation Edit | `annotations` table | No unique constraints | P7: Verify ID uniqueness in app code |
-| Smart Ingestion | COCOParser | Expects single JSON + single image dir | P8: Must handle multi-split structures |
-| Error Triage | Zustand stores | UI state lost on refresh | P10: Persist triage decisions to DuckDB |
-| Keyboard Shortcuts | Search input | Global handlers capture input keystrokes | P12: Check activeElement before handling |
-| GCP Deploy | Firewall | Default deny blocks all ports | P9: Script must create rules |
-| GCP Deploy | Persistent disk | Disk not auto-mounted on restart | P14: Add fstab entry |
-| GCP Deploy | Memory | Model loading exhausts small VM RAM | P16: Lazy loading, document minimum specs |
+**Phase to address:** Phase 3 (UI).
 
 ---
 
 ## Phase-Specific Warnings
 
-Which pitfalls to address in which phase, and what to watch for.
+| Phase Topic | Likely Pitfall | Mitigation |
+|-------------|---------------|------------|
+| Schema design | P1 (schema pollution), P4 (breaking existing) | Separate `classifications` table, never alter existing columns |
+| Ingestion | P6 (format detection), P9 (multi-label), P14 (prediction format) | New format scanners, validation, separate parsers |
+| Evaluation | P2 (metric confusion), P7 (triage), P8 (error categories) | Separate evaluation function, classification-specific triage |
+| UI | P3 (conditional spaghetti), P5 (confusion matrix scaling), P11 (class imbalance viz) | Task adapter pattern, compact matrix view, log-scale charts |
+| Architecture | P13 (over-generalization) | Two task types only, simple branching, no abstract hierarchies |
+| API | P10 (response model leakage) | Task-specific response models or discriminated unions |
 
-| Phase | Pitfalls to Address | Critical Action | Verification |
-|---|---|---|---|
-| Docker + Deploy | P1, P2, P3, P4, P6, P9, P11, P13, P14, P16 | Use reverse proxy (Caddy), mount `data/` dir, migrate Qdrant to server mode | Access app from browser (not curl) via public IP; restart VM; verify data persists |
-| Smart Ingestion | P8 | Detection is suggestion, not action; confirm before ingest | Test with 5+ dataset layouts including edge cases |
-| Annotation Editing | P5, P7, P15 | Konva coordinate normalization; DuckDB transactions; soft delete | Edit annotation, save, refresh, verify position unchanged |
-| Error Triage | P10 | Persist triage state to DuckDB, not just Zustand | Tag 10 errors, refresh page, verify tags persist |
-| Keyboard Shortcuts | P12 | Check focus before handling; modifier keys for destructive actions | Type in search box with shortcuts enabled |
+## Summary: Top 3 Rules of Thumb
 
----
+1. **Separate table, not shared table.** Classification and detection annotations are structurally different. A `classifications` table is less code than 30+ null-checks in a shared table.
 
-## "What Might I Have Missed?" Review
+2. **Branch at the boundary, not in the leaf.** Route to detection-specific or classification-specific code at the API endpoint or page level, not inside individual components/queries.
 
-Areas of uncertainty that could not be fully verified:
-
-1. **Python 3.14 + torch in Docker:** The `requires-python = ">=3.14"` constraint means the Docker image must use Python 3.14. While official Docker images exist (`python:3.14-slim`), not all ML packages have pre-built wheels for 3.14. Source compilation in Docker adds build time and image size. **Confidence: MEDIUM** -- wheels availability not verified for torch 2.10 on Python 3.14.
-
-2. **DuckDB file locking across Docker volume backends:** Docker volume mounts use different storage drivers (overlay2, btrfs, devicemapper). DuckDB relies on POSIX file locking for WAL safety. NFS-backed volumes (common in Kubernetes, not typical for Compose) may not support file locking correctly. For standard Docker Compose with bind mounts on ext4/APFS, this should be fine. **Confidence: MEDIUM** -- not verified for all storage drivers.
-
-3. **Konva performance with many annotation boxes:** The existing SVG overlay handles arbitrary annotation counts. If an image has 500+ annotations and all are rendered as Konva shapes with Transformers, canvas performance may degrade. **Confidence: LOW** -- not measured; Konva documentation claims good performance but does not specify limits for interactive Transformer usage.
-
-4. **Qdrant data migration from local to server mode:** The exact steps to migrate existing data from the embedded SQLite-based Qdrant local mode to the Qdrant server's RocksDB-based storage are not documented. The existing `_sync_from_duckdb` method re-creates the collection from DuckDB embeddings, which is a workaround but means the first Docker startup must re-sync all embeddings. For 100K samples, this may take several minutes. **Confidence: MEDIUM** -- the code path exists but has not been tested at scale for this migration.
-
----
+3. **Classification is simpler -- keep it that way.** No IoU, no spatial matching, no bbox rendering. The evaluation function should be ~50 lines, not adapted from 560 lines of detection code.
 
 ## Sources
 
-### Official Documentation (HIGH confidence)
-- [DuckDB Files Created](https://duckdb.org/docs/stable/operations_manual/footprint_of_duckdb/files_created_by_duckdb) -- WAL, lock files, tmp directory behavior
-- [DuckDB Concurrency](https://duckdb.org/docs/stable/connect/concurrency) -- MVCC, optimistic concurrency, single-writer model
-- [Qdrant Installation](https://qdrant.tech/documentation/guides/installation/) -- Docker deployment, server mode
-- [qdrant-client README](https://github.com/qdrant/qdrant-client) -- local mode vs server mode migration
-- [Konva Transformer Docs](https://konvajs.org/docs/react/Transformer.html) -- scale vs dimension behavior, normalization pattern
-- [Next.js Environment Variables](https://nextjs.org/docs/pages/guides/environment-variables) -- NEXT_PUBLIC build-time inlining
-- [FastAPI CORS](https://fastapi.tiangolo.com/tutorial/cors/) -- wildcard + credentials restriction
-- [GCP Persistent Disks](https://cloud.google.com/compute/docs/disks/add-persistent-disk) -- mount, fstab, restart behavior
-- [GCP Firewall Rules](https://cloud.google.com/compute/docs/networking/firewalls) -- default deny policy
-- [MDN EventSource](https://developer.mozilla.org/en-US/docs/Web/API/EventSource/withCredentials) -- header limitations
-
-### GitHub Issues (MEDIUM confidence)
-- [DuckDB WAL Lock File Issue #10002](https://github.com/duckdb/duckdb/issues/10002) -- lock file not cleaned on forced close
-- [DuckDB WAL Issue #10952](https://github.com/duckdb/duckdb/issues/10952) -- .wal stays open after parquet import
-- [Konva Coordinate Issue #830](https://github.com/konvajs/konva/issues/830) -- dragging and zooming alter coordinates
-- [Konva Transformer BBox Issue #1296](https://github.com/konvajs/konva/issues/1296) -- incorrect bounding box with stroke and scale
-- [WHATWG EventSource Headers Issue #2177](https://github.com/whatwg/html/issues/2177) -- cannot set headers on EventSource
-- [Next.js Docker Env Vars Discussion #17641](https://github.com/vercel/next.js/discussions/17641) -- NEXT_PUBLIC in Docker
-
-### Community Sources (LOW-MEDIUM confidence)
-- [Konva Transformer Explained](https://longviewcoder.com/2022/04/28/what-the-hell-did-the-transformer-actually-do-to-my-shape/) -- detailed walkthrough of Transformer behavior
-- [Building Canvas Editors in React](https://www.alikaraki.me/blog/canvas-editors-konva) -- Konva patterns and gotchas
-- [Next.js Runtime Env Vars](https://nemanjamitic.com/blog/2025-12-13-nextjs-runtime-environment-variables/) -- solutions for Docker runtime configuration
-- [PyTorch Docker Optimization](https://mveg.es/posts/optimizing-pytorch-docker-images-cut-size-by-60percent/) -- 60% image size reduction strategies
-- [Running Qdrant with Docker Compose](https://www.spasov.me/blog/running-qdrant-with-docker-compose-api-access-networking-and-api-keys) -- networking and API key configuration
-- [Secure EventSource Authentication](https://openillumi.com/en/en-eventsource-auth-header-solution/) -- workarounds for SSE auth
-- [FastAPI Security Best Practices](https://blog.greeden.me/en/2025/07/29/fastapi-security-best-practices-from-authentication-authorization-to-cors/) -- auth and CORS configuration
-
----
-*Pitfalls research for: DataVisor v1.1 -- Docker deployment, auth, annotation editing, smart ingestion, error triage*
-*Researched: 2026-02-12*
+- Direct codebase analysis of all referenced files in the DataVisor repository
+- DuckDB ALTER TABLE documentation: limited support for changing NOT NULL constraints (MEDIUM confidence, from training data)
+- General software engineering: Strategy pattern for task-type polymorphism (HIGH confidence, well-established pattern)
diff --git a/.planning/research/STACK.md b/.planning/research/STACK.md
index 0c2db3d..e68b119 100644
--- a/.planning/research/STACK.md
+++ b/.planning/research/STACK.md
@@ -1,792 +1,371 @@
-# Stack Research: v1.1 Additions
+# Technology Stack: Classification Dataset Support
 
-**Project:** DataVisor v1.1 -- Deployment, Workflow & Competitive Parity
-**Researched:** 2026-02-12
-**Scope:** New stack additions ONLY. Existing stack (FastAPI, DuckDB, Qdrant, Next.js, Tailwind, deck.gl, Recharts, Pydantic AI, Moondream2) is validated and NOT re-researched.
+**Project:** DataVisor - Classification Dataset Extension (v1.2)
+**Researched:** 2026-02-18
 **Overall confidence:** HIGH
 
 ---
 
-## 1. Docker Compose Architecture
-
-### Service Topology
-
-Four services in a single `docker-compose.yml`:
-
-| Service | Image | Purpose | Port | Volume Mounts |
-|---------|-------|---------|------|---------------|
-| `backend` | Custom (Dockerfile.backend) | FastAPI + DuckDB + Qdrant (local mode) + ML models | 8000 | `./data:/app/data`, `./plugins:/app/plugins` |
-| `frontend` | Custom (Dockerfile.frontend) | Next.js standalone server | 3000 | none (static build) |
-| `caddy` | `caddy:2-alpine` | Reverse proxy, HTTPS, basic auth | 80, 443 | `./Caddyfile:/etc/caddy/Caddyfile`, `caddy_data:/data` |
-
-**Three services, not four.** Qdrant stays in local/embedded mode (current approach via `QdrantClient(path=...)`) rather than running as a separate Docker container. Rationale:
-
-- The current codebase uses `qdrant-client` in local mode with on-disk persistence at `data/qdrant/`. This works without a Qdrant server process.
-- For a single-user tool with <1M vectors, local mode is equivalent in performance to server mode.
-- Eliminates a container, reduces memory footprint, and simplifies the compose file.
-- If Qdrant server mode is ever needed (e.g., multiple workers), the code change is one line: `QdrantClient(path=...)` to `QdrantClient(url="http://qdrant:6333")`.
-
-**Why not keep it to two services (combine backend + frontend)?** Separate containers for backend and frontend allow independent rebuilds. The FastAPI container has heavy Python/ML dependencies (~4GB with torch); the Next.js container is ~150MB standalone. Rebuilding frontend CSS does not trigger a 10-minute Python image rebuild.
-
-### Docker Compose File
-
-```yaml
-services:
-  backend:
-    build:
-      context: .
-      dockerfile: Dockerfile.backend
-    ports:
-      - "8000:8000"
-    volumes:
-      - ./data:/app/data          # DuckDB + Qdrant + thumbnails persist here
-      - ./plugins:/app/plugins    # Plugin directory
-    environment:
-      - DATAVISOR_DB_PATH=/app/data/datavisor.duckdb
-      - DATAVISOR_QDRANT_PATH=/app/data/qdrant
-      - DATAVISOR_THUMBNAIL_CACHE_DIR=/app/data/thumbnails
-      - DATAVISOR_AUTH_USERNAME=${AUTH_USERNAME:-admin}
-      - DATAVISOR_AUTH_PASSWORD=${AUTH_PASSWORD}
-      - DATAVISOR_VLM_DEVICE=cpu  # Override for GPU: cuda
-    restart: unless-stopped
-
-  frontend:
-    build:
-      context: ./frontend
-      dockerfile: Dockerfile.frontend
-      args:
-        - NEXT_PUBLIC_API_URL=http://backend:8000
-    ports:
-      - "3000:3000"
-    environment:
-      - HOSTNAME=0.0.0.0
-    depends_on:
-      - backend
-    restart: unless-stopped
-
-  caddy:
-    image: caddy:2-alpine
-    ports:
-      - "80:80"
-      - "443:443"
-    volumes:
-      - ./Caddyfile:/etc/caddy/Caddyfile:ro
-      - caddy_data:/data
-      - caddy_config:/config
-    depends_on:
-      - frontend
-      - backend
-    restart: unless-stopped
-
-volumes:
-  caddy_data:
-  caddy_config:
-```
-
-### Confidence: HIGH
-Sources: [FastAPI Docker docs](https://fastapi.tiangolo.com/deployment/docker/), [Next.js output standalone docs](https://nextjs.org/docs/app/api-reference/config/next-config-js/output), [Caddy Docker docs](https://hub.docker.com/_/caddy)
-
----
-
-## 2. Backend Dockerfile
-
-### Recommended: Multi-stage build with uv
+## Key Finding: No New Dependencies Required
 
-```dockerfile
-# Stage 1: Build dependencies
-FROM python:3.14-slim AS builder
+Classification support requires **zero new libraries**. The existing stack already contains everything needed. The work is entirely architectural -- extending parsers, adapting the DB schema, branching evaluation logic, and conditionally rendering overlays.
 
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
-
-WORKDIR /app
-COPY pyproject.toml uv.lock ./
-RUN uv sync --frozen --no-dev --no-editable
-
-# Stage 2: Runtime
-FROM python:3.14-slim AS runner
-
-WORKDIR /app
-COPY --from=builder /app/.venv /app/.venv
-COPY app/ ./app/
-COPY plugins/ ./plugins/
-
-ENV PATH="/app/.venv/bin:$PATH"
-ENV DATAVISOR_HOST=0.0.0.0
-ENV DATAVISOR_PORT=8000
-
-EXPOSE 8000
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
-```
-
-**Key decisions:**
-- `python:3.14-slim` not `alpine` -- avoids musl compilation issues with numpy/torch/scipy wheels.
-- `uv sync --frozen` for deterministic installs from lockfile.
-- No `gunicorn` needed -- single-user tool, one uvicorn worker is sufficient. Gunicorn adds complexity with no benefit for single-user.
-- Volume-mount `data/` at runtime, not baked into image. Data persists across container rebuilds.
-
-### New Python dependency: None
-
-No new Python packages needed for Docker. The existing `pyproject.toml` already has everything. Auth additions (see section 3) use FastAPI's built-in `fastapi.security` module -- zero new dependencies.
-
-### Confidence: HIGH
+This is the most important finding: the complexity is in the plumbing, not the tooling.
 
 ---
 
-## 3. Frontend Dockerfile
-
-### Recommended: Next.js standalone multi-stage build
+## 1. JSONL Parsing -- Python stdlib `json` module
 
-Requires adding `output: "standalone"` to `next.config.ts`:
+No library needed. The classification format is line-delimited JSON:
 
-```typescript
-const nextConfig: NextConfig = {
-  output: "standalone",
-  images: {
-    unoptimized: true,
-  },
-};
+```json
+{"image":"filename.jpg","prefix":"Read the number.","suffix":"3"}
 ```
 
-```dockerfile
-# Stage 1: Dependencies
-FROM node:22-alpine AS deps
-WORKDIR /app
-COPY package.json package-lock.json ./
-RUN npm ci
+Python's built-in `json.loads()` per line is the correct approach. `ijson` (already installed for COCO streaming) is overkill -- JSONL is inherently streamable by reading line-by-line. Each line is a complete JSON object.
 
-# Stage 2: Build
-FROM node:22-alpine AS builder
-WORKDIR /app
-COPY --from=deps /app/node_modules ./node_modules
-COPY . .
+**Implementation:** A new `ClassificationParser(BaseParser)` in `app/ingestion/classification_parser.py`.
 
-ARG NEXT_PUBLIC_API_URL=http://localhost:8000
-ENV NEXT_PUBLIC_API_URL=$NEXT_PUBLIC_API_URL
+| Concern | Approach | Why |
+|---------|----------|-----|
+| Streaming | `for line in f` inside `with open(path) as f:` | JSONL is naturally line-streamable; the context manager closes the file handle |
+| Batching | Accumulate dicts, yield DataFrames every `batch_size` lines | Matches existing `COCOParser` pattern |
+| Category extraction | First pass: collect unique `suffix` values | No explicit category list in JSONL format |
+| Memory | O(batch_size) not O(dataset) | Same streaming guarantee as COCO path |
 
-RUN npm run build
+**BaseParser compatibility:** The existing `BaseParser` ABC defines `parse_categories`, `build_image_batches`, and `build_annotation_batches`. The classification parser implements all three. The key difference: `build_annotation_batches` yields rows with `bbox_x=0, bbox_y=0, bbox_w=0, bbox_h=0` (sentinel values -- see section 5 for schema rationale).
 
-# Stage 3: Runner (~150MB total)
-FROM node:22-alpine AS runner
-WORKDIR /app
-
-ENV NODE_ENV=production
-
-RUN addgroup --system --gid 1001 nodejs && \
-    adduser --system --uid 1001 nextjs
-
-COPY --from=builder /app/public ./public
-COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./
-COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static
-
-USER nextjs
-EXPOSE 3000
-ENV PORT=3000
-ENV HOSTNAME=0.0.0.0
-
-CMD ["node", "server.js"]
-```
-
-**Key decision:** `NEXT_PUBLIC_API_URL` is baked at build time (Next.js inlines env vars prefixed with `NEXT_PUBLIC_` during `next build`). For Docker Compose, the build arg sets this to the internal service name. For external access, Caddy proxies both frontend and backend under the same domain, so the browser hits `/api/` which Caddy routes to the backend.
+**Image dimensions:** Classification JSONL does not include width/height (unlike COCO JSON). The parser uses `PIL.Image.open(path).size` (Pillow already installed as `pillow>=12.1.1`) for header-only dimension reads. This requires access to the image directory during parsing, which the existing `build_image_batches(file_path, dataset_id, split, image_dir)` signature already supports.
 
 ### Confidence: HIGH
-Source: [Next.js standalone Docker docs](https://nextjs.org/docs/app/api-reference/config/next-config-js/output)
+Source: Direct codebase inspection of `BaseParser` and `COCOParser` interfaces.
 
 ---
 
-## 4. Single-User Authentication
-
-### Recommended: FastAPI HTTPBasic + Caddy basic_auth (defense in depth)
-
-**Layer 1: Caddy reverse proxy (primary auth gate)**
-
-Caddy handles basic auth at the edge. The browser prompts for credentials before any request reaches FastAPI or Next.js. This protects the entire application (frontend + backend + API docs) with zero code changes.
-
-```
-your-domain.com {
-    basic_auth {
-        admin $2a$14$HASHED_PASSWORD
-    }
-
-    handle /api/* {
-        reverse_proxy backend:8000
-    }
-
-    handle {
-        reverse_proxy frontend:3000
-    }
-}
-```
-
-Generate the password hash with: `caddy hash-password --plaintext 'your-password'`
+## 2. Classification Evaluation Metrics -- `scikit-learn` (already installed, >=1.8.0)
 
-**Layer 2: FastAPI HTTPBasic (API-level protection)**
+The project already depends on `scikit-learn>=1.8.0`. It provides everything needed:
 
-For defense in depth and for direct API access (bypassing Caddy during development), add FastAPI's built-in HTTPBasic:
+| Metric | scikit-learn Function | Purpose |
+|--------|----------------------|---------|
+| Accuracy | `accuracy_score(y_true, y_pred)` | Top-level summary stat |
+| Per-class P/R/F1 | `precision_recall_fscore_support(y_true, y_pred, average=None)` | Replaces detection's per-class AP table |
+| Macro F1 | `precision_recall_fscore_support(..., average='macro')` | Overall model quality (class-balanced) |
+| Weighted F1 | `precision_recall_fscore_support(..., average='weighted')` | Overall model quality (sample-weighted) |
+| Confusion Matrix | `confusion_matrix(y_true, y_pred)` | Direct NxN array, no IoU matching |
 
-```python
-import secrets
-from fastapi import Depends, HTTPException, status
-from fastapi.security import HTTPBasic, HTTPBasicCredentials
-from app.config import get_settings
-
-security = HTTPBasic()
-
-def verify_credentials(
-    credentials: HTTPBasicCredentials = Depends(security),
-) -> str:
-    settings = get_settings()
-    is_user = secrets.compare_digest(
-        credentials.username.encode("utf8"),
-        settings.auth_username.encode("utf8"),
-    )
-    is_pass = secrets.compare_digest(
-        credentials.password.encode("utf8"),
-        settings.auth_password.encode("utf8"),
-    )
-    if not (is_user and is_pass):
-        raise HTTPException(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Incorrect credentials",
-            headers={"WWW-Authenticate": "Basic"},
-        )
-    return credentials.username
-```
+**Critical difference from detection evaluation:** Classification is dramatically simpler. No IoU matching, no confidence sweep, no PR curves (a confidence threshold survives only as an optional filter when predictions include scores). Each image has exactly one GT label and one predicted label. The entire evaluation is `confusion_matrix(y_true, y_pred)` plus `classification_report(y_true, y_pred)`.
 
-Add to `Settings`:
-```python
-auth_username: str = "admin"
-auth_password: str = ""  # Empty = auth disabled (local dev)
-auth_enabled: bool = False  # Toggle via DATAVISOR_AUTH_ENABLED=true
-```
+**New service:** `app/services/classification_evaluation.py` -- separate from `app/services/evaluation.py` because the logic is fundamentally different (no IoU, no bounding boxes, no `supervision` dependency).
 
-**Why NOT JWT, OAuth2, or session-based auth:**
-- This is a single-user personal tool deployed on a cloud VM.
-- HTTP Basic Auth is built into FastAPI (zero dependencies), supported by every browser natively (credentials dialog), and sufficient for single-user protection.
-- JWT adds token management complexity with no benefit for one user.
-- OAuth2 requires an identity provider -- massive overkill.
-- Session cookies require server-side session storage -- unnecessary complexity.
+**What detection metrics map to in classification:**
 
-**Why Caddy over Nginx:**
-- Caddy has automatic HTTPS (Let's Encrypt) with zero configuration. Nginx requires certbot setup, cron renewal, and manual config.
-- Caddy is a single static binary. Config is ~10 lines vs Nginx's verbose syntax.
-- `basic_auth` is a built-in Caddy directive. Nginx requires `htpasswd` file generation.
-- For a single-user tool, Caddy's simplicity wins decisively.
+| Detection Concept | Classification Equivalent | Notes |
+|-------------------|--------------------------|-------|
+| mAP@50/75/50:95 | Accuracy + Macro F1 | No IoU thresholds in classification |
+| PR Curves per class | Per-class P/R/F1 table | No confidence sweep for single-label |
+| IoU threshold slider | Removed (not applicable) | No spatial overlap concept |
+| Confidence threshold | Kept only if predictions include confidence scores | Filters which predictions are considered |
+| Confusion Matrix | Confusion Matrix (same) | Simpler: no "background" row/col |
+| Error Analysis (TP/FP/FN/Label Error) | Correct/Incorrect per class | No spatial matching needed |
 
-### New dependencies: None (backend), caddy:2-alpine Docker image (infrastructure)
 ### Confidence: HIGH
-Source: [FastAPI HTTP Basic Auth docs](https://fastapi.tiangolo.com/advanced/security/http-basic-auth/), [Caddy basic_auth directive](https://caddyserver.com/docs/caddyfile/directives/basic_auth)
+Source: `scikit-learn>=1.8.0` already in `pyproject.toml` line 20. Functions verified in scikit-learn stable docs.
 
 ---
 
-## 5. GCP Compute Engine Deployment
-
-### Recommended: Shell script with `gcloud` CLI (not Terraform)
-
-**Why not Terraform:** This is a single VM for a personal tool. Terraform adds a dependency (Terraform binary + state management + HCL learning curve) for managing one resource. A shell script using `gcloud` CLI is simpler, auditable, and reproducible.
-
-### Deployment script: `scripts/deploy-gcp.sh`
-
-```bash
-#!/usr/bin/env bash
-# Creates a GCP Compute Engine VM with Docker + Docker Compose,
-# then deploys DataVisor via docker compose.
-
-PROJECT_ID="${GCP_PROJECT_ID:?Set GCP_PROJECT_ID}"
-ZONE="${GCP_ZONE:-us-central1-a}"
-INSTANCE_NAME="${GCP_INSTANCE:-datavisor}"
-MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-standard-4}"  # 4 vCPU, 16GB RAM
-
-# Create VM with Container-Optimized OS
-gcloud compute instances create "$INSTANCE_NAME" \
-  --project="$PROJECT_ID" \
-  --zone="$ZONE" \
-  --machine-type="$MACHINE_TYPE" \
-  --image-family=cos-stable \
-  --image-project=cos-cloud \
-  --boot-disk-size=50GB \
-  --tags=http-server,https-server \
-  --metadata-from-file=startup-script=scripts/vm-startup.sh
-
-# Open firewall for HTTP/HTTPS
-gcloud compute firewall-rules create allow-http-https \
-  --project="$PROJECT_ID" \
-  --allow=tcp:80,tcp:443 \
-  --target-tags=http-server,https-server \
-  --description="Allow HTTP and HTTPS for DataVisor"
-```
+## 3. Confusion Matrix Frontend -- Reuse Existing Component
 
-### VM startup script: `scripts/vm-startup.sh`
-
-```bash
-#!/usr/bin/env bash
-# Runs on first boot of Container-Optimized OS VM.
-# Installs Docker Compose and starts DataVisor.
-
-# COS already has Docker; install docker-compose plugin
-docker compose version || {
-    mkdir -p ~/.docker/cli-plugins/
-    curl -SL https://github.com/docker/compose/releases/latest/download/docker-compose-linux-x86_64 \
-        -o ~/.docker/cli-plugins/docker-compose
-    chmod +x ~/.docker/cli-plugins/docker-compose
-}
-
-# Clone repo and start
-cd /opt
-git clone https://github.com/YOUR_USER/data-visor.git
-cd data-visor
-
-# Create .env with auth credentials
-cat > .env <<EOF
-AUTH_USERNAME=admin
-AUTH_PASSWORD=CHANGE_ME_ON_DEPLOY
-EOF
-
-docker compose up -d --build
-```
+The existing `confusion-matrix.tsx` component accepts `matrix: number[][]` and `labels: string[]`. It is already format-agnostic. For classification:
 
-**Machine type recommendation:**
-- `e2-standard-4` (4 vCPU, 16GB RAM): Sufficient for DINOv2 embedding generation (CPU mode) and Moondream2 inference. ~$100/month.
-- For GPU (faster VLM/embedding inference): `n1-standard-4` + NVIDIA T4 GPU. ~$250/month. Add `--accelerator=type=nvidia-tesla-t4,count=1` to gcloud create.
-- For cost savings: Use preemptible/spot VMs (`--provisioning-model=SPOT`). DataVisor is stateful but can tolerate restarts.
-
-**Alternative: Local development script** (`scripts/run-local.sh`):
-```bash
-#!/usr/bin/env bash
-# Start DataVisor locally without Docker
-cd "$(dirname "$0")/.."
-uv run uvicorn app.main:app --host 0.0.0.0 --port 8000 &
-cd frontend && npm run dev &
-wait
-```
+- **No "background" row/column** -- classification has no concept of "no detection"
+- The backend simply returns labels without "background"
+- The component renders correctly without any changes
+
+**Zero frontend changes needed for the confusion matrix visualization.**
 
-### New dependencies: None (uses gcloud CLI, assumed available)
 ### Confidence: HIGH
-Source: [GCP Container-Optimized OS docs](https://cloud.google.com/container-optimized-os/docs), [GCP Compute Engine docs](https://cloud.google.com/compute/docs)
+Source: Direct inspection of `frontend/src/components/stats/confusion-matrix.tsx` (accepts generic `number[][]`).
 
 ---
 
-## 6. Annotation Editing (Canvas Library)
-
-### Recommended: react-konva 19.2.0 + konva 10.2.0
-
-| Library | Version | Purpose |
-|---------|---------|---------|
-| `konva` | 10.2.0 | HTML5 Canvas 2D framework with built-in Transformer (resize/rotate handles) |
-| `react-konva` | 19.2.0 | React 19 bindings for Konva (required: react-konva@19 for react@19) |
+## 4. Annotation Overlay -- Conditional Class Label Badge
 
-**Why react-konva over Fabric.js:**
+For classification datasets, there are no bounding boxes. The existing `AnnotationOverlay` renders SVG `<rect>` elements. For classification, render a class label badge instead.
 
-The existing v1.0 annotation overlay uses SVG (`<svg>` with `<rect>` elements) in read-only mode with `pointer-events-none`. For editing (move/resize/delete), we need interactive canvas elements. The decision is between:
+**Approach:** The `AnnotationOverlay` component branches on `bbox_w > 0`:
+- Detection annotations (`bbox_w > 0`): Render `<rect>` + label text (existing behavior)
+- Classification annotations (`bbox_w === 0`): Render a colored pill/badge with the class name in the top-left corner
 
-1. **react-konva** (recommended): First-class React integration via declarative JSX (`<Rect>`, `<Transformer>`, `<Stage>`). Works within React's component model. The `Transformer` component provides resize handles out of the box. react-konva@19.2.0 is explicitly built for React 19 -- the exact version this project uses.
+This reuses the existing SVG viewBox coordinate system and `getSourceColor()` utility. No new component needed -- extend the existing one with a conditional branch.
 
-2. **Fabric.js**: More built-in features (SVG export, rich text on canvas, image filters). But no official React wrapper -- requires imperative DOM manipulation via `useRef` + `useEffect`, fighting React's declarative model. The React ecosystem for Fabric.js is fragmented (multiple unofficial wrappers, none well-maintained).
+**Alternative considered:** Separate `ClassificationOverlay` component. Rejected because it would require duplicating the SVG viewBox setup and the caller would need to branch on dataset type before rendering. A single component with an internal branch is cleaner.
 
-3. **Keep SVG + make interactive**: SVG `<rect>` elements can be made draggable/resizable with mouse event handlers. However, SVG drag-and-resize requires manual coordinate math, hit-testing, and handle rendering. Canvas libraries solve this problem completely.
-
-**react-konva wins because:**
-- Declarative React API (`<Rect x={...} draggable onDragEnd={...} />`)
-- Built-in `Transformer` component for resize handles (no manual implementation)
-- react-konva@19 is verified compatible with React 19.2.3 and Konva 10.2.0
-- Already in the v1.0 STACK.md as the recommended canvas library (was not used in v1.0 because annotations were read-only, but the recommendation stands)
-- Used by multiple annotation tools in production (Konva docs have specific bounding box annotation examples)
+### Confidence: HIGH
+Source: Direct inspection of `frontend/src/components/grid/annotation-overlay.tsx`.
 
-**Architecture for annotation editing:**
+---
 
-The sample modal currently renders a full-resolution `<img>` with an SVG overlay. For editing mode:
+## 5. Database Schema -- `dataset_type` Column
 
-```
-Read-only mode (default):
-  <img> + <AnnotationOverlay> (existing SVG, unchanged)
-
-Edit mode (toggle):
-  <Stage> (Konva canvas, same dimensions as image)
-    <Layer>  (background image)
-      <Image> (Konva Image, not HTML img)
-    </Layer>
-    <Layer>  (annotations -- interactive)
-      <Rect draggable /> (for each bbox)
-      <Transformer />   (resize handles on selected rect)
-    </Layer>
-  </Stage>
-```
+Two approaches considered for storing classification annotations:
 
-This means the existing SVG overlay is PRESERVED for the read-only grid view (lightweight, no canvas overhead for 50+ thumbnails). The Konva canvas is only mounted in the sample modal when edit mode is toggled.
+**Option A (recommended): Sentinel bbox values (0,0,0,0) + `dataset_type` column**
+- Pro: No schema migration on `annotations` table, all existing queries work unchanged
+- Pro: The `dataset_type` column on `datasets` is the single dispatch point for all conditional logic
+- Con: Semantically imprecise (bbox columns have values that mean "not applicable")
 
-**SSR compatibility:** react-konva requires browser DOM. Use Next.js dynamic import:
-```typescript
-const AnnotationEditor = dynamic(
-  () => import("@/components/detail/annotation-editor"),
-  { ssr: false }
-);
-```
+**Option B (rejected): Nullable bbox columns**
+- Con: Breaks every existing query that assumes bbox is NOT NULL
+- Con: Requires extensive SQL changes across statistics, evaluation, error analysis, and filter builder
+- Con: DuckDB ALTER COLUMN to change NOT NULL constraints on existing data is non-trivial
 
-### Installation
+**Implementation:** One new column on `datasets`:
 
-```bash
-cd frontend
-npm install konva@^10.2.0 react-konva@^19.2.0
+```sql
+ALTER TABLE datasets ADD COLUMN IF NOT EXISTS dataset_type VARCHAR DEFAULT 'detection';
 ```
 
-### What NOT to use
+Values: `'detection'` or `'classification'`. Added in `duckdb_repo.py`'s `initialize_schema()` method, following the existing pattern of idempotent `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` migrations (already used for `prediction_count`, `tags`, `image_dir`).
 
-| Avoid | Why |
-|-------|-----|
-| Fabric.js | No official React wrapper. Imperative API fights React's model. |
-| paper.js | Scriptographer port -- legacy design, small community, no React bindings. |
-| pixi.js | WebGL-focused game renderer. Overkill for 2D bounding boxes. |
-| SVG-only editing | Manual coordinate math for drag/resize/handles. Reinventing what Konva provides. |
-| react-konva@18 | Incompatible with React 19. Must use react-konva@19.x. |
+**No changes to the `annotations` table schema.** Classification annotations store:
+- `category_name`: the class label (from JSONL `suffix` field)
+- `bbox_x=0, bbox_y=0, bbox_w=0, bbox_h=0`: sentinel values
+- `area=0`
+- `source='ground_truth'`
+- One annotation per sample (1:1 mapping, unlike detection's 1:N)
 
 ### Confidence: HIGH
-Sources: [react-konva npm](https://www.npmjs.com/package/react-konva) (v19.2.0 verified), [konva npm](https://www.npmjs.com/package/konva) (v10.2.0 verified), [Konva Transformer docs](https://konvajs.org/docs/react/Transformer.html), [Konva bounding box annotation example](https://blog.intzone.com/using-konva-js-to-annotate-image-with-bounding-boxes/)
+Source: Direct inspection of `app/repositories/duckdb_repo.py` schema and migration patterns.
 
 ---
 
-## 7. Keyboard Shortcuts
-
-### Recommended: react-hotkeys-hook 5.2.4
-
-| Library | Version | Purpose |
-|---------|---------|---------|
-| `react-hotkeys-hook` | 5.2.4 | Declarative keyboard shortcut hook for React |
-
-**Why react-hotkeys-hook:**
-
-- **Declarative hook API:** `useHotkeys('ctrl+s', () => save())` -- no setup, no providers, no configuration objects. Fits naturally into React functional components.
-- **Scoped shortcuts:** Can scope hotkeys to specific elements via ref. This is critical for annotation editing (arrow keys move selected bbox) vs grid view (arrow keys navigate grid).
-- **Active maintenance:** v5.2.4 published 9 days ago (as of research date). 692 dependent packages on npm.
-- **React 19 compatible:** Hook-based, no class components, no deprecated lifecycle methods.
-- **Modifier support:** `ctrl+z`, `shift+click`, `meta+s` (Mac Cmd), sequential keys (`g then i` for vim-style).
-- **Tiny:** ~3KB gzipped. No dependencies.
-
-**Alternatives considered:**
-
-| Library | Why Not |
-|---------|---------|
-| `cmdk` (v1.1.1) | Command palette component, not a shortcut framework. Complementary -- could use cmdk for Cmd+K command palette AND react-hotkeys-hook for shortcuts. But cmdk is NOT needed for v1.1 scope (just navigation shortcuts, not a command palette). |
-| `react-hotkeys` (v2.0.0) | Last published 6 years ago. Unmaintained. Uses deprecated class-component HOC pattern. |
-| `@react-hook/hotkey` | Minimal, but lacks scoping and modifier key combinations. |
-| Custom `useEffect` + `addEventListener` | Works for trivial cases, but becomes unmaintainable with 15+ shortcuts. No conflict resolution, no scoping, no disable/enable. |
-| `tinykeys` | Minimal (<1KB), no React-specific features. Would need custom wrapper for scoping and component lifecycle. |
-
-**Planned shortcut map (for roadmap context):**
-
-| Shortcut | Action | Scope |
-|----------|--------|-------|
-| `j` / `k` | Next / previous sample in modal | Modal |
-| `Escape` | Close modal / deselect annotation | Global |
-| `Delete` / `Backspace` | Delete selected annotation | Edit mode |
-| `Ctrl+Z` / `Cmd+Z` | Undo annotation edit | Edit mode |
-| `e` | Toggle edit mode in modal | Modal |
-| `1-5` | Quick-tag error category (FP/FN/LE etc.) | Triage mode |
-| `Space` | Toggle annotation visibility | Grid/Modal |
-| `g` | Toggle grid/embedding view | Global |
-| `f` | Focus filter search | Global |
-| `?` | Show keyboard shortcuts help | Global |
-
-### Installation
-
-```bash
-cd frontend
-npm install react-hotkeys-hook@^5.2.4
-```
-
-### Confidence: HIGH
-Sources: [react-hotkeys-hook npm](https://www.npmjs.com/package/react-hotkeys-hook) (v5.2.4 verified), [react-hotkeys-hook docs](https://react-hotkeys-hook.vercel.app/)
-
----
-
-## 8. Dataset Ingestion UI
-
-### Recommended: No new libraries needed
+## 6. Format Auto-Detection -- Extend `FolderScanner`
 
-The smart ingestion UI (point at folder, auto-detect structure, import) is a frontend UX feature with backend logic. No new dependencies:
+The current `FolderScanner` only detects COCO JSON. For classification, add a parallel detection path:
 
-**Backend (Python):**
-- `pathlib` (stdlib) + `fsspec`/`gcsfs` (already installed) for directory traversal and structure detection.
-- The existing `app/ingestion/` module already handles COCO JSON parsing via `ijson`. Extend it with a structure detection service.
-- DuckDB (already installed) for storing split metadata.
+| Format | File Extension | Detection Heuristic |
+|--------|---------------|-------------------|
+| COCO JSON | `.json` | Top-level key `"images"` (existing) |
+| Classification JSONL | `.jsonl` | First line parses as JSON with `"image"` + `"suffix"` keys |
 
-**Frontend (TypeScript):**
-- Stepper/wizard UI for the import flow: built with existing Tailwind CSS components. No component library needed.
-- File tree visualization: flat list with indentation (CSS-only), or simple recursive component. No tree library needed for the scope (showing detected splits, not a full file explorer).
+**New method:** `_is_classification_jsonl(file_path: Path) -> bool`:
+1. Check file extension is `.jsonl`
+2. Read first line
+3. Parse with `json.loads()`
+4. Verify required keys: `image`, `suffix`
 
-**What the ingestion detector does (backend service):**
-1. Accept a root path (local or GCS).
-2. Walk directory structure looking for patterns:
-   - `train/`, `val/`, `test/` subdirectories -> split detection
-   - `images/` + `annotations/` or `labels/` -> format detection
-   - `*.json` files -> check for COCO format keys (`images`, `annotations`, `categories`)
-   - `*.txt` files alongside images -> YOLO format detection
-3. Return a detection result: `{ format: "coco", splits: ["train", "val"], annotation_file: "path/to/instances.json", image_dirs: [...] }`
-4. User confirms/adjusts in UI, then triggers import with the validated config.
+**Layout detection:** Classification JSONL datasets follow the same Roboflow layout patterns already detected (Layout B: split dirs with co-located files). The scanner looks for `.jsonl` files instead of `.json` files within split directories.
 
-### What NOT to add
+**The `ScanResult.format` field** changes from always `"coco"` to `"coco" | "classification_jsonl"`. This propagates through:
+- `IngestRequest.format` (already supports arbitrary strings)
+- `DatasetResponse.format` (already a `str` field)
+- `datasets.format` column in DuckDB (already `VARCHAR`)
 
-| Avoid | Why |
-|-------|-----|
-| `react-dropzone` | DataVisor reads from server-side paths (local disk or GCS), not browser file uploads. The user provides a path, not drag-and-drop files. |
-| `react-arborist` or `react-complex-tree` | Over-engineered for showing a simple detected folder structure. A flat list with indentation suffices. |
-| Any upload library | Images stay on disk/GCS. DataVisor reads them in-place. No upload flow needed. |
-
-### Confidence: HIGH (no new dependencies, pure application logic)
+### Confidence: HIGH
+Source: Direct inspection of `app/services/folder_scanner.py` and `app/models/scan.py`.
 
 ---
 
-## 9. Error Triage Workflow
+## 7. Evaluation API Response -- Discriminated Union
 
-### Recommended: No new libraries needed
+The `/datasets/{id}/evaluation` endpoint currently returns `EvaluationResponse` (mAP, PR curves, etc). For classification, it needs a different shape.
 
-The error triage workflow (tag errors, highlight/dim, rank worst images) builds on existing infrastructure:
+**Approach:** New response model with `dataset_type` discriminator:
 
-**Backend:**
-- DuckDB already stores error categories (`error_type` column in annotations/analysis tables).
-- Extend with a `triage_status` column (enum: `unreviewed`, `confirmed`, `dismissed`, `needs_review`).
-- "Worst images" ranking: composite score from existing DuckDB data (error count + confidence spread + embedding outlier distance). Pure SQL aggregation query.
+```python
+class ClassificationEvaluationResponse(BaseModel):
+    dataset_type: Literal["classification"] = "classification"
+    accuracy: float
+    macro_f1: float
+    weighted_f1: float
+    per_class_metrics: list[ClassificationPerClassMetrics]
+    confusion_matrix: list[list[int]]
+    confusion_matrix_labels: list[str]
+
+class ClassificationPerClassMetrics(BaseModel):
+    class_name: str
+    precision: float
+    recall: float
+    f1: float
+    support: int  # number of GT samples for this class
+```
 
-**Frontend:**
-- Tag buttons (FP/FN/Label Error/Confirmed): Tailwind-styled buttons + existing `apiPatch` calls.
-- Highlight/dim: CSS opacity on grid cells based on triage status. Already have the pattern from source discriminator (GT vs predictions solid/dashed lines).
-- Keyboard shortcuts for quick-tagging: covered by react-hotkeys-hook (section 7).
-- Progress indicator (e.g., "47/312 errors triaged"): simple counter from DuckDB aggregation.
+**Endpoint dispatch:** The router queries `dataset_type` from the `datasets` table and calls the appropriate evaluation function:
 
-### What NOT to add
+```python
+dataset_type = cursor.execute(
+    "SELECT dataset_type FROM datasets WHERE id = ?", [dataset_id]
+).fetchone()[0]
+
+if dataset_type == "classification":
+    return compute_classification_evaluation(cursor, dataset_id, source, conf_threshold, split=split)
+else:
+    return compute_evaluation(cursor, dataset_id, source, iou_threshold, conf_threshold, split=split)
+```
 
-| Avoid | Why |
-|-------|-----|
-| `@dnd-kit` (drag-and-drop) | Not needed. Triage is tag-based, not drag-and-drop-based. |
-| Any kanban/board library | Triage is not a kanban workflow. It is sequential review with keyboard shortcuts. |
-| Separate triage database | DuckDB already handles this. Adding a column to existing tables is sufficient. |
+**Alternative considered:** Extending the existing `EvaluationResponse` with optional fields. Rejected because the metrics are fundamentally different -- a single merged model with many optional fields would be confusing and error-prone for the frontend to consume.
 
-### Confidence: HIGH (no new dependencies, extends existing patterns)
+### Confidence: HIGH
+Source: Direct inspection of `app/models/evaluation.py` and `app/routers/statistics.py`.
 
 ---
 
-## 10. Caddy Reverse Proxy
-
-### Recommended: caddy:2-alpine Docker image
-
-| Technology | Version | Purpose |
-|------------|---------|---------|
-| Caddy | 2.x (latest `caddy:2-alpine`) | Reverse proxy, automatic HTTPS, basic auth |
+## 8. Frontend Evaluation Panel -- Conditional Rendering
 
-**Why Caddy over Nginx/Traefik:**
+The existing `evaluation-panel.tsx` renders PR curves, mAP summary, per-class AP table, and confusion matrix. For classification datasets, it renders a different set of components:
 
-| Criterion | Caddy | Nginx | Traefik |
-|-----------|-------|-------|---------|
-| Automatic HTTPS | Built-in, zero-config Let's Encrypt | Requires certbot + cron | Built-in but more complex config |
-| Config simplicity | 10-line Caddyfile | 30+ line nginx.conf | YAML/TOML, label-based discovery |
-| Basic auth | Built-in directive | Requires htpasswd file | Middleware config |
-| Docker image size | ~40MB (alpine) | ~25MB (alpine) | ~100MB |
-| Learning curve | Minimal | Moderate | Moderate |
+**Detection evaluation panel (existing, unchanged):**
+- IoU threshold slider
+- Confidence threshold slider
+- mAP@50/75/50:95 summary cards
+- PR curves (Recharts line chart)
+- Per-class AP table
+- Confusion matrix
 
-For a single-VM, single-user tool, Caddy's automatic HTTPS alone is decisive. No certbot setup, no renewal cron, no debugging expired certificates.
-
-**Caddyfile:**
-
-```
-{$DOMAIN:localhost} {
-    basic_auth {
-        {$AUTH_USERNAME:admin} {$AUTH_PASSWORD_HASH}
-    }
-
-    handle /api/* {
-        uri strip_prefix /api
-        reverse_proxy backend:8000
-    }
-
-    handle {
-        reverse_proxy frontend:3000
-    }
-}
-```
+**Classification evaluation panel (new conditional branch):**
+- Confidence threshold slider (only if predictions have confidence)
+- Accuracy + Macro F1 + Weighted F1 summary cards
+- Per-class P/R/F1 table (reuse the same table component, different columns)
+- Confusion matrix (reuse existing component)
 
-**For local development (no Caddy):** Access backend at `localhost:8000` and frontend at `localhost:3000` directly. Caddy is only needed in the Docker Compose deployment on the VM.
+**No new chart types needed.** The per-class table uses the same Tailwind-styled table pattern. Summary cards use the same card component. The confusion matrix component is reused directly. The PR curve chart is simply not rendered for classification datasets.
 
-**HTTPS for local dev:** Not needed. Caddy only runs in the cloud deployment. For local development, HTTP is fine.
+**Dispatch mechanism:** The `DatasetResponse` type gets a `dataset_type` field. The evaluation panel checks this field and renders the appropriate metrics. The classification evaluation response also carries its own `dataset_type` discriminator (see section 7).
 
-### Configuration: Caddyfile at project root
 ### Confidence: HIGH
-Sources: [Caddy Docker image](https://hub.docker.com/_/caddy), [Caddy reverse proxy quickstart](https://caddyserver.com/docs/quick-starts/reverse-proxy), [Caddy basic_auth directive](https://caddyserver.com/docs/caddyfile/directives/basic_auth)
+Source: Direct inspection of `frontend/src/components/stats/evaluation-panel.tsx` and `frontend/src/types/evaluation.ts`.
 
 ---
 
-## Complete New Dependencies Summary
+## 9. Backend Parser Factory -- Dispatch Pattern
 
-### Frontend (npm)
+The ingestion service currently hardcodes `COCOParser`. It needs a factory:
 
-```bash
-cd frontend
-npm install konva@^10.2.0 react-konva@^19.2.0 react-hotkeys-hook@^5.2.4
+```python
+def get_parser(format: str) -> BaseParser:
+    if format == "coco":
+        return COCOParser(batch_size=1000)
+    elif format == "classification_jsonl":
+        return ClassificationParser(batch_size=1000)
+    raise ValueError(f"Unknown format: {format}")
 ```
 
-| Package | Version | Size (gzipped) | Purpose |
-|---------|---------|----------------|---------|
-| `konva` | 10.2.0 | ~65KB | Canvas 2D framework |
-| `react-konva` | 19.2.0 | ~8KB | React 19 bindings for Konva |
-| `react-hotkeys-hook` | 5.2.4 | ~3KB | Keyboard shortcuts |
-
-**Total new frontend footprint:** ~76KB gzipped. Minimal impact.
-
-### Backend (Python)
-
+The `IngestionService.ingest_with_progress()` method changes from:
+```python
+parser = COCOParser(batch_size=1000)
 ```
-No new Python dependencies.
+to:
+```python
+parser = get_parser(format)
 ```
 
-Auth uses FastAPI's built-in `fastapi.security.HTTPBasic`. Docker uses the existing `uvicorn`. Ingestion detection uses `pathlib` + existing `fsspec`. Triage uses existing DuckDB columns.
-
-### Infrastructure (Docker images)
-
-| Image | Size | Purpose |
-|-------|------|---------|
-| `caddy:2-alpine` | ~40MB | Reverse proxy + HTTPS + auth |
-| `python:3.14-slim` | ~150MB | Backend base image |
-| `node:22-alpine` | ~180MB | Frontend build base |
+The rest of the ingestion flow (batch inserts, thumbnail generation, plugin hooks) works unchanged because all parsers implement the same `BaseParser` interface.
 
----
-
-## What NOT to Add (and Why)
+**dataset_type assignment:** When creating the dataset record, set `dataset_type` based on format:
+```python
+dataset_type = "classification" if format == "classification_jsonl" else "detection"
+```
 
-| Technology | Why Skip |
-|------------|----------|
-| **Qdrant Docker container** | Already using local mode (`QdrantClient(path=...)`). Works in-process. No server needed for single-user. |
-| **Gunicorn** | Single-user tool. One uvicorn worker is sufficient. Gunicorn adds process management complexity for zero benefit. |
-| **Nginx** | Caddy is simpler for this use case (auto HTTPS, built-in basic auth). |
-| **Terraform** | One VM. Shell script with `gcloud` CLI is simpler and more auditable. |
-| **JWT/OAuth2/Auth0** | Single-user tool. HTTP Basic Auth is sufficient. JWT adds token lifecycle management for one user. |
-| **Fabric.js** | No official React wrapper. Imperative API fights React's declarative model. |
-| **cmdk** | Command palette is a nice-to-have, not needed for v1.1 keyboard shortcuts. Could add in v1.2. |
-| **react-dropzone** | DataVisor reads server-side paths, not browser uploads. |
-| **Kubernetes/Cloud Run** | One VM running Docker Compose. K8s is massive overkill. Cloud Run does not support persistent volumes for DuckDB/Qdrant. |
-| **PostgreSQL** | DuckDB is sufficient. PostgreSQL adds a container, connection management, and migrations for no analytical query benefit. |
-| **Redis** | No caching layer needed. DuckDB queries are fast enough for single-user. No session store needed (HTTP Basic Auth is stateless). |
-| **shadcn/ui** | Was recommended in v1.0 STACK.md but was not used -- v1.0 shipped with hand-written Tailwind components. The existing component patterns are consistent and sufficient. Adding shadcn/ui now would create style inconsistency between old and new components. |
+### Confidence: HIGH
+Source: Direct inspection of `app/services/ingestion.py` and `app/ingestion/base_parser.py`.
 
 ---
 
-## Integration Points with Existing Stack
-
-### Backend auth integration
-
-Add auth dependency to FastAPI routers. The existing router pattern (`app.include_router(...)`) supports `dependencies` parameter for blanket protection:
+## 10. Error Analysis Adaptation
 
-```python
-# In main.py -- protect all API routes
-if get_settings().auth_enabled:
-    from app.auth import verify_credentials
-    for router in [datasets.router, samples.router, ...]:
-        router.dependencies.append(Depends(verify_credentials))
-```
+The existing `error_analysis.py` categorizes detection errors using IoU matching (TP, Hard FP, Label Error, FN). For classification, this simplifies to:
 
-Or use FastAPI middleware for simpler blanket auth (preferred for single-user):
+- **Correct:** GT label == predicted label
+- **Incorrect:** GT label != predicted label (with the specific GT/predicted pair recorded)
 
-```python
-# Middleware approach -- every request except /health
-@app.middleware("http")
-async def auth_middleware(request, call_next):
-    if request.url.path == "/health":
-        return await call_next(request)
-    if not settings.auth_enabled:
-        return await call_next(request)
-    # Validate Basic auth header...
-```
+No IoU matching, no bounding box comparison. The classification error analysis is a simple label comparison per image.
 
-### Frontend API calls with auth
+**Implementation:** New function `classify_errors()` in `app/services/classification_evaluation.py` (colocated with classification metrics, not in `error_analysis.py` which is detection-specific).
 
-The existing `apiFetch` function in `lib/api.ts` needs auth header injection when deployed with auth. Two approaches:
+**Existing `ErrorAnalysisResponse` model reuse:** The response shape can be simplified but the same pattern works -- `ErrorSummary` with counts, `PerClassErrors` with per-class breakdowns, and `samples_by_type` grouping.
 
-**Option A (recommended): Caddy handles auth, frontend unchanged.**
-Caddy validates Basic Auth at the edge. If credentials are valid, requests pass through to backend. The frontend never sees or sends auth headers -- Caddy strips them. This means ZERO frontend changes for auth.
+### Confidence: HIGH
+Source: Direct inspection of `app/services/error_analysis.py`.
 
-**Option B (direct backend auth): Add credentials to fetch.**
-If accessing the backend directly (without Caddy), add `Authorization` header:
-```typescript
-const headers: HeadersInit = { "Content-Type": "application/json" };
-if (process.env.NEXT_PUBLIC_AUTH_ENABLED === "true") {
-    headers["Authorization"] = `Basic ${btoa(`${username}:${password}`)}`;
-}
-```
+---
 
-**Recommendation: Option A.** Caddy-level auth means the frontend code stays unchanged. The browser's native Basic Auth dialog handles credential entry. No login page, no token storage, no auth state management.
+## Complete Stack Summary
 
-### Konva integration with existing modal
+### New Files to Create
 
-The existing `SampleModal` renders `<AnnotationOverlay>` (SVG) over an `<img>`. The edit mode wraps the same image + annotations in a Konva `<Stage>`:
+| File | Purpose | Dependencies Used |
+|------|---------|-------------------|
+| `app/ingestion/classification_parser.py` | Parse JSONL format | `json` stdlib, `pandas`, `PIL.Image` |
+| `app/services/classification_evaluation.py` | Accuracy/F1/confusion matrix | `sklearn.metrics` |
+| `app/models/classification_evaluation.py` | Response models for classification eval | `pydantic` |
 
-```
-SampleModal
-  |-- read-only mode: <img> + <AnnotationOverlay> (existing, unchanged)
-  |-- edit mode: <AnnotationEditor> (new, lazy-loaded Konva component)
-        |-- <Stage>
-              |-- <Layer> <Image /> </Layer>  (background)
-              |-- <Layer> <Rect draggable /> ... <Transformer /> </Layer>  (editable annotations)
-```
+### Files to Modify
 
-The annotation data model stays the same (`bbox_x`, `bbox_y`, `bbox_w`, `bbox_h`). Konva `Rect` coordinates map directly. On save, updated coordinates are sent to the backend via existing `apiPatch` pattern.
+| File | Change | Scope |
+|------|--------|-------|
+| `app/repositories/duckdb_repo.py` | Add `dataset_type` column migration | 3 lines |
+| `app/services/ingestion.py` | Parser factory dispatch | ~10 lines |
+| `app/services/folder_scanner.py` | JSONL format detection | ~30 lines |
+| `app/models/dataset.py` | Add `dataset_type` to `DatasetResponse` | 1 line |
+| `app/models/scan.py` | Allow `format` values beyond `"coco"` | Already supports it |
+| `app/routers/statistics.py` | Dispatch evaluation by dataset_type | ~15 lines |
+| `frontend/src/types/evaluation.ts` | Add classification eval types | ~20 lines |
+| `frontend/src/components/stats/evaluation-panel.tsx` | Conditional rendering | ~50 lines |
+| `frontend/src/components/grid/annotation-overlay.tsx` | Class label badge for classification | ~15 lines |
 
-### Keyboard shortcuts integration with Zustand stores
+### No New Backend Dependencies
 
-react-hotkeys-hook calls Zustand store actions directly:
+The `pyproject.toml` does not change. Everything is already installed:
+- `scikit-learn>=1.8.0` -- classification metrics
+- `pillow>=12.1.1` -- image dimension reading
+- `pandas>=3.0.0` -- DataFrame batching
+- `duckdb>=1.4.4` -- schema and queries
 
-```typescript
-useHotkeys("j", () => useUIStore.getState().selectNextSample());
-useHotkeys("escape", () => useUIStore.getState().closeDetailModal());
-useHotkeys("e", () => useUIStore.getState().toggleEditMode());
-```
+### No New Frontend Dependencies
 
-No new store needed. Extend existing `useUIStore` with edit mode state and triage shortcuts.
+The `package.json` does not change:
+- `recharts>=3.7.0` -- existing charts (no new chart types; the PR curve chart is simply not rendered for classification)
+- React + Tailwind -- conditional rendering
+- Existing SVG overlay -- class label badges
 
 ---
 
-## Version Compatibility Matrix
+## Alternatives Considered
 
-| Package A | Package B | Compatibility | Notes |
-|-----------|-----------|---------------|-------|
-| react-konva@19.2.0 | react@19.2.3 | Verified | react-konva@19 requires React 19 |
-| react-konva@19.2.0 | konva@10.2.0 | Verified | Works out of the box since konva@10 |
-| react-hotkeys-hook@5.2.4 | react@19.2.3 | Verified | Hook-based, React 19 compatible |
-| konva@10.2.0 | Next.js 16.1.6 | Verified | Must use `dynamic()` with `{ ssr: false }` |
-| caddy:2-alpine | Docker Compose v2 | Verified | Standard Docker image |
-| python:3.14-slim | uv (latest) | Verified | uv supports Python 3.14 |
-| node:22-alpine | Next.js 16.1.6 | Verified | Next.js 16 supports Node 22 |
+| Decision | Chosen | Alternative | Why Not |
+|----------|--------|-------------|---------|
+| JSONL parsing | `json.loads()` per line | `ijson` streaming | JSONL is line-delimited; `json.loads` per line is simpler and equally streaming |
+| Classification metrics | `sklearn.metrics` | Custom numpy | sklearn already installed; classification metrics are trivial |
+| Schema approach | Sentinel bbox (0,0,0,0) + `dataset_type` | Nullable bbox columns | Would break all existing detection queries |
+| Confusion matrix UI | Reuse `confusion-matrix.tsx` | New classification component | Existing component is format-agnostic |
+| Evaluation response | Separate `ClassificationEvaluationResponse` | Extend `EvaluationResponse` with optional fields | Metrics are fundamentally different; union with optionals is confusing |
+| Class label overlay | Extend existing `AnnotationOverlay` | Separate `ClassificationOverlay` | Single component with conditional branch is cleaner |
+| Image dimensions | `PIL.Image.open().size` | `pyvips` | PIL header-only read is sufficient and simpler |
 
 ---
 
-## Risks and Mitigations
+## What NOT to Add
 
-| Risk | Likelihood | Impact | Mitigation |
-|------|-----------|--------|------------|
-| DuckDB single-writer lock inside Docker | Low | Medium | Already handled -- single-user tool, one uvicorn worker. No concurrent writes. |
-| Konva Transformer quirks with scaled coordinates | Medium | Low | Konva `Transformer` works in local coordinate space. Use `scaleX/scaleY` or resize via `boundBoxFunc`. Konva docs have specific examples. |
-| torch Docker image size (~4GB) | High | Low | Accept the size. Use multi-stage build to exclude dev deps. Pin torch version in lockfile to avoid pulling larger versions. Consider `torch-cpu` package if no GPU needed. |
-| `NEXT_PUBLIC_API_URL` baked at build time | Medium | Medium | For Docker Compose, set via build arg. For changing API URLs without rebuild, use Next.js middleware or runtime config (but adds complexity). Keep it simple: rebuild frontend container when API URL changes. |
-| Caddy Let's Encrypt rate limits | Low | Medium | Only an issue if deploying/redeploying many times to different domains. For a single stable domain, no problem. Use staging CA for testing. |
+| Technology | Why Skip |
+|------------|----------|
+| **Any new pip package** | `scikit-learn` + `pillow` already cover all needs |
+| **Any new npm package** | Existing Recharts + Tailwind + SVG cover all visualization needs |
+| **`supervision` for classification** | supervision is detection-focused (IoU, mAP). Classification metrics come from sklearn |
+| **`torchmetrics`** | Would add a PyTorch-ecosystem dependency for metrics sklearn already provides |
+| **Separate classification database table** | Annotations table with sentinel bbox values works cleanly with `dataset_type` dispatch |
+| **GraphQL or new API layer** | REST endpoints with discriminated response types are sufficient |
+| **New chart library** | No new chart types needed -- classification uses tables and the existing confusion matrix |
 
 ---
 
 ## Sources
 
-### Verified (HIGH confidence)
-- [FastAPI HTTP Basic Auth docs](https://fastapi.tiangolo.com/advanced/security/http-basic-auth/) -- auth pattern
-- [Next.js standalone output docs](https://nextjs.org/docs/app/api-reference/config/next-config-js/output) -- Docker build
-- [react-konva npm](https://www.npmjs.com/package/react-konva) -- v19.2.0, React 19 compatibility
-- [konva npm](https://www.npmjs.com/package/konva) -- v10.2.0
-- [react-hotkeys-hook npm](https://www.npmjs.com/package/react-hotkeys-hook) -- v5.2.4
-- [Konva Transformer docs](https://konvajs.org/docs/react/Transformer.html) -- resize/rotate handles
-- [Caddy Docker image](https://hub.docker.com/_/caddy) -- caddy:2-alpine
-- [Caddy reverse proxy docs](https://caddyserver.com/docs/quick-starts/reverse-proxy)
-- [Caddy basic_auth docs](https://caddyserver.com/docs/caddyfile/directives/basic_auth)
-- [FastAPI Docker deployment](https://fastapi.tiangolo.com/deployment/docker/)
-- [Qdrant local mode](https://deepwiki.com/qdrant/qdrant-client/2.2-local-mode) -- path-based client
-- [GCP Container-Optimized OS](https://cloud.google.com/container-optimized-os/docs)
-
-### Cross-referenced (MEDIUM confidence)
-- [Konva bounding box annotation tutorial](https://blog.intzone.com/using-konva-js-to-annotate-image-with-bounding-boxes/)
-- [DevMuscle Konva annotation tool](https://devmuscle.com/blog/react-konva-image-annotation)
-- [Caddy Docker HTTPS guide (Feb 2026)](https://oneuptime.com/blog/post/2026-02-08-how-to-run-caddy-with-docker-and-automatic-https-wildcard-certificates/view)
+- **Existing codebase** (HIGH confidence): Direct inspection of all referenced files
+- **scikit-learn metrics**: Already validated in project dependencies, functions stable across versions
+- **Roboflow JSONL format**: Project context provided by user with sample data
+- **DuckDB ALTER TABLE patterns**: Already used 4 times in `duckdb_repo.py` for idempotent migrations
 
 ---
-*Stack research for: DataVisor v1.1 -- Deployment, Workflow & Competitive Parity*
-*Researched: 2026-02-12*
+*Stack research for: DataVisor Classification Dataset Support*
+*Researched: 2026-02-18*
diff --git a/.planning/research/SUMMARY.md b/.planning/research/SUMMARY.md
index 310d408..9d0a9c4 100644
--- a/.planning/research/SUMMARY.md
+++ b/.planning/research/SUMMARY.md
@@ -1,624 +1,247 @@
-# Project Research Summary: DataVisor v1.1
+# Project Research Summary
 
-**Project:** DataVisor v1.1 — Deployment, Workflow & Competitive Parity
-**Milestone Focus:** Docker deployment, single-user auth, smart dataset ingestion, annotation editing, error triage workflow, keyboard shortcuts
-**Researched:** 2026-02-12
-**Overall Confidence:** HIGH
-
----
+**Project:** DataVisor - Classification Dataset Support (v1.2)
+**Domain:** Single-label image classification integration into existing detection-centric CV dataset tool
+**Researched:** 2026-02-18
+**Confidence:** HIGH
 
 ## Executive Summary
 
-DataVisor v1.1 builds on a proven v1.0 foundation (12,720 LOC, 59 tests) to add production deployment and competitive features. Research across stack, features, architecture, and pitfalls reveals a clear path: **prioritize Docker deployment with Caddy reverse proxy, then layer on smart ingestion and error triage workflows**.
+DataVisor is adding single-label classification dataset support to an existing detection-focused architecture. The research finding that most shapes this work is that zero new dependencies are required. The existing stack (scikit-learn, Pillow, DuckDB, Recharts, Tailwind, SVG overlays) already covers every classification need. The complexity is entirely in the plumbing -- threading a `dataset_type` discriminator through every layer of the stack: schema, ingestion parsers, evaluation services, API response models, and frontend rendering. This is not a net-new feature; it is a well-scoped extension of an established codebase.
 
-The recommended approach uses **three-service Docker Compose** (backend, frontend, caddy) with Qdrant remaining in local embedded mode. This simplifies deployment while maintaining single-user focus. Auth is handled at two layers: Caddy's basic_auth for edge protection plus FastAPI dependency injection for API-level defense in depth. The smart ingestion UI uses a folder scanner service that detects COCO/YOLO structures and presents suggestions for user confirmation. Annotation editing adds react-konva ONLY in the detail modal (keeping SVG for grid read-only overlays). The error triage workflow extends the existing error analysis system with DuckDB persistence and a focused keyboard-driven review mode.
+The recommended approach is `dataset_type`-gated dispatch: add a `dataset_type VARCHAR DEFAULT 'detection'` column to the `datasets` table, detect the JSONL format in `FolderScanner`, dispatch to a new `ClassificationParser` in `IngestionService`, branch evaluation at the router level into a separate `compute_classification_evaluation()` function (scikit-learn accuracy/F1/confusion matrix -- no IoU), and conditionally render class label badges instead of SVG bbox overlays in the frontend. This is clean, does not touch existing detection code paths, and leaves existing detection datasets completely unaffected by the migration.
 
-**Critical architectural decisions validated:**
-1. **Keep Qdrant in local mode for Docker** — single-user workload does not justify server mode complexity. Conditional client initialization supports both modes.
-2. **Caddy over nginx for reverse proxy** — automatic HTTPS via Let's Encrypt, built-in basic_auth, simpler config for single-VM deployment.
-3. **react-konva for annotation editing** — v19.2.0 explicitly supports React 19. Konva Transformer provides resize handles out of the box.
-4. **FastAPI HTTPBasic dependency injection** — more testable and composable than middleware for single-user auth.
+The top risk is schema design: whether bbox columns in the `annotations` table are made nullable or filled with sentinel values (0,0,0,0), every downstream consumer must defend against null/zero bboxes -- and there are 30+ bbox references across the codebase. The recommended approach is sentinel values (0.0, never NULL) combined with `dataset_type`-aware dispatch, so that detection evaluation and bbox-dependent rendering are never called for classification datasets. The alternative (a separate `classifications` table) is architecturally cleaner but adds parallel query paths in every service. Both are viable; the implementation team must decide before writing any ingestion code, because changing the choice later is a rewrite.
 
-**Key risks addressed upfront:**
-- **DuckDB WAL files lost on container restart** (P1) — mount entire data/ directory, add CHECKPOINT on shutdown
-- **NEXT_PUBLIC_API_URL baked at build time** (P3) — use Caddy reverse proxy to serve frontend and API from same origin
-- **Basic auth over HTTP exposes credentials** (P4) — Caddy handles HTTPS automatically
-- **SVG-to-Canvas coordinate mismatch** (P5) — keep SVG for read-only, use Konva ONLY for edit mode with explicit coordinate conversion
+## Key Findings
 
-With these patterns, v1.1 delivers GCP-deployable production quality while adding FiftyOne/Encord competitive features (smart ingestion, error triage, annotation editing) that the existing v1.0 architecture supports naturally.
+### Recommended Stack
 
----
+Classification support requires no new libraries. The work is entirely architectural -- extending parsers, adapting the DB schema, branching evaluation logic, and conditionally rendering overlays. Python's built-in `json.loads()` per line handles JSONL (line-delimited JSON); `ijson` streaming is unnecessary. scikit-learn's `classification_report()` and `confusion_matrix()` cover all metric needs. The existing `confusion-matrix.tsx` component accepts generic `number[][]` and needs no changes. Annotation overlays branch on `bbox_w === 0` (or `datasetType === "classification"`) to render a class label badge vs an SVG rect. No new `pip` packages. No new `npm` packages.
 
-## Key Findings
+**Core technologies (no changes to existing dependencies):**
+- `json` stdlib: JSONL parsing -- line-by-line `json.loads()`, no streaming library needed
+- `scikit-learn>=1.8.0`: Accuracy, Macro/Weighted F1, per-class P/R/F1, confusion matrix -- already installed
+- `pillow>=12.1.1`: Image dimension reading for classification JSONL (which omits width/height) -- already installed
+- `duckdb>=1.4.4`: One new column (`dataset_type`) via idempotent `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` -- follows existing migration pattern
+- `recharts>=3.7.0` + React/Tailwind: Conditional rendering in evaluation panel and overlays -- no new chart types
 
-### From STACK.md: Technologies for v1.1
-
-The v1.1 stack extends v1.0 with deployment and UX libraries. All choices are validated against official documentation and production usage.
-
-**Docker deployment stack:**
-- **Docker Compose** — three services: backend (FastAPI + DuckDB + Qdrant local), frontend (Next.js standalone), caddy (reverse proxy + HTTPS + auth)
-- **Caddy 2-alpine** — automatic HTTPS, built-in basic_auth, 10-line Caddyfile vs nginx's verbose config
-- **Qdrant local mode** — existing `QdrantClient(path=...)` works in Docker via volume mount. No server container needed for <1M vectors single-user.
-- **Python 3.14-slim base** — avoids musl compilation issues (alpine would break torch/numpy wheels)
-- **Node 22-alpine** — Next.js 16 standalone output reduces image from ~1GB to ~150MB
-
-**Authentication stack:**
-- **FastAPI HTTPBasic** (built-in) — zero new dependencies, works with Caddy's basic_auth for defense in depth
-- **Caddy basic_auth directive** — edge protection with bcrypt password hashing via `caddy hash-password`
-
-**Frontend interaction stack:**
-- **react-konva 19.2.0 + konva 10.2.0** — Canvas-based bbox editing with Transformer (resize/rotate handles). v19 explicitly for React 19.
-- **react-hotkeys-hook 5.2.4** — declarative keyboard shortcuts with scoping. 3KB, actively maintained (published 9 days ago).
-
-**What NOT to add:**
-- Qdrant server container — local mode sufficient, eliminates complexity
-- Gunicorn — single-user tool, one uvicorn worker is enough
-- JWT/OAuth2 — HTTP Basic Auth sufficient for personal deployment
-- Nginx — Caddy's auto-HTTPS wins decisively for single-VM
-- Fabric.js — no official React wrapper, imperative API fights React model
-- cmdk — command palette is nice-to-have, not needed for v1.1 shortcuts
-
-**New frontend dependencies (total ~76KB gzipped):**
-```
-npm install konva@^10.2.0 react-konva@^19.2.0 react-hotkeys-hook@^5.2.4
-```
-
-**Backend: ZERO new Python dependencies** — auth uses FastAPI built-ins, smart ingestion uses pathlib + existing fsspec
-
-**Confidence:** HIGH — every dependency verified via official docs and npm/PyPI version checks
-
-### From FEATURES.md: Competitive Gap Analysis
-
-Competitive analysis of DataVisor vs FiftyOne (Voxel51) and Encord reveals 16 features across 6 categories. v1.1 closes table stakes gaps while adding differentiators.
-
-**MUST build for v1.1 (16 features):**
-
-| Feature | Priority | Complexity | Competitor Reference |
-|---------|----------|------------|---------------------|
-| YOLO + VOC format import | Table Stakes | Medium | FiftyOne supports 15+ formats; missing YOLO is critical gap |
-| Train/val/test split handling | Table Stakes | Medium | FiftyOne tags samples with split; every real dataset has splits |
-| Smart folder detection UI | Differentiator | Medium | Neither FiftyOne nor Encord auto-detects — opportunity to leapfrog |
-| Dataset export (COCO, YOLO) | Table Stakes | Medium | FiftyOne exports to all import formats; completes curation loop |
-| Bbox editing (move/resize/delete) | Table Stakes | High | Encord has full editor; FiftyOne delegates to CVAT. DataVisor targets "quick corrections only" |
-| Interactive confusion matrix | Table Stakes | High | FiftyOne's killer feature — click cell to filter to GT/pred pairs |
-| Near-duplicate detection | Table Stakes | Low | FiftyOne Brain + Encord Active both provide this |
-| Image quality metrics | Table Stakes | Low | Brightness, sharpness, contrast for AI agent |
-| Error triage mode | Differentiator | Medium | FiftyOne is programmatic; Encord is multi-stage. Keyboard-driven review is faster |
-| Worst images composite ranking | Differentiator | Medium | Neither competitor has single composite "badness" score |
-| Docker deployment | Table Stakes | Medium | FiftyOne Enterprise has Helm chart; OSS has Dockerfile |
-| Basic auth | Table Stakes | Low | FiftyOne OSS has none; Enterprise has full RBAC |
-| Deployment scripts (local + GCP) | Table Stakes | Low-Medium | FiftyOne provides SSH tunnel; DataVisor targets cloud VM |
-| Keyboard shortcuts (Tier 1) | Table Stakes | Medium | FiftyOne has partial support; Encord has comprehensive shortcuts |
-| "Find Similar" UI button | Table Stakes | Low | Existing Qdrant infrastructure, just needs UI exposure |
-| Interactive histograms | Differentiator | Medium | FiftyOne has this; click bar to filter grid |
-
-**DEFER to v1.2+ (9 features):**
-- Create new annotations — edit/delete sufficient for v1.1
-- CVAT/Label Studio integration — export achieves same goal
-- PR curves + per-class AP — confusion matrix is priority
-- Mistakenness/hardness scoring — requires model logits import
-- Custom workspaces — current layout works
-- Customizable hotkeys — fixed defaults sufficient
-- Model zoo / in-app inference — import predictions workflow is pragmatic
-- View expression Python API — UI filtering covers 90%
-- Demo/quickstart dataset — nice for onboarding but not core
-
-**OUT OF SCOPE:**
-- 3D point cloud viz — different rendering pipeline (per PROJECT.md)
-- Video support — image-only for now (per PROJECT.md)
-- Map/geolocation — no current geo dataset need
-- Multi-user auth — personal tool (per PROJECT.md)
-
-**Build order dependencies:**
-```
-[Docker + Auth + Deploy] (parallel foundation)
-     |
-     v
-[YOLO + VOC Parsers] -> [Smart Folder Detection] -> [Split Handling]
-     |
-     v
-[Dataset Export] (requires format writers)
-     |
-     v
-[Image Quality Metrics] -> [Near-Duplicate Detection] -> [Composite Score]
-     |                                                       |
-     v                                                       v
-[Bbox Editing] -> [Keyboard Shortcuts] -> [Error Triage Mode]
-     |                                           |
-     v                                           v
-[Interactive Confusion Matrix] -> [Click-to-Filter]
-     |
-     v
-[Interactive Histograms]
-     |
-     v
-["Find Similar" Button]
-```
-
-**Confidence:** HIGH — grounded in official FiftyOne/Encord documentation
-
-### From ARCHITECTURE.md: Feature Integration
-
-Architecture research analyzed the existing v1.0 codebase (12,720 LOC, 50+ files) to identify integration points for v1.1 features.
-
-**Current v1.0 architecture snapshot:**
-- **Backend:** FastAPI with 9 routers, 7 services, DuckDB (6 tables) + Qdrant local mode
-- **Frontend:** Next.js 16 with 3 Zustand stores, 14 TanStack Query hooks, TanStack Virtual grid
-- **Key properties:** DuckDB single-connection cursor-per-request, Qdrant local mode, SVG annotation overlays, SSE for progress streams
-
-**Feature 1: Docker Deployment**
-
-Three-service topology (NOT four — Qdrant stays local):
-```
-[caddy :80/:443] -> [backend :8000 (FastAPI + DuckDB + Qdrant local)]
-                 -> [frontend :3000 (Next.js standalone)]
-```
-
-**Integration points:**
-- Create: `Dockerfile.backend`, `Dockerfile.frontend`, `docker-compose.yml`, `nginx/default.conf`
-- Modify: `app/config.py` (add `qdrant_url` for conditional mode), `similarity_service.py` (conditional client), `next.config.ts` (add `output: "standalone"`)
-
-**Qdrant mode switch:**
-```python
-# Conditional: local mode (dev) vs server mode (optional future)
-if settings.qdrant_url:
-    self.client = QdrantClient(url=settings.qdrant_url)
-else:
-    self.client = QdrantClient(path=str(path))  # existing
-```
-
-**Critical decisions:**
-- Keep Qdrant local — single-user <1M vectors does not need server container
-- Use Caddy reverse proxy — NEXT_PUBLIC_API_URL becomes `/api/` (same origin, no CORS)
-- Multi-stage builds — backend ~4GB (PyTorch CPU-only), frontend ~150MB
-- Single uvicorn worker — DuckDB single-writer constraint preserved
-
-**Feature 2: Single-User Auth**
-
-FastAPI dependency injection pattern (NOT middleware):
-```python
-# app/auth.py
-def verify_auth(credentials: HTTPBasicCredentials = Depends(security)) -> str:
-    # secrets.compare_digest to prevent timing attacks
-    ...
-
-# app/main.py
-app.include_router(datasets.router, dependencies=[Depends(verify_auth)])
-```
-
-**Why DI over middleware:**
-- Existing codebase uses Depends() for 9 dependencies already
-- Easier to exclude /health endpoint
-- Testable and composable
-- Recommended by FastAPI community (GitHub Discussion #8867)
-
-**SSE auth challenge:** EventSource cannot set Authorization headers. Solution: cookie-based session after initial Basic Auth login.
-
-**Feature 3: Smart Ingestion UI**
-
-New folder scanner service detects dataset structures:
-```
-POST /ingestion/scan { root_path }
-  -> FolderScanner.scan()
-  -> ScanResult { annotation_files, image_dirs, suggested_imports }
-  -> Frontend: confirmation UI
-  -> POST /datasets/ingest (existing SSE endpoint)
-```
-
-**Integration points:**
-- Create: `app/services/folder_scanner.py`, `app/routers/ingestion.py`, `app/models/scan.py`
-- Modify: `app/models/dataset.py` (add `split` field), `ingestion.py` (pass split to parser)
-- Frontend: new `/ingest` page with scan results display
-
-**Feature 4: Annotation Editing**
-
-**Critical observation:** Existing overlay is SVG, NOT react-konva. Architecture decision: use Konva ONLY in detail modal, keep SVG for grid.
-
-```
-sample-modal.tsx
-  |-- [Read-only mode] AnnotationOverlay (SVG, existing)
-  |-- [Edit mode] AnnotationEditor (NEW, react-konva)
-        |-- <Stage><Layer>
-        |     |-- <Image> (background)
-        |     |-- <Rect draggable /> (per annotation)
-        |     |-- <Transformer /> (resize handles)
-```
-
-**New backend endpoints:**
-- `PATCH /annotations/batch` — update bbox coordinates
-- `DELETE /annotations/{id}` — remove annotation
-
-**Data flow:**
-```
-User clicks "Edit" -> AnnotationEditStore.startEditing()
-  -> Modal switches to Konva
-  -> User drags/resizes (Konva handles visuals)
-  -> "Save" -> PATCH /annotations/batch
-  -> Invalidate TanStack Query cache
-  -> Modal switches back to SVG
-```
-
-**Coordinate normalization critical:** Konva Transformer modifies scaleX/scaleY, not width/height. Must normalize:
-```typescript
-const sx = node.scaleX(), sy = node.scaleY();
-node.scaleX(1); node.scaleY(1);
-const newW = node.width() * sx;
-const newH = node.height() * sy;
-```
-
-**Feature 5: Error Triage Workflow**
-
-Extends existing `error_analysis.py` with tagging and ranking:
-
-**New DuckDB table:**
-```sql
-CREATE TABLE triage_labels (
-    annotation_id VARCHAR,
-    dataset_id VARCHAR,
-    label VARCHAR,  -- 'confirmed', 'dismissed', 'needs_review'
-    created_at TIMESTAMP
-)
-```
-
-**New components:**
-- `app/routers/triage.py` — CRUD for triage labels
-- `app/services/triage_service.py` — "worst images" ranking algorithm
-- Frontend: `triage-store.ts` (5th Zustand store), `triage-action-bar.tsx`, `worst-images-panel.tsx`
-
-**Worst images ranking:**
-```python
-score = (2 * hard_fp_count) + (3 * label_error_count) + (1 * fn_count)
-        + (0.5 * low_confidence_count) - (0.1 * tp_count)
-```
-
-**Feature 6: Keyboard Shortcuts**
-
-react-hotkeys-hook for declarative shortcuts:
-```typescript
-useHotkeys('shift+/', () => openShortcutHelp(true));
-useHotkeys('g', () => setActiveTab('grid'));
-useHotkeys('/', () => document.querySelector('[data-shortcut-target="search"]')?.focus());
-```
-
-**Tier 1 shortcuts (v1.1):**
-- `?` — help overlay
-- Arrow keys — prev/next sample
-- `Escape` — close modal
-- `Space` — toggle label visibility
-- `Delete` — delete annotation (edit mode)
-- `Ctrl+Z` — undo (edit mode)
-- `1-9` — quick-assign class
-
-**Scoping:** Component-level via react-hotkeys-hook's ref scoping. Prevents firing when input fields are focused.
-
-**Build order:**
-```
-Phase 1: Docker Deployment (enables cloud deployment)
-  -> Phase 2: Auth (enables secure access)
-    -> Phase 3: Smart Ingestion (parallel with 4, 5)
-    -> Phase 4: Error Triage (parallel with 3, 5)
-    -> Phase 5: Annotation Editing (parallel with 3, 4)
-  -> Phase 6: Keyboard Shortcuts (last, layers on all UI)
-```
-
-**New files:** 13 backend, 14 frontend
-**Modified files:** 17 (app/config.py, main.py, repositories, services; frontend stores, components)
-**New Zustand stores:** 2 (annotation-edit-store, triage-store) — total 5 stores
-
-**Confidence:** HIGH — grounded in codebase analysis (12,720 LOC verified) + official docs
-
-### From PITFALLS.md: Domain-Specific Risks
-
-Pitfall research identified 16 risks across Docker, auth, annotation editing, ingestion, and deployment.
-
-**CRITICAL pitfalls (5):**
-
-**P1: DuckDB WAL file loss on Docker restart**
-- **Risk:** WAL file created alongside .duckdb file. If container killed before clean shutdown, WAL persists. If volume mounts only the .duckdb file (not directory), WAL vanishes -> silent data loss.
-- **Prevention:** Mount entire `data/` directory. Set `stop_grace_period: 30s`. Add `CHECKPOINT` in lifespan shutdown. Set `checkpoint_threshold='8MB'` for more frequent checkpoints.
-
-**P2: Qdrant local mode works fine in Docker (clarification)**
-- **Research update:** Local mode CAN run in Docker. The earlier concern was unfounded — local mode via `QdrantClient(path=...)` works with volume mount. Server mode is optional for multi-worker scenarios.
-- **Decision:** Keep local mode for v1.1. Conditional switch supports future migration.
-
-**P3: NEXT_PUBLIC_API_URL baked at build time**
-- **Risk:** `NEXT_PUBLIC_*` vars are inlined during `next build`. Built image has hardcoded API URL. Cannot change at runtime.
-- **Prevention:** Use Caddy reverse proxy to serve frontend and backend from same origin. Frontend calls `/api/` which Caddy routes to backend:8000. No CORS, no URL config needed.
-
-**P4: Basic auth over HTTP exposes credentials**
-- **Risk:** Base64 encoding is not encryption. HTTP transmits credentials in cleartext.
-- **Prevention:** Caddy provides automatic HTTPS via Let's Encrypt (zero config). EventSource (SSE) limitation requires cookie-based session, not per-request Basic Auth headers.
-
-**P5: SVG-to-Canvas coordinate mismatch**
-- **Risk:** SVG uses viewBox with preserveAspectRatio for automatic scaling. Konva uses Stage/Layer coordinates with manual scale. Transformer changes scaleX/scaleY, not width/height. Annotations can drift if coordinates not normalized.
-- **Prevention:** Compute single scale factor on load. Normalize Transformer scale to 1 on dragEnd/transformEnd. Write utility functions `toPixelSpace()` and `toDisplaySpace()` for all conversions. Keep SVG for read-only, Konva ONLY for edit mode.
-
-**MAJOR pitfalls (5):**
-
-**P6: Docker image bloat (8-12GB)**
-- **Risk:** PyTorch + transformers with CUDA = ~8GB image. Build takes 30+ min, push/pull times out.
-- **Prevention:** Use CPU-only PyTorch (200MB vs 2.5GB). Multi-stage build. `--no-cache-dir` everywhere. Pin versions.
-
-**P7: DuckDB annotation mutations without transactions**
-- **Risk:** v1.0 is read-heavy append-only. v1.1 adds UPDATE/DELETE. No PRIMARY KEY enforcement. Concurrent edits may conflict. Denormalized counts can drift.
-- **Prevention:** Wrap mutations in explicit transactions. Recompute counts from source tables. Verify annotation ID uniqueness in app code.
-
-**P8: Smart folder detection edge cases**
-- **Risk:** Datasets use 20+ conventions (COCO, YOLO, Roboflow, CVAT, custom). Detection heuristic cannot handle all. Dangerous: folder named `train/` containing train images (not training data).
-- **Prevention:** Detection is suggestion, not action. Show confidence scores. Manual override. Start COCO-only. Depth limit (3 levels max). No symlinks.
-
-**P9: GCP firewall blocks all ports by default**
-- **Risk:** GCP has default-deny inbound. VM starts, docker-compose runs, but `http://35.x.x.x:3000` times out. Developer spends 30 min debugging Docker before realizing it's firewall.
-- **Prevention:** Deployment script creates firewall rules automatically. Use port 80/443 only. Qdrant port 6333 NOT exposed (internal only). Tag VM and scope rules.
-
-**P10: Error triage state lost on page refresh**
-- **Risk:** Triage decisions in Zustand only. User tags 50 errors, refreshes page, all gone. 100K dataset with 5000 errors = significant manual work lost.
-- **Prevention:** Persist to DuckDB immediately. Debounce 500ms for rapid changes. Optimistic updates (Zustand first, DuckDB background). Use existing `samples.tags` column.
-
-**MODERATE pitfalls (4):**
-
-**P11: Docker volume mounts break image path resolution** — Store relative paths or path remap
-**P12: Keyboard shortcuts conflict with browser defaults** — Check activeElement, use modifiers for destructive actions
-**P13: CORS wildcard + credentials is spec-invalid** — Remove CORS via reverse proxy
-**P14: GCP persistent disk not auto-mounted on restart** — Add fstab entry with `nofail`
+**New files to create (backend):**
+- `app/ingestion/classification_parser.py` -- JSONL parser implementing `BaseParser`
+- `app/services/classification_evaluation.py` -- accuracy/F1/confusion matrix, separate from detection evaluation
+- `app/models/classification_evaluation.py` -- `ClassificationEvaluationResponse` Pydantic model
 
-**MINOR pitfalls (2):**
+**New files to create (frontend):**
+- `src/components/grid/classification-label.tsx` -- class label pill overlay for grid cells and modal
 
-**P15: Annotation delete without undo** — Soft delete or undo buffer
-**P16: docker-compose OOMs on small VMs** — Lazy model loading, document e2-standard-4 minimum
+See `.planning/research/STACK.md` for the full file-level modification list (9 backend files, 15+ frontend files).
 
-**Integration pitfall matrix:**
+### Expected Features
 
-| Feature | Existing Component | Pitfall | Prevention |
-|---------|-------------------|---------|------------|
-| Docker | DuckDB file | P1: WAL loss | Mount `data/` dir, CHECKPOINT shutdown |
-| Docker | Next.js env | P3: Build-time URL | Caddy reverse proxy (same origin) |
-| Docker | PyTorch | P6: 8GB image | CPU-only torch, multi-stage |
-| Auth | SSE streams | P4: EventSource headers | Cookie-based session |
-| Auth | CORS | P13: Wildcard+creds | Reverse proxy removes CORS |
-| Annotation Edit | SVG overlay | P5: Coord mismatch | Keep SVG read-only, Konva edit-only |
-| Annotation Edit | DuckDB | P7: No transactions | Explicit transactions |
-| Smart Ingestion | COCOParser | P8: Edge cases | Suggestion not action |
-| Error Triage | Zustand | P10: State lost | Persist to DuckDB |
-| GCP Deploy | Firewall | P9: Blocks ports | Script creates rules |
+Classification support breaks into three natural groups by user need: data ingestion and browsing, evaluation and error analysis, and polish/differentiators.
 
-**Confidence:** MEDIUM-HIGH — critical pitfalls verified via official docs (DuckDB WAL, Next.js env vars, EventSource limitations). Edge cases (folder detection, Konva coords) based on community patterns.
+**Must have (table stakes):**
+- TS-1: JSONL ingestion parser + `dataset_type` schema extension -- nothing else works without classification data in the database
+- TS-2: Class label badge on thumbnails -- users expect to see the class label on the image; without it the tool shows unlabeled images
+- TS-3: Classification evaluation metrics (accuracy, macro F1, weighted F1, per-class P/R/F1) -- the universal expectation for any classification tool
+- TS-4: Classification confusion matrix -- the primary diagnostic tool for classification; existing component reused, no background row/column
+- TS-5: Classification error analysis (Correct / Misclassified / Missing Prediction) -- simpler than detection; replaces IoU-based TP/FP/FN categories
+- TS-6: Classification prediction import (JSONL or CSV format) -- required to enable any GT-vs-prediction workflow
+- TS-7: Sample detail modal adaptation -- class label display, class change dropdown, no bbox editor
+- TS-8: Statistics dashboard adaptation -- relabel "annotations" as "labeled images", remove bbox area histogram, hide IoU slider
 
----
+**Should have (differentiators):**
+- D-1: Misclassification drill-down view -- click confusion matrix cell to see all images with GT=class_i, predicted=class_j; "most confused pairs" is the single most actionable classification view
+- D-2: Per-class performance sparklines in the metrics table -- color-coded bars (green/yellow/red) for P/R/F1 per class
+- D-5: Embedding scatter coloring by correct/incorrect -- misclassified samples are shown as red dots on the t-SNE scatter; mistakes that sit deep inside a cluster often point to label errors
 
-## Implications for Roadmap
+**Defer (v2+):**
+- Multi-label classification (different data model, different metrics, different UI -- scope explosion)
+- Top-K evaluation (requires importing full probability distributions per image)
+- PR curves for classification (confusion matrix + per-class P/R table are more informative)
+- D-3: Confidence calibration / reliability diagram
+- D-4: Per-split comparison table
 
-Based on synthesized research, **6 feature groupings** are recommended with clear dependencies and pitfall mitigation.
+**Anti-features (explicitly avoid for this milestone):**
+- mAP for classification -- wrong metric, confuses users
+- IoU threshold slider for classification datasets -- meaningless, hide it
+- Detection-specific error categories ("Hard FP") for classification -- no spatial component
 
-### Phase 1: Docker Deployment & Auth (Foundation)
-**Why first:** Establishes cloud deployment scaffold. Every other feature builds on this. Critical pitfalls (P1, P3, P4, P13) MUST be addressed here — retrofit is extremely painful.
+See `.planning/research/FEATURES.md` for the full feature dependency graph and MVP recommendation.
 
-**Delivers:** GCP-deployable Docker Compose setup with automatic HTTPS and basic auth.
+### Architecture Approach
 
-**Features:**
-- Docker Compose (backend, frontend, caddy)
-- Qdrant local mode with conditional switch
-- Multi-stage Dockerfiles (CPU-only PyTorch)
-- Caddy reverse proxy (HTTPS + basic_auth)
-- FastAPI HTTPBasic dependency injection
-- GCP deployment script + firewall rules
-- Local run script
+The `dataset_type` column on `datasets` is the single source of truth for conditional behavior across all layers. The pattern is: detect format in `FolderScanner` -> dispatch parser via a registry in `IngestionService` -> set `dataset_type` on the dataset record -> thread `dataset_type` through API responses to the frontend -> branch evaluation at the router into separate detection/classification functions -> conditionally render badge vs bbox overlay in components. No polymorphism needed; simple if/else at well-defined boundary points.
 
-**Avoids pitfalls:**
-- P1: Mount `data/` directory, CHECKPOINT on shutdown
-- P3: Caddy serves frontend and API from same origin (no NEXT_PUBLIC_API_URL issue)
-- P4: Caddy auto-HTTPS, cookie-based session for SSE
-- P6: Multi-stage build, CPU-only torch, ~4GB backend image
-- P9: Deployment script creates firewall rules
-- P11: Path remapping for Docker volume mounts
-- P13: Reverse proxy removes CORS entirely
-- P14: fstab entry for persistent disk
-
-**Dependencies:** NONE — foundational
-**Research flag:** SKIP — Docker, Caddy, FastAPI auth are well-documented
-
-### Phase 2: Smart Ingestion UI
-**Why second:** Builds on Docker foundation (new endpoints need auth). Completes the "no-code dataset import" workflow.
-
-**Delivers:** Point at folder, auto-detect structure, import with confirmation.
-
-**Features:**
-- Folder scanner service (detect COCO/YOLO/splits)
-- Scan endpoint with structure detection
-- Frontend ingestion wizard with preview
-- Split detection in existing parser
-- Multi-format support hooks (YOLO/VOC deferred to later)
+**Major components:**
+1. `FolderScanner` (modified) -- adds `_is_classification_jsonl()` detection before falling through to COCO; `ScanResult.format` becomes `"coco" | "classification_jsonl"` (already supports arbitrary strings)
+2. Parser registry in `IngestionService` (modified) -- maps format strings to parser classes; `ClassificationParser` implements `BaseParser` with sentinel bbox values (0.0) and one annotation per sample
+3. `duckdb_repo.py:initialize_schema()` (modified) -- one `ALTER TABLE datasets ADD COLUMN IF NOT EXISTS dataset_type VARCHAR DEFAULT 'detection'`; no changes to `annotations` table
+4. `compute_classification_evaluation()` (new) -- separate from the 560-line detection evaluation; ~50 lines using scikit-learn; the router branches on `dataset_type` before calling any detection logic
+5. `ClassificationEvaluationResponse` (new) -- separate Pydantic model with `accuracy`, `macro_f1`, `weighted_f1`, `per_class_metrics`, `confusion_matrix`; discriminated union in frontend TypeScript types
+6. `AnnotationOverlay` (modified) -- branches on `datasetType === "classification"` to render class label pill vs SVG rect; `datasetType` prop threaded from page-level dataset query
 
-**Avoids pitfalls:**
-- P8: Detection is suggestion not action, confidence scores, manual override
+**Patterns to follow:**
+- Thread `datasetType` from the top-level dataset query through props; never re-fetch inside child components
+- Parser registry for format dispatch, not hardcoded `COCOParser()`
+- One annotation per sample for classification -- `GROUP BY sample_id LIMIT 1` is safe; no IoU matching needed
+- Branch at boundaries (router, page), not inside leaf components/queries
 
-**Dependencies:** Phase 1 (auth for new endpoints)
-**Research flag:** SKIP — folder scanning patterns are standard
+**Anti-patterns to avoid:**
+- Separate tables per task type (doubles query maintenance surface)
+- Stuffing classification metrics into detection response fields (field names become lies)
+- Making `compute_evaluation()` handle both types (grafts classification into 560 lines of spatial detection logic)
+- Frontend feature detection via `annotations[0]?.bbox_x === null` (fragile; fails on empty samples)
 
-### Phase 3: Annotation Editing
-**Why parallel with Phase 2/4:** Independent of ingestion and triage. Depends only on existing sample modal.
+See `.planning/research/ARCHITECTURE.md` for full component inventory and suggested build order.
 
-**Delivers:** Quick corrections in-app (move/resize/delete bboxes).
+### Critical Pitfalls
 
-**Features:**
-- react-konva integration in detail modal ONLY
-- AnnotationEditor component with Transformer
-- AnnotationEditStore (new Zustand store)
-- PATCH /annotations/batch endpoint
-- Coordinate normalization utilities
+1. **Schema pollution via nullable/sentinel bbox columns** -- If bbox columns become nullable, 30+ codebase references must each guard against NULL. If sentinel values (0,0,0,0) are used without `dataset_type`-aware dispatch, the detection evaluation computes IoU on zero-size boxes (NaN/0) and the overlay renders invisible 0-area rects. Prevention: use sentinel values (0.0) AND `dataset_type`-gated code paths that never invoke bbox-dependent logic for classification datasets. Decide the schema approach before writing any parser code; changing it later is a codebase-wide rewrite.
 
-**Avoids pitfalls:**
-- P5: Keep SVG read-only, Konva edit-only; explicit coord conversion
-- P7: Explicit transactions for mutations
-- P15: Soft delete or confirmation dialog
+2. **Metric confusion -- IoU/mAP leaking into classification evaluation** -- The entire `evaluation.py` (560 lines) is IoU-centric. Passing classification data through it produces nonsensical mAP scores. The `supervision` library expects `xyxy` bounding boxes. Prevention: separate `compute_classification_evaluation()` function; router branches before any detection logic runs. The classification evaluation should be ~50 lines, not a modified version of 560.
 
-**Dependencies:** Phase 1 (modal exists, auth for new endpoint)
-**Research flag:** SKIP — react-konva Transformer is documented
+3. **UI conditional spaghetti** -- 10+ frontend components each need different rendering for classification vs detection. If scattered `if (datasetType === 'classification')` checks accumulate without a clear pattern, adding a third task type (segmentation) becomes a codebase-wide search-and-update problem. Prevention: thread `datasetType` as a prop from the page; branch at component boundaries (`AnnotationOverlay`, `EvaluationPanel`), not inside individual render expressions deep in the tree. Audit: if `datasetType` checks exceed 10, the abstraction is wrong.
 
-### Phase 4: Error Triage Workflow
-**Why parallel with Phase 2/3:** Extends existing error analysis. Independent of ingestion and annotation editing.
+4. **Breaking existing detection workflows via schema migration** -- DuckDB's `ALTER COLUMN DROP NOT NULL` support varies by version. A failed migration leaves the schema partially altered with no rollback. Prevention: never change existing column constraints; use only `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` (the existing migration pattern). Use sentinel bbox values (0.0) for classification, not NULL.
 
-**Delivers:** Keyboard-driven error review workflow with persistence.
+5. **Confusion matrix scaling for 43+ classes** -- The current confusion matrix renders as an HTML table with 32px-minimum cells. A 43-class matrix = 1,849 cells = 1,408px minimum width, unreadable on any screen. Prevention: add a "top confused pairs" ranked list as the default view; make the full NxN matrix opt-in (or canvas-rendered); threshold filter to hide cells below N occurrences.
 
-**Features:**
-- triage_labels DuckDB table
-- Triage API endpoints
-- TriageStore (new Zustand store)
-- Worst images ranking algorithm
-- Grid highlight/dim mode
-- Keyboard shortcuts for triage (1/2/3 keys)
+See `.planning/research/PITFALLS.md` for 15 pitfalls including moderate (format detection false positives, annotation triage, one-annotation-per-image enforcement, prediction format mismatch) and minor (class imbalance visualization, second-system over-generalization).
 
-**Avoids pitfalls:**
-- P10: Persist triage decisions to DuckDB immediately
+## Implications for Roadmap
+
+Based on combined research, the feature dependency graph and architecture's stated build order converge on a 3-phase structure. The critical path is: schema + ingestion -> display -> evaluation. Evaluation features are all blocked on having classification data in the database and having a prediction import path.
 
-**Dependencies:** Phase 1 (existing error analysis built in v1.0)
-**Research flag:** SKIP — extends existing patterns
+### Phase 1: Foundation -- Schema, Ingestion, and Display
 
-### Phase 5: Keyboard Shortcuts
-**Why last:** Layers on top of all UI features. Must reference grid, modal, triage, annotation editing.
+**Rationale:** Everything else is blocked on this. The `dataset_type` column must exist before any parser can set it. The parser must run before any data appears in the database. The frontend badge must render before users can browse the dataset meaningfully. This phase has no external unknowns -- all implementation details are confirmed from direct codebase inspection.
 
-**Delivers:** Power-user keyboard navigation.
+**Delivers:** A classification dataset can be scanned, ingested, browsed in the image grid, and inspected in the sample modal. The statistics dashboard shows class distribution and labeled image counts.
 
-**Features:**
-- react-hotkeys-hook integration
-- Global shortcuts (tab switching, help)
-- Component-level shortcuts (grid nav, modal nav, edit mode)
-- Shortcut help overlay (? key)
+**Addresses features:**
+- TS-1: JSONL ingestion parser + schema extension (`dataset_type` column, `ClassificationParser`, format auto-detection in `FolderScanner`)
+- TS-2: Class label badge on thumbnails (`classification-label.tsx`, `AnnotationOverlay` conditional branch)
+- TS-7: Sample detail modal adaptation (no bbox editor, class label display, class change dropdown)
+- TS-8: Statistics dashboard adaptation (relabeled metrics, no bbox area histogram, hidden IoU slider)
 
 **Avoids pitfalls:**
-- P12: Check activeElement before handling, use modifiers for destructive actions
+- P1 (schema pollution): sentinel bbox 0.0 + `dataset_type`-gated dispatch; never nullable
+- P4 (breaking existing detection): `ADD COLUMN IF NOT EXISTS` only; no `ALTER COLUMN`
+- P6 (format detection false positives): new `_is_classification_jsonl()` before COCO fallthrough
+- P9 (one-annotation-per-image enforcement): parser validates that each `sample_id` has exactly one annotation per source; multi-label input is rejected
+- P3 (UI spaghetti early signal): establish the `datasetType` prop threading pattern here; don't let conditionals scatter
 
-**Dependencies:** Phases 1-4 (shortcuts reference all UI)
-**Research flag:** SKIP — react-hotkeys-hook is straightforward
+**Research flag:** Standard patterns -- skip `/gsd:research-phase`. All implementation details are clear from direct codebase inspection; the one open decision (sentinel vs nullable bbox columns, see "Gaps to Address") must be settled before coding starts.
+
+---
 
-### Phase 6: Competitive Features (Deferred to v1.2)
-**Why deferred:** Core v1.1 delivers deployment + workflows. These add ecosystem value but are not blocking.
+### Phase 2: Evaluation and Error Analysis
 
-**Features:**
-- YOLO + VOC parsers
-- Dataset export (COCO, YOLO)
-- Interactive confusion matrix + click-to-filter
-- Near-duplicate detection
-- Image quality metrics
-- "Find Similar" UI button
-- Interactive histograms
+**Rationale:** Blocked on Phase 1 (needs classification data in the DB); the prediction import path (TS-6) is built first within this phase because every other evaluation feature depends on it. Evaluation is the core analytical value of DataVisor. Classification evaluation is dramatically simpler than detection (no IoU, ~50 lines), but the API contract must be clean from the start to avoid frontend confusion.
+
+**Delivers:** Users can import classification predictions, view accuracy/F1 metrics, explore the confusion matrix, and identify misclassified images via error analysis. The triage system works for classification (correct/incorrect per image, not per-bbox IoU).
+
+**Addresses features:**
+- TS-6: Classification prediction import (JSONL/CSV, one row per image)
+- TS-3: Classification evaluation metrics (accuracy, macro F1, weighted F1, per-class P/R/F1)
+- TS-4: Classification confusion matrix adaptation (no background row/col; existing click-to-filter works as-is)
+- TS-5: Classification error analysis (Correct / Misclassified / Missing Prediction)
+
+**Avoids pitfalls:**
+- P2 (metric confusion): separate `compute_classification_evaluation()`; router branches before detection logic
+- P7 (triage assumes IoU): new `match_classification_annotations()` with sample_id equality matching, not IoU
+- P8 (error categories don't map): new `categorize_classification_errors()` with classification-specific categories
+- P10 (API response leakage): `ClassificationEvaluationResponse` is a separate Pydantic model
 
-**Research flag:** SKIP — annotation format specs documented, existing patterns extend
+**Uses from stack:**
+- `sklearn.metrics.classification_report()` and `confusion_matrix()` -- two library calls cover for classification what takes 560 lines in the detection evaluation
+- TypeScript discriminated union: `DetectionEvaluationResponse | ClassificationEvaluationResponse`
+
+**Research flag:** Standard patterns -- skip `/gsd:research-phase`. Classification evaluation is textbook scikit-learn usage. The router branching pattern is explicitly designed in ARCHITECTURE.md.
 
 ---
 
-### Phase Structure Summary
-
-**Dependency graph:**
-```
-Phase 1 (Docker + Auth) — FOUNDATIONAL
-    |
-    +-> Phase 2 (Smart Ingestion) — NEW ENDPOINTS
-    +-> Phase 3 (Annotation Edit) — PARALLEL
-    +-> Phase 4 (Error Triage) — PARALLEL
-    |
-    v
-Phase 5 (Keyboard Shortcuts) — LAYER ON ALL UI
-
-Phase 6 (Competitive Features) — DEFERRED v1.2
-```
-
-**Rationale:**
-- Phase 1 is non-negotiable foundation — Docker, auth, deployment
-- Phases 2-4 can proceed in parallel (independent)
-- Phase 5 references all UI, must come last
-- Phase 6 deferred — core v1.1 is deployment + workflows
-
-**Which phases need `/gsd:research-phase`?**
-- **NONE** — all v1.1 features use well-documented patterns (Docker, Caddy, FastAPI auth, react-konva, react-hotkeys-hook)
-- Phase 6 (if built in v1.2) also uses documented patterns (COCO/YOLO specs, Qdrant similarity)
-
-**Critical path:**
-- Phase 1 blocks everything (foundation)
-- Phases 2-4 are independent (can parallelize)
-- Phase 5 depends on 2-4 (shortcuts reference their UI)
+### Phase 3: Polish and Differentiators
+
+**Rationale:** Table stakes (Phases 1-2) make the product functional. This phase makes it useful in practice for high-cardinality classification datasets like jersey numbers (43 classes). The confusion matrix scaling issue will surface immediately with real data. The misclassification drill-down and embedding coloring are the features that make DataVisor more useful than just running `sklearn.metrics.classification_report()` locally.
+
+**Delivers:** The confusion matrix is readable at 43 classes. Per-class metrics have visual encoding (color-coded bars). Misclassified images are accessible via drill-down from the confusion matrix. Embedding scatter shows misclassification status.
+
+**Addresses features:**
+- D-1: Misclassification drill-down view (click confusion matrix cell -> filtered sample view with both labels, sorted by confidence)
+- D-2: Per-class sparklines in metrics table (color-coded P/R/F1 bars)
+- D-5: Embedding scatter coloring by correct/incorrect (existing scatter, new color mode toggle)
+
+**Avoids pitfalls:**
+- P5 (confusion matrix scaling at 43+ classes): "top confused pairs" ranked list as default; full matrix opt-in
+- P11 (class imbalance visualization for 43+ bars): sortable table view, log-scale option for bar chart
+- P3 (UI spaghetti audit): count all `datasetType` conditional checks; if there are more than 10, refactor to a `useTaskAdapter` hook
+- P13 (second system effect): explicitly defer segmentation support and any task-type plugin system
+
+**Research flag:** Confusion matrix canvas rendering for large matrices may need brief investigation if the HTML table approach proves unworkable. All other features in this phase use existing components (scatter plot, filter system, Recharts tables). Consider a quick prototype before committing to the HTML table approach for 43+ classes.
 
 ---
 
+### Phase Ordering Rationale
+
+- **Schema first:** The `dataset_type` column and `ClassificationParser` are shared foundations for all downstream phases. No other work can proceed without classification data in the database.
+- **Display before evaluation:** Users need to browse and verify that classification data ingested correctly before attempting evaluation. This also surfaces ingestion bugs early when they are cheap to fix.
+- **Evaluation before polish:** The core GT-vs-predictions workflow must work before investing in visualization enhancements. D-1 (drill-down) depends on TS-4 (confusion matrix) which depends on TS-3 (evaluation).
+- **Parallelizable within Phase 1:** TS-2 (badge), TS-7 (modal), and TS-8 (stats dashboard) can be built concurrently once TS-1 (parser + schema) is complete.
+- **Parallelizable within Phase 2:** TS-4 (confusion matrix) and TS-5 (error analysis) can proceed in parallel once TS-6 (prediction import) and TS-3 (evaluation metrics) are done.
+
+### Research Flags
+
+**Phases likely needing `/gsd:research-phase` during planning:**
+- None identified. All implementation details are grounded in direct codebase inspection. The research team confirmed exact function signatures, component props, and SQL schema for all referenced files.
+
+**One item to confirm at Phase 1 implementation start (not a blocker):**
+- DuckDB `ALTER COLUMN DROP NOT NULL` syntax: MEDIUM confidence. The recommended approach (sentinel 0.0 values, never NULL) avoids this entirely. If the team decides nullable bbox columns are preferred for semantic cleanliness, verify DuckDB version support before committing.
+
+**Phases with standard patterns (skip research-phase):**
+- Phase 1: JSONL parsing, `BaseParser` extension, idempotent column migration -- all established patterns in the existing codebase
+- Phase 2: scikit-learn classification metrics, router branching, Pydantic discriminated unions -- textbook patterns
+- Phase 3: Recharts table customization, existing scatter plot coloring, confusion matrix threshold filtering -- existing component extension
+
 ## Confidence Assessment
 
 | Area | Confidence | Notes |
 |------|------------|-------|
-| **Stack** | HIGH | Every dependency verified: react-konva 19.2.0 for React 19, react-hotkeys-hook 5.2.4 (published 9 days ago), Caddy 2-alpine. Zero new backend deps (auth uses FastAPI built-ins). Versions confirmed via npm/PyPI. |
-| **Features** | HIGH | Competitive analysis grounded in official FiftyOne v1.12.0 and Encord docs. 16 must-build features mapped to competitors. Dependencies validated (smart ingestion depends on auth, triage depends on error analysis). |
-| **Architecture** | HIGH | Integration points verified against actual v1.0 codebase (12,720 LOC, 50+ files). Qdrant local mode decision reversed after deeper research (can run in Docker). Konva-only-for-edit pattern avoids grid performance issues. |
-| **Pitfalls** | MEDIUM-HIGH | Critical pitfalls verified via official docs (DuckDB WAL behavior, Next.js NEXT_PUBLIC inlining, EventSource header limitations, Konva Transformer scale behavior). Edge cases (folder detection, Docker OOM) based on community patterns and GitHub issues. |
-
-**Overall confidence:** **HIGH**
+| Stack | HIGH | Direct codebase inspection confirmed all dependencies are already installed. Zero new packages needed. |
+| Features | HIGH | FiftyOne, Roboflow, Cleanlab, Google ML docs all confirm the same feature expectations. Classification metrics are industry-standard. |
+| Architecture | HIGH | All architectural decisions grounded in actual codebase files. Component interfaces and SQL schema inspected directly. One MEDIUM item: DuckDB nullable column ALTER syntax. |
+| Pitfalls | HIGH | All 15 pitfalls are grounded in specific files and line numbers in the actual codebase, not theoretical risks. |
 
-The stack is validated, features are grounded in competitive analysis, architecture patterns are proven against existing codebase, and pitfalls have clear prevention strategies. The main uncertainty (Qdrant local vs server) was resolved — local mode works fine for v1.1 single-user deployment.
+**Overall confidence:** HIGH
 
-### Gaps to Address During Implementation
+### Gaps to Address
 
-**Docker volume semantics for DuckDB WAL:**
-- Research confirmed WAL behavior and directory mount requirement, but real-world testing (kill -9 container, verify WAL replay) needed to validate prevention.
+- **Schema decision -- sentinel vs nullable bbox:** Both STACK.md and PITFALLS.md recommend sentinel values (0.0) to avoid nullable bbox complexity. ARCHITECTURE.md leans toward nullable for semantic cleanliness. The team must make one decision before Phase 1 implementation. Recommendation: sentinel values (simpler, zero migration risk to existing data, no DuckDB ALTER version concerns).
 
-**Konva coordinate normalization at different image scales:**
-- Transformer scale pattern documented, but implementation with real datasets (varied aspect ratios, zoom levels) will validate edge cases.
+- **DuckDB `ALTER COLUMN DROP NOT NULL` syntax:** The recommended architecture avoids this by using sentinel values instead of NULL. If nullable columns are chosen, verify DuckDB ALTER syntax for the installed version before implementation begins. Fallback: recreate the table with nullable columns if ALTER fails.
 
-**GCS image serving with signed URL expiry:**
-- Stack research mentioned GCS support exists (fsspec), but signed URL refresh during long browsing sessions needs implementation design.
+- **Roboflow JSONL format completeness:** Research confirmed `{"image":"filename.jpg","prefix":"prompt","suffix":"class_label"}` as the target format. Validate against an actual Roboflow classification export before finalizing the parser -- optional fields (e.g., split, confidence) may be present and should be handled gracefully.
 
-**react-hotkeys-hook focus scoping with nested modals:**
-- Documentation confirms scoping, but interaction between modal shortcuts (arrow keys) and edit mode shortcuts (delete) needs testing.
-
----
+- **Confusion matrix at 43+ classes -- canvas vs HTML table:** The HTML table approach with 1,849 cells may be acceptable with CSS overflow and threshold filtering. Prototype this early in Phase 3 to decide if a canvas-based renderer is needed, rather than discovering it at the end of the phase.
 
 ## Sources
 
-### Stack Research (HIGH confidence)
-- [Caddy Docker image](https://hub.docker.com/_/caddy) — caddy:2-alpine, automatic HTTPS
-- [Caddy reverse proxy quickstart](https://caddyserver.com/docs/quick-starts/reverse-proxy)
-- [Caddy basic_auth directive](https://caddyserver.com/docs/caddyfile/directives/basic_auth)
-- [FastAPI HTTP Basic Auth](https://fastapi.tiangolo.com/advanced/security/http-basic-auth/)
-- [FastAPI Docker deployment](https://fastapi.tiangolo.com/deployment/docker/)
-- [Next.js standalone output](https://nextjs.org/docs/app/api-reference/config/next-config-js/output)
-- [react-konva npm](https://www.npmjs.com/package/react-konva) — v19.2.0 verified
-- [konva npm](https://www.npmjs.com/package/konva) — v10.2.0 verified
-- [Konva Transformer docs](https://konvajs.org/docs/react/Transformer.html)
-- [react-hotkeys-hook npm](https://www.npmjs.com/package/react-hotkeys-hook) — v5.2.4 verified
-- [Qdrant local mode](https://deepwiki.com/qdrant/qdrant-client/2.2-local-mode)
-- [GCP Container-Optimized OS](https://cloud.google.com/container-optimized-os/docs)
-
-### Feature Research (HIGH confidence)
-- [FiftyOne Import Datasets (v1.12.0)](https://docs.voxel51.com/user_guide/import_datasets.html)
-- [FiftyOne Export Datasets (v1.11.1)](https://docs.voxel51.com/user_guide/export_datasets.html)
-- [FiftyOne Evaluation (v1.11.1)](https://docs.voxel51.com/user_guide/evaluation.html)
-- [FiftyOne Brain](https://docs.voxel51.com/brain.html)
-- [FiftyOne Annotation (v1.11.0)](https://docs.voxel51.com/user_guide/annotation.html)
-- [Encord Annotate Overview](https://docs.encord.com/platform-documentation/Annotate/annotate-overview)
-- [Encord Label Editor](https://docs.encord.com/platform-documentation/Annotate/annotate-label-editor)
-- [Encord Active Quality Metrics](https://docs.encord.com/platform-documentation/Active/active-quality-metrics/active-model-quality-metrics)
-- [Encord Editor Shortcuts](https://docs.encord.com/platform-documentation/Annotate/annotate-label-editor/annotate-label-editor-settings-shortcuts)
-
-### Architecture Research (HIGH confidence)
-- DataVisor codebase: app/main.py, dependencies.py, config.py, repositories/duckdb_repo.py, services/similarity_service.py — existing patterns verified
-- DataVisor codebase: frontend/src/stores/*.ts, components/**/*.tsx — 3 Zustand stores, SVG annotation overlay confirmed
-- [FastAPI Dependency Injection (PropelAuth)](https://www.propelauth.com/post/fastapi-auth-with-dependency-injection)
-- [FastAPI Auth Discussion #8867](https://github.com/fastapi/fastapi/discussions/8867)
-- [Konva Drag and Resize Limits](https://konvajs.org/docs/select_and_transform/Resize_Limits.html)
-- [Qdrant Python Client](https://python-client.qdrant.tech/qdrant_client.qdrant_client)
-
-### Pitfall Research (MEDIUM-HIGH confidence)
-- [DuckDB Files Created](https://duckdb.org/docs/stable/operations_manual/footprint_of_duckdb/files_created_by_duckdb) — WAL behavior
-- [DuckDB Concurrency](https://duckdb.org/docs/stable/connect/concurrency) — single-writer model
-- [DuckDB WAL Issue #10002](https://github.com/duckdb/duckdb/issues/10002) — lock file not cleaned
-- [Next.js Environment Variables](https://nextjs.org/docs/pages/guides/environment-variables) — NEXT_PUBLIC build-time
-- [Next.js Docker Env Discussion #17641](https://github.com/vercel/next.js/discussions/17641)
-- [MDN EventSource](https://developer.mozilla.org/en-US/docs/Web/API/EventSource/withCredentials) — header limitations
-- [WHATWG EventSource Issue #2177](https://github.com/whatwg/html/issues/2177) — cannot set headers
-- [Konva Coordinate Issue #830](https://github.com/konvajs/konva/issues/830) — dragging and zooming
-- [Konva Transformer BBox Issue #1296](https://github.com/konvajs/konva/issues/1296) — incorrect bbox with scale
-- [GCP Persistent Disks](https://cloud.google.com/compute/docs/disks/add-persistent-disk)
-- [GCP Firewall Rules](https://cloud.google.com/compute/docs/networking/firewalls)
+### Primary (HIGH confidence)
+- DataVisor codebase -- direct inspection of `duckdb_repo.py`, `evaluation.py`, `coco_parser.py`, `base_parser.py`, `folder_scanner.py`, `annotation-overlay.tsx`, `grid-cell.tsx`, `sample-modal.tsx`, `statistics.py`, `evaluation-panel.tsx`, `annotation.ts`, `evaluation.ts`
+- scikit-learn stable documentation -- `classification_report()`, `confusion_matrix()`, `precision_recall_fscore_support()`
+- FiftyOne official documentation -- `evaluate_classifications()` API, classification label rendering conventions, evaluate_classifications tutorial
+- Cleanlab official documentation -- image classification tutorial, datalab image issues
+- Google ML Crash Course -- classification metrics definitions (accuracy, precision, recall, F1)
+- Evidently AI -- multi-class metrics reference
 
----
+### Secondary (MEDIUM confidence)
+- Roboflow classification export format -- confirmed via user-provided sample data; may have undocumented optional fields
+- DuckDB ALTER TABLE behavior -- general knowledge; version-specific behavior should be verified at implementation time
 
-**Ready for Requirements Definition:** YES
+### Tertiary (LOW confidence)
+- None -- all findings are grounded in HIGH or MEDIUM sources
 
-SUMMARY.md synthesizes findings from 4 parallel research files. Orchestrator can proceed to requirements definition for v1.1 roadmap.
+---
+*Research completed: 2026-02-18*
+*Ready for roadmap: yes*

From 927afab42eef741ee1cc90ee96fc90ffa805269b Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 17:02:21 -0500
Subject: [PATCH 06/38] docs: define milestone v1.2 requirements

---
 .planning/REQUIREMENTS.md | 94 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 .planning/REQUIREMENTS.md

diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md
new file mode 100644
index 0000000..d20a552
--- /dev/null
+++ b/.planning/REQUIREMENTS.md
@@ -0,0 +1,94 @@
+# Requirements: DataVisor
+
+**Defined:** 2026-02-18
+**Core Value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
+
+## v1.2 Requirements
+
+Requirements for classification dataset support. Each maps to roadmap phases.
+
+### Ingestion
+
+- [ ] **INGEST-01**: User can import a classification dataset from a directory containing JSONL annotations and images
+- [ ] **INGEST-02**: System auto-detects dataset type (detection vs classification) from annotation format during import
+- [ ] **INGEST-03**: User can import multi-split classification datasets (train/valid/test) in a single operation
+- [ ] **INGEST-04**: Schema stores dataset_type on the datasets table and handles classification annotations without bbox values
+
+### Display
+
+- [ ] **DISP-01**: User sees class label badges on grid thumbnails for classification datasets
+- [ ] **DISP-02**: User sees class label (GT and prediction) prominently in the sample detail modal
+- [ ] **DISP-03**: User can edit the GT class label via dropdown in the detail modal
+- [ ] **DISP-04**: Statistics dashboard shows classification-appropriate metrics (labeled images, class distribution) and hides detection-only elements (bbox area, IoU slider)
+
+### Evaluation
+
+- [ ] **EVAL-01**: User can import classification predictions in JSONL format with confidence scores
+- [ ] **EVAL-02**: User sees accuracy, macro F1, weighted F1, and per-class precision/recall/F1 metrics
+- [ ] **EVAL-03**: User sees a confusion matrix for classification with click-to-filter support
+- [ ] **EVAL-04**: User sees error analysis categorizing each image as correct, misclassified, or missing prediction
+- [ ] **EVAL-05**: User sees GT vs predicted label comparison on grid thumbnails and in the modal
+
+### Polish
+
+- [ ] **POLISH-01**: Confusion matrix scales to 43+ classes with readable rendering
+- [ ] **POLISH-02**: User can color embedding scatter by GT class, predicted class, or correct/incorrect status
+- [ ] **POLISH-03**: User sees most-confused class pairs summary from the confusion matrix
+- [ ] **POLISH-04**: User sees per-class performance sparklines with color-coded thresholds
+
+## Future Requirements
+
+### Multi-label Classification
+
+- **MLABEL-01**: User can import multi-label classification datasets (multiple labels per image)
+- **MLABEL-02**: User sees multi-label metrics (hamming loss, subset accuracy)
+
+### Advanced Evaluation
+
+- **ADVEVAL-01**: User can import top-K predictions with full probability distributions
+- **ADVEVAL-02**: User sees confidence calibration plot (reliability diagram)
+- **ADVEVAL-03**: User can compare performance across train/valid/test splits side-by-side
+
+## Out of Scope
+
+| Feature | Reason |
+|---------|--------|
+| Multi-label classification | Different data model, metrics, and UI; scope explosion for v1.2 |
+| Top-K evaluation | Requires importing full probability distributions; complicates schema |
+| PR curves for classification | Less informative than confusion matrix + per-class metrics for multi-class |
+| mAP for classification | Detection metric, not applicable to classification |
+| Bbox editing for classification | No bounding boxes in classification datasets |
+| IoU threshold controls for classification | No spatial matching in classification |
+
+## Traceability
+
+Which phases cover which requirements. Updated during roadmap creation.
+
+| Requirement | Phase | Status |
+|-------------|-------|--------|
+| INGEST-01 | — | Pending |
+| INGEST-02 | — | Pending |
+| INGEST-03 | — | Pending |
+| INGEST-04 | — | Pending |
+| DISP-01 | — | Pending |
+| DISP-02 | — | Pending |
+| DISP-03 | — | Pending |
+| DISP-04 | — | Pending |
+| EVAL-01 | — | Pending |
+| EVAL-02 | — | Pending |
+| EVAL-03 | — | Pending |
+| EVAL-04 | — | Pending |
+| EVAL-05 | — | Pending |
+| POLISH-01 | — | Pending |
+| POLISH-02 | — | Pending |
+| POLISH-03 | — | Pending |
+| POLISH-04 | — | Pending |
+
+**Coverage:**
+- v1.2 requirements: 17 total
+- Mapped to phases: 0
+- Unmapped: 17
+
+---
+*Requirements defined: 2026-02-18*
+*Last updated: 2026-02-18 after initial definition*

From 0b196c127b5ceb5c887c8865abf0687103bbd8ca Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 19:51:16 -0500
Subject: [PATCH 07/38] docs: create milestone v1.2 roadmap (3 phases)

---
 .planning/REQUIREMENTS.md | 40 ++++++++++++++++++------------------
 .planning/ROADMAP.md      | 43 +++++++++++++++++++++++++++++++++++++++
 .planning/STATE.md        | 28 +++++++++++++++++--------
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md
index d20a552..665e268 100644
--- a/.planning/REQUIREMENTS.md
+++ b/.planning/REQUIREMENTS.md
@@ -66,29 +66,29 @@ Which phases cover which requirements. Updated during roadmap creation.
 
 | Requirement | Phase | Status |
 |-------------|-------|--------|
-| INGEST-01 | — | Pending |
-| INGEST-02 | — | Pending |
-| INGEST-03 | — | Pending |
-| INGEST-04 | — | Pending |
-| DISP-01 | — | Pending |
-| DISP-02 | — | Pending |
-| DISP-03 | — | Pending |
-| DISP-04 | — | Pending |
-| EVAL-01 | — | Pending |
-| EVAL-02 | — | Pending |
-| EVAL-03 | — | Pending |
-| EVAL-04 | — | Pending |
-| EVAL-05 | — | Pending |
-| POLISH-01 | — | Pending |
-| POLISH-02 | — | Pending |
-| POLISH-03 | — | Pending |
-| POLISH-04 | — | Pending |
+| INGEST-01 | Phase 15 | Pending |
+| INGEST-02 | Phase 15 | Pending |
+| INGEST-03 | Phase 15 | Pending |
+| INGEST-04 | Phase 15 | Pending |
+| DISP-01 | Phase 15 | Pending |
+| DISP-02 | Phase 15 | Pending |
+| DISP-03 | Phase 15 | Pending |
+| DISP-04 | Phase 15 | Pending |
+| EVAL-01 | Phase 16 | Pending |
+| EVAL-02 | Phase 16 | Pending |
+| EVAL-03 | Phase 16 | Pending |
+| EVAL-04 | Phase 16 | Pending |
+| EVAL-05 | Phase 16 | Pending |
+| POLISH-01 | Phase 17 | Pending |
+| POLISH-02 | Phase 17 | Pending |
+| POLISH-03 | Phase 17 | Pending |
+| POLISH-04 | Phase 17 | Pending |
 
 **Coverage:**
 - v1.2 requirements: 17 total
-- Mapped to phases: 0
-- Unmapped: 17
+- Mapped to phases: 17
+- Unmapped: 0
 
 ---
 *Requirements defined: 2026-02-18*
-*Last updated: 2026-02-18 after initial definition*
+*Last updated: 2026-02-18 after roadmap creation*
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 20563c9..6c2a17a 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -4,6 +4,7 @@
 
 - v1.0 MVP - Phases 1-7 (shipped 2026-02-12) — [archive](.planning/milestones/v1.0-ROADMAP.md)
 - v1.1 Deployment, Workflow & Competitive Parity - Phases 8-14 (shipped 2026-02-13) — [archive](.planning/milestones/v1.1-ROADMAP.md)
+- v1.2 Classification Dataset Support - Phases 15-17 (in progress)
 
 ## Phases
 
@@ -73,6 +74,45 @@
 
 </details>
 
+### v1.2 Classification Dataset Support (In Progress)
+
+**Milestone Goal:** First-class single-label classification dataset support with full feature parity to detection workflows -- from ingestion through evaluation to polish.
+
+#### Phase 15: Classification Ingestion & Display
+**Goal**: Users can import, browse, and inspect classification datasets with the same ease as detection datasets
+**Depends on**: Phase 14 (existing codebase)
+**Requirements**: INGEST-01, INGEST-02, INGEST-03, INGEST-04, DISP-01, DISP-02, DISP-03, DISP-04
+**Success Criteria** (what must be TRUE):
+  1. User can point the ingestion wizard at a folder with JSONL annotations and images, and the system auto-detects it as a classification dataset
+  2. User can import multi-split classification datasets (train/valid/test) in a single operation, just like detection datasets
+  3. User sees class label badges on grid thumbnails instead of bounding box overlays when browsing a classification dataset
+  4. User sees GT class label prominently in the sample detail modal and can change it via a dropdown
+  5. Statistics dashboard shows classification-appropriate metrics (labeled images count, class distribution) with no detection-only elements visible (no bbox area histogram, no IoU slider)
+**Plans**: TBD
+
+#### Phase 16: Classification Evaluation
+**Goal**: Users can import predictions and analyze classification model performance with accuracy, F1, confusion matrix, and error categorization
+**Depends on**: Phase 15
+**Requirements**: EVAL-01, EVAL-02, EVAL-03, EVAL-04, EVAL-05
+**Success Criteria** (what must be TRUE):
+  1. User can import classification predictions in JSONL format with confidence scores and see them alongside ground truth
+  2. User sees accuracy, macro F1, weighted F1, and per-class precision/recall/F1 metrics in the evaluation panel
+  3. User sees a confusion matrix and can click any cell to filter the grid to images with that GT/predicted class pair
+  4. User sees each image categorized as correct, misclassified, or missing prediction in the error analysis view
+  5. User sees GT vs predicted label comparison on grid thumbnails and in the detail modal
+**Plans**: TBD
+
+#### Phase 17: Classification Polish
+**Goal**: Classification workflows are production-ready for high-cardinality datasets (43+ classes) with visual aids that surface actionable insights
+**Depends on**: Phase 16
+**Requirements**: POLISH-01, POLISH-02, POLISH-03, POLISH-04
+**Success Criteria** (what must be TRUE):
+  1. Confusion matrix renders readably at 43+ classes with threshold filtering and overflow handling
+  2. User can color the embedding scatter plot by GT class, predicted class, or correct/incorrect status
+  3. User sees a ranked list of most-confused class pairs derived from the confusion matrix
+  4. User sees per-class performance sparklines with color-coded thresholds (green/yellow/red) in the metrics table
+**Plans**: TBD
+
 ## Progress
 
 | Phase | Milestone | Plans Complete | Status | Completed |
@@ -91,3 +131,6 @@
 | 12. Interactive Viz & Discovery | v1.1 | 3/3 | Complete | 2026-02-13 |
 | 13. Keyboard Shortcuts | v1.1 | 2/2 | Complete | 2026-02-13 |
 | 14. Per-Annotation Triage | v1.1 | 3/3 | Complete | 2026-02-13 |
+| 15. Classification Ingestion & Display | v1.2 | 0/TBD | Not started | - |
+| 16. Classification Evaluation | v1.2 | 0/TBD | Not started | - |
+| 17. Classification Polish | v1.2 | 0/TBD | Not started | - |
diff --git a/.planning/STATE.md b/.planning/STATE.md
index 18f5796..f777f60 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -5,14 +5,16 @@
 See: .planning/PROJECT.md (updated 2026-02-18)
 
 **Core value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
-**Current focus:** v1.2 Classification Dataset Support
+**Current focus:** Phase 15 - Classification Ingestion & Display
 
 ## Current Position
 
-Phase: Not started (defining requirements)
-Plan: —
-Status: Defining requirements
-Last activity: 2026-02-18 — Milestone v1.2 started
+Phase: 15 of 17 (Classification Ingestion & Display)
+Plan: 0 of TBD in current phase
+Status: Ready to plan
+Last activity: 2026-02-18 -- Roadmap created for v1.2 milestone
+
+Progress: [##########################..] 88% (v1.0 + v1.1 complete, v1.2 starting)
 
 ## Performance Metrics
 
@@ -31,6 +33,12 @@ Last activity: 2026-02-18 — Milestone v1.2 started
 ### Decisions
 
 Decisions are logged in PROJECT.md Key Decisions table.
+Recent decisions affecting current work:
+
+- Schema approach: sentinel bbox values (0.0) over nullable columns -- avoids 30+ null guards
+- Separate classification evaluation function (~50 lines) vs modifying 560-line detection eval
+- Thread `datasetType` prop from page level, branch at component boundaries
+- Parser registry in IngestionService for format dispatch
 
 ### Pending Todos
 
@@ -38,15 +46,17 @@ None.
 
 ### Blockers/Concerns
 
-None active.
+- Confirm Roboflow JSONL format against actual export before finalizing parser
+- Confusion matrix at 43+ classes may need canvas rendering -- prototype early in Phase 17
 
 ### Roadmap Evolution
 
-- v1.0: 7 phases (1-7), 21 plans — shipped 2026-02-12
-- v1.1: 7 phases (8-14), 20 plans — shipped 2026-02-13
+- v1.0: 7 phases (1-7), 21 plans -- shipped 2026-02-12
+- v1.1: 7 phases (8-14), 20 plans -- shipped 2026-02-13
+- v1.2: 3 phases (15-17), TBD plans -- in progress
 
 ## Session Continuity
 
 Last session: 2026-02-18
-Stopped at: v1.2 milestone initialization
+Stopped at: Roadmap created for v1.2 milestone
 Resume file: None

From 875434679acd70c9b09aeecfa36a9ed9946531ac Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 20:47:04 -0500
Subject: [PATCH 08/38] docs(15): research phase domain

---
 .../15-RESEARCH.md                            | 555 ++++++++++++++++++
 1 file changed, 555 insertions(+)
 create mode 100644 .planning/phases/15-classification-ingestion-display/15-RESEARCH.md

diff --git a/.planning/phases/15-classification-ingestion-display/15-RESEARCH.md b/.planning/phases/15-classification-ingestion-display/15-RESEARCH.md
new file mode 100644
index 0000000..3d60dc5
--- /dev/null
+++ b/.planning/phases/15-classification-ingestion-display/15-RESEARCH.md
@@ -0,0 +1,555 @@
+# Phase 15: Classification Ingestion & Display - Research
+
+**Researched:** 2026-02-18
+**Domain:** Classification dataset ingestion, schema extension, frontend display adaptation
+**Confidence:** HIGH (this is an internal codebase extension, not new technology)
+
+## Summary
+
+Phase 15 adds classification dataset support to a codebase currently built exclusively for object detection. The work spans four layers: (1) a new JSONL annotation parser and format auto-detection in the ingestion pipeline, (2) schema changes to track dataset type and store classification annotations using sentinel bbox values, (3) frontend grid/modal display changes to show class labels instead of bounding boxes, and (4) statistics dashboard adaptation to hide detection-only metrics.
+
+The codebase is well-structured with clear separation of concerns -- parsers in `app/ingestion/`, Pydantic models in `app/models/`, services in `app/services/`, and component-per-feature in `frontend/src/components/`. The existing `BaseParser` ABC and streaming batch pattern provide a natural extension point for a classification JSONL parser. The sentinel bbox approach (bbox values = 0.0) means the annotations table schema is untouched, avoiding null guards in 30+ SQL queries and frontend components.
+
+**Primary recommendation:** Extend the existing parser registry pattern with a `ClassificationJSONLParser` that produces annotation rows with sentinel bbox values (0.0), add `dataset_type VARCHAR DEFAULT 'detection'` to the datasets table, and use the `datasetType` prop threaded from the page level to branch rendering at component boundaries (grid cell, sample modal, stats dashboard).
+
+## Standard Stack
+
+### Core (already in use -- no new dependencies)
+
+| Library | Purpose | Status |
+|---------|---------|--------|
+| DuckDB | Schema storage, SQL queries | In use |
+| FastAPI | API layer | In use |
+| Pydantic | Request/response models | In use |
+| ijson | Streaming JSON parsing | In use (COCO parser) |
+| pandas | DataFrame batch construction | In use |
+| Next.js + React | Frontend framework | In use |
+| Zustand | State management | In use |
+| TanStack Query | Data fetching/caching | In use |
+| Recharts | Charts (class distribution) | In use |
+
+### Supporting (no new libraries needed)
+
+This phase requires zero new dependencies. Classification JSONL files are simple enough to parse with Python's built-in `json` module line-by-line, or with the existing `ijson` dependency if streaming is desired. The frontend changes are pure React component branching.
+
+### Alternatives Considered
+
+| Instead of | Could Use | Tradeoff |
+|------------|-----------|----------|
+| Sentinel bbox (0.0) | Nullable bbox columns | Nullable requires 30+ null guards in SQL queries, filter builder, evaluation, frontend annotation types. Sentinel avoids this entirely. |
+| Separate classification_annotations table | Shared annotations table with sentinels | Separate table would require duplicating all annotation queries, filter logic, statistics queries. Shared table is simpler. |
+| Dynamic format detection at query time | Stored `dataset_type` column | Stored column is a single lookup; dynamic detection requires scanning annotations for non-zero bboxes every time. |
+
+## Architecture Patterns
+
+### Recommended Change Map
+
+```
+Backend:
+  app/ingestion/
+    classification_jsonl_parser.py  # NEW: ClassificationJSONLParser
+  app/services/
+    folder_scanner.py               # MODIFY: detect JSONL + images layout
+    ingestion.py                    # MODIFY: dispatch to parser by format
+    evaluation.py                   # LEAVE (classification eval is Phase 16+)
+  app/repositories/
+    duckdb_repo.py                  # MODIFY: add dataset_type column
+  app/models/
+    dataset.py                      # MODIFY: add dataset_type field
+    scan.py                         # MODIFY: format can be "classification_jsonl"
+    annotation.py                   # NO CHANGE (sentinel bbox values fit existing schema)
+    statistics.py                   # POSSIBLY MODIFY: add labeled_images_count
+  app/routers/
+    ingestion.py                    # MODIFY: error message wording
+    statistics.py                   # MODIFY: classification-aware summary stats
+
+Frontend:
+  types/dataset.ts                  # MODIFY: add dataset_type field
+  types/scan.ts                     # MODIFY: format can include "classification_jsonl"
+  app/datasets/[datasetId]/page.tsx # MODIFY: thread datasetType prop
+  components/grid/grid-cell.tsx     # MODIFY: show class badge instead of bbox overlay
+  components/grid/annotation-overlay.tsx  # NO CHANGE (just not rendered for classification)
+  components/detail/sample-modal.tsx      # MODIFY: show class label + dropdown
+  components/detail/annotation-list.tsx   # MODIFY: hide bbox columns for classification
+  components/stats/stats-dashboard.tsx    # MODIFY: hide detection-only tabs
+  components/stats/annotation-summary.tsx # MODIFY: classification-appropriate labels
+  components/ingest/scan-results.tsx      # MODIFY: show format badge for classification
+```
+
+### Pattern 1: Sentinel BBox Values for Classification
+
+**What:** Classification annotations use bbox_x=0, bbox_y=0, bbox_w=0, bbox_h=0, area=0 as sentinel values. The `category_name` field carries the class label. One annotation per sample (for single-label classification).
+
+**When to use:** When inserting classification annotations into the shared annotations table.
+
+**Example:**
+```python
+# Classification annotation row (sentinel bboxes)
+{
+    "id": str(uuid.uuid4()),
+    "dataset_id": dataset_id,
+    "sample_id": sample_id,
+    "category_name": "dog",         # The class label
+    "bbox_x": 0.0,                  # Sentinel
+    "bbox_y": 0.0,                  # Sentinel
+    "bbox_w": 0.0,                  # Sentinel
+    "bbox_h": 0.0,                  # Sentinel
+    "area": 0.0,                    # Sentinel
+    "is_crowd": False,
+    "source": "ground_truth",
+    "confidence": None,
+    "metadata": None,
+}
+```
+
+### Pattern 2: Parser Dispatch by Format
+
+**What:** The IngestionService currently hardcodes `COCOParser()`. Extend to dispatch by format string.
+
+**When to use:** When `ingest_with_progress` is called.
+
+**Example:**
+```python
+# In IngestionService.ingest_with_progress():
+if format == "coco":
+    parser = COCOParser(batch_size=1000)
+elif format == "classification_jsonl":
+    parser = ClassificationJSONLParser(batch_size=1000)
+else:
+    raise ValueError(f"Unsupported format: {format}")
+```
+
+### Pattern 3: Format Auto-Detection in FolderScanner
+
+**What:** The FolderScanner currently only detects COCO JSON files. Extend to detect classification JSONL files.
+
+**When to use:** During `FolderScanner.scan()`.
+
+**Detection heuristic:** Look for `.jsonl` files in the directory tree. A classification JSONL file contains lines like:
+```json
+{"filename": "image001.jpg", "label": "dog"}
+```
+Peek at the first few lines: if they parse as JSON with a `label` key and a `filename` (or `file_name`) key, and no `bbox`/`annotations` key, classify as `classification_jsonl`.
+
+**Example:**
+```python
+@staticmethod
+def _is_classification_jsonl(file_path: Path) -> bool:
+    """Check if a file is a classification JSONL annotation file."""
+    try:
+        with open(file_path) as f:
+            for i, line in enumerate(f):
+                if i >= 5:
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                if "label" in obj and ("filename" in obj or "file_name" in obj):
+                    if "bbox" not in obj and "annotations" not in obj:
+                        return True
+                return False
+        return False
+    except Exception:
+        return False
+```
+
+### Pattern 4: datasetType Prop Threading
+
+**What:** The dataset page fetches `dataset.dataset_type` and threads it as a prop to child components. Components branch at their boundary rather than deep inside.
+
+**When to use:** Any component whose rendering differs between detection and classification.
+
+**Example:**
+```tsx
+// page.tsx
+<ImageGrid datasetId={datasetId} datasetType={dataset.dataset_type} />
+<StatsDashboard datasetId={datasetId} datasetType={dataset.dataset_type} />
+<SampleModal datasetId={datasetId} samples={allSamples} datasetType={dataset.dataset_type} />
+
+// grid-cell.tsx
+if (datasetType === "classification") {
+  // Show class label badge instead of AnnotationOverlay
+  const gtAnnotation = annotations.find(a => a.source === "ground_truth");
+  return <ClassBadge label={gtAnnotation?.category_name} />;
+} else {
+  return <AnnotationOverlay ... />;
+}
+```
+
+### Anti-Patterns to Avoid
+
+- **Checking dataset_type deep inside components:** Branch at component boundaries (GridCell, SampleModal, StatsDashboard), not inside utility functions or hooks that are shared across both types.
+- **Adding nullable bbox columns:** The sentinel approach was a prior decision. Do not add nullable bbox columns to the annotations table.
+- **Modifying the existing 560-line evaluation.py:** Classification evaluation is separate (~50 lines, Phase 16+). Do not touch `evaluation.py` in this phase.
+- **Storing dataset_type on samples:** It belongs on the datasets table -- one type per dataset, not per sample.
+
+## Don't Hand-Roll
+
+| Problem | Don't Build | Use Instead | Why |
+|---------|-------------|-------------|-----|
+| JSONL parsing | Custom streaming parser | Python `json.loads()` per line | JSONL files are small enough (one line per image), no need for ijson streaming |
+| Image dimension reading | Manual PIL/cv2 calls | Existing `ImageService` | Already handles dimension extraction during thumbnail generation |
+| SQL schema migration | Migration framework | `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` | Already established pattern (see `duckdb_repo.py` lines 84-103) |
+| Frontend format badge | Custom badge component | Tailwind utility classes inline | Consistent with existing scan-results.tsx `splitColor()` pattern |
+
+**Key insight:** This phase is mostly wiring -- connecting an existing architecture to a new data shape. The risky parts are not technical but completeness: ensuring every SQL query, every frontend component, and every display path handles the classification case.
+
+## Common Pitfalls
+
+### Pitfall 1: JSONL Format Ambiguity
+
+**What goes wrong:** Different classification tools produce different JSONL schemas. Some use `"label"`, others `"class"`, `"category"`, or `"class_name"`. Some use `"filename"`, others `"file_name"`, `"image"`, or `"path"`.
+
+**Why it happens:** No industry standard for classification JSONL format.
+
+**How to avoid:** Support the most common key variants in the parser. Normalize on read:
+```python
+filename = obj.get("filename") or obj.get("file_name") or obj.get("image") or obj.get("path", "")
+label = obj.get("label") or obj.get("class") or obj.get("category") or obj.get("class_name", "unknown")
+```
+
+**Warning signs:** Parser silently produces zero annotations because key names don't match.
+
+### Pitfall 2: Classification Samples Without Annotations
+
+**What goes wrong:** If an image file exists in the directory but has no line in the JSONL, it gets inserted as a sample with zero annotations. The grid shows it with no badge, confusingly.
+
+**Why it happens:** JSONL may not list every image (unlabeled images are common in classification datasets).
+
+**How to avoid:** There are two options: (a) during ingestion, only insert samples that appear in the JSONL file, or (b) insert all images but mark unlabeled ones clearly in the UI. Decision: option (a), following the COCO parser pattern -- only insert samples listed in the annotation file.
+
+**Warning signs:** Image count in dataset doesn't match directory image count.
+
+### Pitfall 3: Detection-Only UI Elements Leaking Through
+
+**What goes wrong:** Classification datasets show bbox area histograms, IoU sliders, or empty bounding box overlays with sentinel values rendered as tiny dots at (0,0).
+
+**Why it happens:** Forgetting to gate UI elements on `datasetType`.
+
+**How to avoid:** Audit every component that references bbox values or detection-specific concepts:
+- `AnnotationOverlay` -- skip rendering when `datasetType === "classification"`
+- `annotation-list.tsx` -- hide Bounding Box and Area columns
+- `evaluation-panel.tsx` -- hide IoU slider, use accuracy instead of mAP
+- `stats-dashboard.tsx` -- hide detection-only tabs and elements
+- `annotation-summary.tsx` -- swap card labels (e.g., rename "GT Annotations" to "Labeled Images")
+
+**Warning signs:** Sentinel bbox values (0,0,0,0) rendered visually anywhere.
+
+### Pitfall 4: Category Ingestion for Classification
+
+**What goes wrong:** The COCO parser extracts categories from a dedicated `categories` array. Classification JSONL files don't have one -- categories are implicitly defined by the set of unique labels.
+
+**Why it happens:** Different format, different category discovery mechanism.
+
+**How to avoid:** The ClassificationJSONLParser must derive categories from the labels themselves: either do a first pass to collect unique labels and assign sequential category IDs, followed by a second pass to emit annotation batches, or do a single pass that registers labels as they are encountered.
+
+**Warning signs:** Empty categories table for classification datasets, breaking filter facets.
+
+### Pitfall 5: Multi-Label Classification Collision
+
+**What goes wrong:** If a future dataset has multiple labels per image, the single-annotation-per-sample assumption breaks.
+
+**Why it happens:** Single-label is the common case, but multi-label exists.
+
+**How to avoid:** For Phase 15, scope the JSONL parser to single-label only and document the multi-label extension path: a future version can handle `"label": ["dog", "outdoor"]` by emitting multiple annotation rows per sample. The sentinel bbox approach supports this naturally (each annotation row has its own category_name).
+
+**Warning signs:** JSONL lines with array-valued `label` fields.
+
+## Code Examples
+
+### Classification JSONL Parser Structure
+
+```python
+class ClassificationJSONLParser(BaseParser):
+    """Parse a JSONL file where each line maps filename -> class label."""
+
+    @property
+    def format_name(self) -> str:
+        return "classification_jsonl"
+
+    def parse_categories(self, file_path: Path) -> dict[int, str]:
+        """First pass: collect unique labels -> sequential IDs."""
+        labels: set[str] = set()
+        with open(file_path) as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                label = obj.get("label") or obj.get("class") or obj.get("category", "unknown")
+                labels.add(label)
+        return {i: name for i, name in enumerate(sorted(labels))}
+
+    def build_image_batches(
+        self, file_path: Path, dataset_id: str, split: str | None = None, image_dir: str = ""
+    ) -> Iterator[pd.DataFrame]:
+        """Yield sample rows from JSONL. Each line = one image."""
+        batch = []
+        for i, line in enumerate(open(file_path)):
+            line = line.strip()
+            if not line:
+                continue
+            obj = json.loads(line)
+            filename = obj.get("filename") or obj.get("file_name", "")
+            sample_id = f"{split}_{i}" if split else str(i)
+            batch.append({
+                "id": sample_id,
+                "dataset_id": dataset_id,
+                "file_name": filename,
+                "width": obj.get("width", 0),
+                "height": obj.get("height", 0),
+                "thumbnail_path": None,
+                "split": split,
+                "metadata": None,
+                "image_dir": image_dir,
+            })
+            if len(batch) >= self.batch_size:
+                yield pd.DataFrame(batch)
+                batch = []
+        if batch:
+            yield pd.DataFrame(batch)
+
+    def build_annotation_batches(
+        self, file_path: Path, dataset_id: str, categories: dict[int, str], split: str | None = None
+    ) -> Iterator[pd.DataFrame]:
+        """Yield annotation rows with sentinel bbox values."""
+        batch = []
+        cat_name_to_id = {v: k for k, v in categories.items()}
+        for i, line in enumerate(open(file_path)):
+            line = line.strip()
+            if not line:
+                continue
+            obj = json.loads(line)
+            label = obj.get("label") or obj.get("class") or obj.get("category", "unknown")
+            sample_id = f"{split}_{i}" if split else str(i)
+            ann_id = f"{split}_ann_{i}" if split else f"ann_{i}"
+            batch.append({
+                "id": ann_id,
+                "dataset_id": dataset_id,
+                "sample_id": sample_id,
+                "category_name": label,
+                "bbox_x": 0.0,
+                "bbox_y": 0.0,
+                "bbox_w": 0.0,
+                "bbox_h": 0.0,
+                "area": 0.0,
+                "is_crowd": False,
+                "source": "ground_truth",
+                "confidence": None,
+                "metadata": None,
+            })
+            if len(batch) >= self.batch_size:
+                yield pd.DataFrame(batch)
+                batch = []
+        if batch:
+            yield pd.DataFrame(batch)
+```
+
+### Schema Migration (DuckDB)
+
+```python
+# In duckdb_repo.py initialize_schema():
+self.connection.execute(
+    "ALTER TABLE datasets ADD COLUMN IF NOT EXISTS dataset_type VARCHAR DEFAULT 'detection'"
+)
+```
+
+### Frontend Class Badge (Grid Cell)
+
+```tsx
+// Inside GridCell, replacing AnnotationOverlay for classification datasets:
+function ClassBadge({ label }: { label?: string }) {
+  if (!label) return null;
+  return (
+    <div className="absolute bottom-1 left-1 z-10">
+      <span className="rounded bg-black/60 px-1.5 py-0.5 text-[10px] font-semibold text-white">
+        {label}
+      </span>
+    </div>
+  );
+}
+```
+
+### Frontend Class Label in Detail Modal
+
+```tsx
+// In SampleModal, for classification datasets:
+// Show GT class label prominently with dropdown to change it
+<div className="flex items-center gap-2">
+  <span className="text-sm font-medium text-zinc-500">Class:</span>
+  <select
+    value={gtAnnotation?.category_name ?? ""}
+    onChange={(e) => {
+      if (gtAnnotation) {
+        // Update the annotation's category_name
+        updateCategoryMutation.mutate({
+          annotationId: gtAnnotation.id,
+          category_name: e.target.value,
+        });
+      }
+    }}
+    className="rounded border border-zinc-300 px-2 py-1 text-sm"
+  >
+    {categories.map((cat) => (
+      <option key={cat} value={cat}>{cat}</option>
+    ))}
+  </select>
+</div>
+```
+
+### Classification-Aware Statistics Summary
+
+```tsx
+// In AnnotationSummary, swap card definitions based on datasetType:
+const DETECTION_CARDS = [
+  { key: "total_images", label: "Total Images" },
+  { key: "gt_annotations", label: "GT Annotations" },
+  { key: "pred_annotations", label: "Predictions" },
+  { key: "total_categories", label: "Categories" },
+];
+
+const CLASSIFICATION_CARDS = [
+  { key: "total_images", label: "Total Images" },
+  { key: "gt_annotations", label: "Labeled Images" },
+  { key: "pred_annotations", label: "Predictions" },
+  { key: "total_categories", label: "Classes" },
+];
+```
+
+## Existing Codebase Surface Area
+
+### Files That MUST Change
+
+| File | Change | Reason |
+|------|--------|--------|
+| `app/repositories/duckdb_repo.py` | Add `dataset_type` column | INGEST-04 |
+| `app/ingestion/classification_jsonl_parser.py` | NEW file | INGEST-01 |
+| `app/services/folder_scanner.py` | Detect JSONL layouts | INGEST-02 |
+| `app/services/ingestion.py` | Parser dispatch, store dataset_type | INGEST-01, INGEST-02 |
+| `app/models/dataset.py` | Add `dataset_type` to response | INGEST-04 |
+| `app/models/scan.py` | Format can be `classification_jsonl` | INGEST-02 |
+| `app/routers/datasets.py` | Return dataset_type in responses | INGEST-04 |
+| `frontend/src/types/dataset.ts` | Add `dataset_type` field | INGEST-04 |
+| `frontend/src/types/scan.ts` | Format type update | INGEST-02 |
+| `frontend/src/app/datasets/[datasetId]/page.tsx` | Thread `datasetType` prop | DISP-01 through DISP-04 |
+| `frontend/src/components/grid/grid-cell.tsx` | Show class badge for classification | DISP-01 |
+| `frontend/src/components/detail/sample-modal.tsx` | Show class label + dropdown | DISP-02, DISP-03 |
+| `frontend/src/components/detail/annotation-list.tsx` | Hide bbox columns for classification | DISP-02 |
+| `frontend/src/components/stats/stats-dashboard.tsx` | Hide detection-only tabs | DISP-04 |
+| `frontend/src/components/stats/annotation-summary.tsx` | Classification-appropriate labels | DISP-04 |
+| `frontend/src/components/ingest/scan-results.tsx` | Format badge for classification | INGEST-02 |
+
+### Files That SHOULD NOT Change
+
+| File | Reason |
+|------|--------|
+| `app/services/evaluation.py` | Detection evaluation untouched; classification eval is separate (future phase) |
+| `app/ingestion/coco_parser.py` | COCO format unchanged |
+| `app/ingestion/prediction_parser.py` | Detection predictions unchanged |
+| `app/services/error_analysis.py` | Detection-specific error categories |
+| `app/ingestion/detection_annotation_parser.py` | Detection predictions unchanged |
+
+### Backend API Changes Needed
+
+1. **New annotation update endpoint for category_name** (DISP-03): Currently `PUT /annotations/{id}` only updates bbox. Need to add `PATCH /annotations/{id}/category` or extend the existing PUT to accept `category_name`.
+
+2. **Statistics endpoint** (DISP-04): The `GET /datasets/{id}/statistics` endpoint returns detection-centric summary stats. For classification datasets, `gt_annotations` should reflect "labeled images" (distinct sample_ids with GT annotations) rather than raw annotation count.
+
+3. **Dataset response**: `GET /datasets/{id}` needs to include `dataset_type`.
+
+### Classification JSONL Expected Format
+
+```jsonl
+{"filename": "img001.jpg", "label": "cat"}
+{"filename": "img002.jpg", "label": "dog"}
+{"filename": "img003.jpg", "label": "cat"}
+```
+
+Alternative accepted keys:
+- `filename` / `file_name` / `image` / `path`
+- `label` / `class` / `category` / `class_name`
+- Optional: `width`, `height`, `confidence`, `split`
+
+### Folder Layouts to Detect
+
+**Layout D (Classification JSONL):** Split directories with JSONL + images:
+```
+dataset/
+  train/
+    annotations.jsonl
+    img001.jpg
+    img002.jpg
+  val/
+    annotations.jsonl
+    img003.jpg
+```
+
+**Layout E (Flat Classification):** Single JSONL at root:
+```
+dataset/
+  labels.jsonl
+  images/
+    img001.jpg
+    img002.jpg
+```
+
+## State of the Art
+
+| Old Approach | Current Approach | When Changed | Impact |
+|--------------|------------------|--------------|--------|
+| Hard-coded COCO parser | Parser dispatch by format string | Phase 15 | Enables multi-format support |
+| No dataset_type tracking | `dataset_type` column on datasets | Phase 15 | Frontend can branch rendering |
+| Detection-only statistics | Type-aware statistics | Phase 15 | Classification users see relevant metrics |
+
+## Open Questions
+
+1. **Image dimensions for classification JSONL**
+   - What we know: COCO JSON includes width/height per image. Classification JSONL typically doesn't.
+   - What's unclear: Should the parser read image dimensions from disk during ingestion, or store 0/0 and resolve later during thumbnail generation?
+   - Recommendation: Read dimensions during thumbnail generation (existing `ImageService` path). Store 0/0 initially if not present in JSONL. The grid cell uses `object-cover` which doesn't need dimensions. The annotation overlay (not used for classification) needs dimensions. Detail modal image loads at full-res naturally.
+
+2. **Multi-label classification**
+   - What we know: Phase 15 scopes to single-label. Multi-label is a future extension.
+   - What's unclear: Should the JSONL parser error on array labels or silently take the first?
+   - Recommendation: If `label` is an array, emit one annotation row per label. This is forward-compatible and costs nothing with the sentinel bbox approach.
+
+3. **Classification prediction import**
+   - What we know: Detection predictions use `DetectionAnnotationParser` or `PredictionParser`. Classification predictions would be a different format.
+   - What's unclear: Is classification prediction import in scope for Phase 15?
+   - Recommendation: Out of scope. Phase 15 focuses on GT ingestion and display. Classification prediction import + evaluation are natural follow-ups.
+
+4. **Annotation update for category_name change (DISP-03)**
+   - What we know: Current `AnnotationUpdate` model only has bbox fields. Current `PUT /annotations/{id}` only updates bbox.
+   - What's unclear: Should we extend the existing endpoint or create a new one?
+   - Recommendation: Add a new `PATCH /annotations/{id}/category` endpoint or extend `AnnotationUpdate` to include optional `category_name`. Extending is simpler since the existing pattern already handles updates. A new field `category_name: str | None = None` on AnnotationUpdate, applied when present, is clean.
+
+## Sources
+
+### Primary (HIGH confidence)
+- **Codebase analysis** -- direct file reads of all affected files listed above
+  - `app/ingestion/base_parser.py` -- BaseParser ABC interface
+  - `app/ingestion/coco_parser.py` -- reference parser implementation
+  - `app/repositories/duckdb_repo.py` -- schema and migration pattern
+  - `app/services/ingestion.py` -- ingestion orchestration
+  - `app/services/folder_scanner.py` -- format detection heuristics
+  - `app/services/evaluation.py` -- evaluation pipeline (560 lines, leave alone)
+  - `app/models/` -- all Pydantic models
+  - `app/routers/` -- all API endpoints
+  - `frontend/src/components/` -- all display components
+  - `frontend/src/types/` -- all TypeScript type definitions
+  - `frontend/src/stores/` -- Zustand stores (filter, UI, ingest)
+
+### Secondary (MEDIUM confidence)
+- Prior decisions from phase description: sentinel bbox values, separate classification eval function, datasetType prop threading, parser registry
+
+## Metadata
+
+**Confidence breakdown:**
+- Standard stack: HIGH -- no new dependencies, all existing libraries
+- Architecture: HIGH -- extending well-established patterns in the codebase
+- Pitfalls: HIGH -- derived from direct codebase analysis, not external sources
+- Code examples: HIGH -- based on actual codebase patterns and verified file contents
+
+**Research date:** 2026-02-18
+**Valid until:** 2026-03-18 (stable -- internal codebase patterns, no external dependency risk)

From 6868104af3440fbb7fabbde05ced0b2ac76c1d69 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 20:51:52 -0500
Subject: [PATCH 09/38] docs(15): create phase plan

---
 .planning/ROADMAP.md                          |   7 +-
 .../15-01-PLAN.md                             | 252 ++++++++++++++++++
 .../15-02-PLAN.md                             | 231 ++++++++++++++++
 3 files changed, 488 insertions(+), 2 deletions(-)
 create mode 100644 .planning/phases/15-classification-ingestion-display/15-01-PLAN.md
 create mode 100644 .planning/phases/15-classification-ingestion-display/15-02-PLAN.md

diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 6c2a17a..b084712 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -88,7 +88,10 @@
   3. User sees class label badges on grid thumbnails instead of bounding box overlays when browsing a classification dataset
   4. User sees GT class label prominently in the sample detail modal and can change it via a dropdown
   5. Statistics dashboard shows classification-appropriate metrics (labeled images count, class distribution) with no detection-only elements visible (no bbox area histogram, no IoU slider)
-**Plans**: TBD
+**Plans**: 2 plans
+Plans:
+- [ ] 15-01-PLAN.md -- Backend: schema migration, ClassificationJSONLParser, FolderScanner detection, IngestionService dispatch, API endpoints
+- [ ] 15-02-PLAN.md -- Frontend: type updates, grid class badges, detail modal class label/dropdown, classification-aware statistics
 
 #### Phase 16: Classification Evaluation
 **Goal**: Users can import predictions and analyze classification model performance with accuracy, F1, confusion matrix, and error categorization
@@ -131,6 +134,6 @@
 | 12. Interactive Viz & Discovery | v1.1 | 3/3 | Complete | 2026-02-13 |
 | 13. Keyboard Shortcuts | v1.1 | 2/2 | Complete | 2026-02-13 |
 | 14. Per-Annotation Triage | v1.1 | 3/3 | Complete | 2026-02-13 |
-| 15. Classification Ingestion & Display | v1.2 | 0/TBD | Not started | - |
+| 15. Classification Ingestion & Display | v1.2 | 0/2 | Planned | - |
 | 16. Classification Evaluation | v1.2 | 0/TBD | Not started | - |
 | 17. Classification Polish | v1.2 | 0/TBD | Not started | - |
diff --git a/.planning/phases/15-classification-ingestion-display/15-01-PLAN.md b/.planning/phases/15-classification-ingestion-display/15-01-PLAN.md
new file mode 100644
index 0000000..653a486
--- /dev/null
+++ b/.planning/phases/15-classification-ingestion-display/15-01-PLAN.md
@@ -0,0 +1,252 @@
+---
+phase: 15-classification-ingestion-display
+plan: 01
+type: execute
+wave: 1
+depends_on: []
+files_modified:
+  - app/repositories/duckdb_repo.py
+  - app/models/dataset.py
+  - app/models/scan.py
+  - app/ingestion/base_parser.py
+  - app/ingestion/classification_jsonl_parser.py
+  - app/services/folder_scanner.py
+  - app/services/ingestion.py
+  - app/routers/ingestion.py
+  - app/routers/datasets.py
+  - app/routers/annotations.py
+  - app/routers/statistics.py
+autonomous: true
+
+must_haves:
+  truths:
+    - "POST /ingestion/scan on a folder with JSONL + images returns format='classification_jsonl' with correct splits"
+    - "POST /ingestion/import with classification_jsonl splits creates dataset with dataset_type='classification' and annotations with sentinel bbox values (0.0)"
+    - "GET /datasets/{id} returns dataset_type field"
+    - "PATCH /annotations/{id}/category updates category_name for classification label editing"
+    - "Classification annotations have bbox_x=0, bbox_y=0, bbox_w=0, bbox_h=0, area=0 as sentinel values"
+  artifacts:
+    - path: "app/ingestion/classification_jsonl_parser.py"
+      provides: "ClassificationJSONLParser extending BaseParser"
+      contains: "class ClassificationJSONLParser"
+    - path: "app/repositories/duckdb_repo.py"
+      provides: "dataset_type column migration"
+      contains: "dataset_type"
+    - path: "app/services/folder_scanner.py"
+      provides: "Classification JSONL layout detection"
+      contains: "classification_jsonl"
+  key_links:
+    - from: "app/services/folder_scanner.py"
+      to: "app/models/scan.py"
+      via: "ScanResult with format='classification_jsonl'"
+      pattern: "classification_jsonl"
+    - from: "app/services/ingestion.py"
+      to: "app/ingestion/classification_jsonl_parser.py"
+      via: "parser dispatch by format string"
+      pattern: "ClassificationJSONLParser"
+    - from: "app/services/ingestion.py"
+      to: "app/repositories/duckdb_repo.py"
+      via: "stores dataset_type on dataset record"
+      pattern: "dataset_type"
+---
+
+<objective>
+Add classification dataset ingestion support to the backend: schema migration, JSONL parser, folder scanner detection, parser dispatch, and annotation category update endpoint.
+
+Purpose: Enable the system to auto-detect, parse, and store classification datasets using the existing ingestion pipeline with sentinel bbox values. This is the backend foundation for all classification display work.
+Output: ClassificationJSONLParser, extended FolderScanner, updated IngestionService with parser dispatch, dataset_type column, and category update endpoint.
+</objective>
+
+<execution_context>
+@/Users/ortizeg/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/ortizeg/.claude/get-shit-done/templates/summary.md
+</execution_context>
+
+<context>
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/phases/15-classification-ingestion-display/15-RESEARCH.md
+@app/ingestion/base_parser.py
+@app/ingestion/coco_parser.py
+@app/services/folder_scanner.py
+@app/services/ingestion.py
+@app/repositories/duckdb_repo.py
+@app/models/dataset.py
+@app/models/scan.py
+@app/routers/ingestion.py
+@app/routers/datasets.py
+@app/routers/annotations.py
+</context>
+
+<tasks>
+
+<task type="auto">
+  <name>Task 1: Schema migration, Pydantic models, and ClassificationJSONLParser</name>
+  <files>
+    app/repositories/duckdb_repo.py
+    app/models/dataset.py
+    app/models/scan.py
+    app/ingestion/base_parser.py
+    app/ingestion/classification_jsonl_parser.py
+  </files>
+  <action>
+    **1. Schema migration** (`app/repositories/duckdb_repo.py`):
+    Add after existing ALTER TABLE statements in `initialize_schema()`:
+    ```python
+    self.connection.execute(
+        "ALTER TABLE datasets ADD COLUMN IF NOT EXISTS dataset_type VARCHAR DEFAULT 'detection'"
+    )
+    ```
+
+    **2. Pydantic models**:
+    - `app/models/dataset.py`: Add `dataset_type: str = "detection"` field to `DatasetResponse`.
+    - `app/models/scan.py`: No change needed to `ScanResult` -- `ScanResult.format` already accepts any string and will carry `"classification_jsonl"` for classification datasets. (`ImportRequest` in this same file gains a `format` field in Task 2.)
+
+    **3. BaseParser update** (`app/ingestion/base_parser.py`):
+    Add `image_dir: str = ""` parameter to `build_image_batches` abstract method signature if not already present (check COCOParser -- it already has it in the concrete method, ensure the ABC matches).
+
+    **4. ClassificationJSONLParser** (`app/ingestion/classification_jsonl_parser.py` -- NEW FILE):
+    Create parser extending BaseParser with:
+
+    - `format_name` property returns `"classification_jsonl"`
+    - `parse_categories(file_path)`: Single pass over JSONL, collect unique labels from flexible keys (`label`, `class`, `category`, `class_name`). Return `{i: name for i, name in enumerate(sorted(labels))}`.
+    - `build_image_batches(file_path, dataset_id, split, image_dir)`: Read JSONL line by line. For each line, extract filename from flexible keys (`filename`, `file_name`, `image`, `path`). Generate sample_id as `f"{split}_{i}"` if split else `str(i)`. Yield DataFrames with columns matching samples table: `id, dataset_id, file_name, width, height, thumbnail_path, split, metadata, image_dir`. Set width=0, height=0 (resolved during thumbnail generation). Use `self.batch_size` for batching.
+    - `build_annotation_batches(file_path, dataset_id, categories, split)`: Read JSONL again. For each line, extract label using same flexible keys. Create annotation row with sentinel bbox values: `bbox_x=0.0, bbox_y=0.0, bbox_w=0.0, bbox_h=0.0, area=0.0, is_crowd=False, source="ground_truth", confidence=None, metadata=None`. Sample IDs must match those from `build_image_batches` (same `f"{split}_{i}"` pattern). Annotation IDs: `f"{split}_ann_{i}"` or `f"ann_{i}"`.
+
+    Handle edge cases:
+    - Skip empty lines
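+    - If `label` is an array, emit one annotation row per label (forward-compatible for multi-label) and suffix each annotation ID with the label's index (e.g. `f"{split}_ann_{i}_{j}"`) so IDs stay unique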
+    - Use `"unknown"` as fallback label if no label key found
+  </action>
+  <verify>
+    - `python -c "from app.ingestion.classification_jsonl_parser import ClassificationJSONLParser; p = ClassificationJSONLParser(); print(p.format_name)"` prints `classification_jsonl`
+    - `python -c "from app.models.dataset import DatasetResponse; print(DatasetResponse.model_fields.keys())"` includes `dataset_type`
+    - All existing tests pass: `cd app && python -m pytest tests/ -x -q`
+  </verify>
+  <done>ClassificationJSONLParser exists with parse_categories, build_image_batches, build_annotation_batches producing sentinel bbox annotations. DatasetResponse includes dataset_type. Schema migration adds dataset_type column.</done>
+</task>
+
+<task type="auto">
+  <name>Task 2: FolderScanner detection, IngestionService dispatch, and API endpoints</name>
+  <files>
+    app/services/folder_scanner.py
+    app/services/ingestion.py
+    app/routers/ingestion.py
+    app/routers/datasets.py
+    app/routers/annotations.py
+    app/routers/statistics.py
+  </files>
+  <action>
+    **1. FolderScanner** (`app/services/folder_scanner.py`):
+    Extend `scan()` to detect classification JSONL layouts BEFORE trying COCO layouts (classification is more specific -- a JSONL file is never COCO):
+
+    In the local scan path, add before `_try_layout_b`:
+    ```python
+    splits = self._try_layout_d(Path(resolved), warnings)
+    if not splits:
+        splits = self._try_layout_e(Path(resolved), warnings)
+    if splits:
+        return ScanResult(
+            root_path=resolved,
+            dataset_name=_basename(resolved),
+            format="classification_jsonl",
+            splits=splits,
+            warnings=warnings,
+        )
+    ```
+
+    Add two new layout detectors:
+
+    `_try_layout_d(root, warnings)` -- **Split directories with JSONL + images**:
+    - Use existing `_detect_split_dirs()` to find split dirs
+    - In each split dir, look for `.jsonl` files
+    - For each `.jsonl` file, call `_is_classification_jsonl(file_path)` (new static method)
+    - If valid, count images in the split dir, create DetectedSplit
+    - Return list of splits
+
+    `_try_layout_e(root, warnings)` -- **Flat JSONL at root**:
+    - Look for `.jsonl` files in root (no recursion)
+    - Check if any are classification JSONL via `_is_classification_jsonl()`
+    - Image dir: prefer `images/` subdir, else root itself
+    - Return single-element split list with name=root.name
+
+    `_is_classification_jsonl(file_path)` -- **Static method**:
+    - Open file, read first 5 non-empty lines
+    - Parse each as JSON
+    - Return True if line has (`filename` or `file_name` or `image` or `path`) AND (`label` or `class` or `category` or `class_name`) AND NOT (`bbox` or `annotations`)
+    - Catch all exceptions, return False
+
+    For GCS: Add similar classification detection in `_scan_gcs()` -- check for `.jsonl` files before `.json` files. Use `_is_classification_jsonl_remote()` that reads via `self.storage.open()`.
+
+    **2. IngestionService** (`app/services/ingestion.py`):
+    - Add import for ClassificationJSONLParser at top
+    - In `ingest_with_progress()`, replace hardcoded `COCOParser(batch_size=1000)` with format-based dispatch:
+      ```python
+      if format == "coco":
+          parser = COCOParser(batch_size=1000)
+      elif format == "classification_jsonl":
+          parser = ClassificationJSONLParser(batch_size=1000)
+      else:
+          raise ValueError(f"Unsupported format: {format}")
+      ```
+    - At the dataset INSERT (step 4), derive dataset_type for new datasets before building the statement:
+      ```python
+      dataset_type = "classification" if format == "classification_jsonl" else "detection"
+      ```
+      Include `dataset_type` in the INSERT VALUES. Update the INSERT statement to include the new column. For the UPDATE path (existing dataset), no change needed -- dataset_type is set on first insert.
+    - Update the existing INSERT INTO datasets to include `dataset_type` column. The INSERT currently uses positional VALUES -- add `dataset_type` after `prediction_count` (or adjust column list). Be careful to match column order.
+
+    **3. Ingestion router** (`app/routers/ingestion.py`):
+    - The `/ingestion/import` endpoint must pass `format` through to `ingest_with_progress`; it may not do so currently. Ensure the ImportRequest or the stored ScanResult format is threaded through. The simplest approach: add `format: str = "coco"` field to `ImportRequest` model in `app/models/scan.py`, then pass it in the ingestion router's import endpoint to `ingest_with_progress()`.
+
+    **4. Datasets router** (`app/routers/datasets.py`):
+    - Ensure the `GET /datasets` and `GET /datasets/{id}` queries include `dataset_type` in SELECT. Currently using `SELECT *` or explicit columns -- add `dataset_type` to the result mapping into `DatasetResponse`.
+
+    **5. Annotations router** (`app/routers/annotations.py`):
+    - Add a new endpoint: `PATCH /annotations/{annotation_id}/category` accepting `{"category_name": "new_label"}`. It should UPDATE the annotation's `category_name` in DuckDB. Return 200 with the updated annotation. Use a simple Pydantic model `CategoryUpdateRequest(BaseModel): category_name: str`.
+
+    **6. Statistics router** (`app/routers/statistics.py`):
+    - For classification datasets, the `gt_annotations` stat should reflect "labeled images" (count of distinct sample_ids with GT annotations) rather than raw annotation count. Check `dataset_type` from the datasets table, and if `"classification"`, adjust the query. This is a minor conditional in the existing statistics aggregation.
+  </action>
+  <verify>
+    - Create a test JSONL file and verify scanner detection:
+      ```bash
+      mkdir -p /tmp/test_cls/train && echo '{"filename": "a.jpg", "label": "cat"}' > /tmp/test_cls/train/annotations.jsonl && touch /tmp/test_cls/train/a.jpg
+      python -c "
+      from app.services.folder_scanner import FolderScanner
+      s = FolderScanner()
+      r = s.scan('/tmp/test_cls')
+      print(f'format={r.format}, splits={len(r.splits)}, split_name={r.splits[0].name if r.splits else None}')
+      assert r.format == 'classification_jsonl'
+      print('PASS')
+      "
+      ```
+    - All existing tests pass: `cd app && python -m pytest tests/ -x -q`
+    - Server starts without errors: `cd app && timeout 5 python -c "from app.main import app; print('OK')" 2>&1 || true`
+  </verify>
+  <done>FolderScanner detects classification JSONL layouts (D and E). IngestionService dispatches to ClassificationJSONLParser for classification_jsonl format and stores dataset_type. ImportRequest carries format. PATCH /annotations/{id}/category endpoint exists. Statistics endpoint is classification-aware. GET /datasets returns dataset_type. All existing tests pass.</done>
+</task>
+
+</tasks>
+
+<verification>
+1. Scanner returns format="classification_jsonl" for split-dir JSONL layout
+2. Scanner returns format="classification_jsonl" for flat JSONL layout
+3. Scanner still returns format="coco" for existing COCO layouts (no regression)
+4. Dataset INSERT includes dataset_type="classification" for classification imports
+5. DatasetResponse includes dataset_type field
+6. PATCH /annotations/{id}/category updates category_name
+7. All existing tests pass
+</verification>
+
+<success_criteria>
+- Classification JSONL folders are auto-detected by the scanner
+- Parser produces correct annotations with sentinel bbox values
+- dataset_type is stored and returned via API
+- Category update endpoint works for classification label editing
+- Zero regressions in existing detection workflow
+</success_criteria>
+
+<output>
+After completion, create `.planning/phases/15-classification-ingestion-display/15-01-SUMMARY.md`
+</output>
diff --git a/.planning/phases/15-classification-ingestion-display/15-02-PLAN.md b/.planning/phases/15-classification-ingestion-display/15-02-PLAN.md
new file mode 100644
index 0000000..9e29888
--- /dev/null
+++ b/.planning/phases/15-classification-ingestion-display/15-02-PLAN.md
@@ -0,0 +1,231 @@
+---
+phase: 15-classification-ingestion-display
+plan: 02
+type: execute
+wave: 2
+depends_on: ["15-01"]
+files_modified:
+  - frontend/src/types/dataset.ts
+  - frontend/src/types/scan.ts
+  - frontend/src/app/datasets/[datasetId]/page.tsx
+  - frontend/src/components/grid/grid-cell.tsx
+  - frontend/src/components/grid/image-grid.tsx
+  - frontend/src/components/detail/sample-modal.tsx
+  - frontend/src/components/detail/annotation-list.tsx
+  - frontend/src/components/stats/stats-dashboard.tsx
+  - frontend/src/components/stats/annotation-summary.tsx
+  - frontend/src/components/ingest/scan-results.tsx
+autonomous: true
+
+must_haves:
+  truths:
+    - "User sees class label badges on grid thumbnails for classification datasets instead of bbox overlays"
+    - "User sees GT class label prominently in sample detail modal with a dropdown to change it"
+    - "Statistics dashboard shows 'Labeled Images' and 'Classes' instead of 'GT Annotations' and 'Categories' for classification datasets"
+    - "Detection-only elements (bbox area histogram, IoU slider) are hidden for classification datasets"
+    - "Scan results page shows 'Classification JSONL' format badge for classification datasets"
+  artifacts:
+    - path: "frontend/src/types/dataset.ts"
+      provides: "Dataset type with dataset_type field"
+      contains: "dataset_type"
+    - path: "frontend/src/components/grid/grid-cell.tsx"
+      provides: "ClassBadge rendering for classification datasets"
+      contains: "ClassBadge"
+    - path: "frontend/src/components/detail/sample-modal.tsx"
+      provides: "Class label display and dropdown editor"
+      contains: "classification"
+    - path: "frontend/src/components/stats/stats-dashboard.tsx"
+      provides: "Detection-only tab hiding for classification"
+      contains: "datasetType"
+  key_links:
+    - from: "frontend/src/app/datasets/[datasetId]/page.tsx"
+      to: "frontend/src/components/grid/image-grid.tsx"
+      via: "datasetType prop threading"
+      pattern: "datasetType"
+    - from: "frontend/src/components/grid/grid-cell.tsx"
+      to: "frontend/src/types/dataset.ts"
+      via: "dataset_type determines badge vs overlay"
+      pattern: "classification"
+    - from: "frontend/src/components/detail/sample-modal.tsx"
+      to: "PATCH /annotations/{id}/category"
+      via: "category update mutation"
+      pattern: "category"
+---
+
+<objective>
+Adapt the frontend to display classification datasets appropriately: class label badges on grid, class label with dropdown in detail modal, classification-aware statistics, and format badge in scan results.
+
+Purpose: Users browsing classification datasets see class-appropriate UI instead of detection-oriented displays (no bbox overlays, no area histograms). Classification labels are the primary annotation visual.
+Output: Updated grid, modal, stats, and scan results components with datasetType-aware branching.
+</objective>
+
+<execution_context>
+@/Users/ortizeg/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/ortizeg/.claude/get-shit-done/templates/summary.md
+</execution_context>
+
+<context>
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/phases/15-classification-ingestion-display/15-RESEARCH.md
+@.planning/phases/15-classification-ingestion-display/15-01-SUMMARY.md
+@frontend/src/types/dataset.ts
+@frontend/src/types/scan.ts
+@frontend/src/app/datasets/[datasetId]/page.tsx
+@frontend/src/components/grid/grid-cell.tsx
+@frontend/src/components/grid/image-grid.tsx
+@frontend/src/components/detail/sample-modal.tsx
+@frontend/src/components/detail/annotation-list.tsx
+@frontend/src/components/stats/stats-dashboard.tsx
+@frontend/src/components/stats/annotation-summary.tsx
+@frontend/src/components/ingest/scan-results.tsx
+</context>
+
+<tasks>
+
+<task type="auto">
+  <name>Task 1: Types, page threading, grid class badges, and scan results format badge</name>
+  <files>
+    frontend/src/types/dataset.ts
+    frontend/src/types/scan.ts
+    frontend/src/app/datasets/[datasetId]/page.tsx
+    frontend/src/components/grid/grid-cell.tsx
+    frontend/src/components/grid/image-grid.tsx
+    frontend/src/components/ingest/scan-results.tsx
+  </files>
+  <action>
+    **1. TypeScript types**:
+    - `frontend/src/types/dataset.ts`: Add `dataset_type: string;` to the `Dataset` interface (after `prediction_count`). The backend defaults this value to `"detection"`.
+    - `frontend/src/types/scan.ts`: No structural change needed -- `ScanResult.format` is already a string and will carry `"classification_jsonl"`.
+
+    **2. Dataset page prop threading** (`frontend/src/app/datasets/[datasetId]/page.tsx`):
+    - The page fetches the dataset object which now includes `dataset_type`.
+    - Thread `datasetType={dataset.dataset_type}` as a prop to `<ImageGrid>`, `<SampleModal>`, and `<StatsDashboard>` (and any stats sub-components that need it).
+    - Also thread it to any component that renders differently for classification vs detection.
+
+    **3. Grid class badges** (`frontend/src/components/grid/grid-cell.tsx`):
+    - Add `datasetType?: string` prop to GridCell.
+    - When `datasetType === "classification"`:
+      - Do NOT render `<AnnotationOverlay>` (skip bbox rendering entirely)
+      - Instead render a `ClassBadge` inline component:
+        ```tsx
+        function ClassBadge({ label }: { label?: string }) {
+          if (!label) return null;
+          return (
+            <div className="absolute bottom-1 left-1 z-10">
+              <span className="rounded bg-black/60 px-1.5 py-0.5 text-[10px] font-semibold text-white">
+                {label}
+              </span>
+            </div>
+          );
+        }
+        ```
+      - Extract the GT annotation's `category_name` from the annotations map for this sample: `const gtAnnotation = annotations?.find(a => a.source === "ground_truth");`
+      - Render `<ClassBadge label={gtAnnotation?.category_name} />`
+    - When `datasetType !== "classification"` (or undefined): render existing `<AnnotationOverlay>` as before (no change).
+
+    **4. ImageGrid prop threading** (`frontend/src/components/grid/image-grid.tsx`):
+    - Add `datasetType?: string` prop to ImageGrid.
+    - Pass it through to each `<GridCell datasetType={datasetType} />`.
+
+    **5. Scan results format badge** (`frontend/src/components/ingest/scan-results.tsx`):
+    - Where the format is displayed, show "Classification JSONL" when `format === "classification_jsonl"` and "COCO" when `format === "coco"`.
+    - Use the existing badge/styling pattern (likely a colored span). Example: a small badge showing the format type near the dataset name.
+  </action>
+  <verify>
+    - `cd frontend && npx tsc --noEmit` passes without errors
+    - `cd frontend && npm run build` succeeds
+    - Grep confirms: `grep -r "ClassBadge" frontend/src/components/grid/grid-cell.tsx`
+    - Grep confirms: `grep -r "datasetType" frontend/src/app/datasets/*/page.tsx`
+  </verify>
+  <done>Dataset type flows from API through page to grid. Classification datasets show class label badges instead of bbox overlays. Scan results show format badge. TypeScript compiles cleanly.</done>
+</task>
+
+<task type="auto">
+  <name>Task 2: Detail modal class label display/edit and classification-aware statistics</name>
+  <files>
+    frontend/src/components/detail/sample-modal.tsx
+    frontend/src/components/detail/annotation-list.tsx
+    frontend/src/components/stats/stats-dashboard.tsx
+    frontend/src/components/stats/annotation-summary.tsx
+  </files>
+  <action>
+    **1. Sample modal** (`frontend/src/components/detail/sample-modal.tsx`):
+    - Add `datasetType?: string` prop.
+    - When `datasetType === "classification"`:
+      - Show a prominent class label section above or instead of the annotation overlay. Display format:
+        ```
+        Class: [dropdown with all categories]
+        ```
+      - Extract GT annotation: `const gtAnnotation = annotations?.find(a => a.source === "ground_truth");`
+      - If predictions exist, also show: `Predicted: [predicted class label]` with confidence if available.
+      - The class dropdown uses the categories list (from `useFilterFacets` or a categories fetch). On change, call `PATCH /annotations/{gtAnnotation.id}/category` with the new `category_name`.
+      - Create a TanStack Query mutation hook inline or in a hooks file: `usePatchCategory` that calls `apiPatch(\`/annotations/\${annotationId}/category\`, { category_name })` and invalidates the annotation queries on success.
+      - Do NOT render the annotation overlay / bounding box editor for classification datasets. Hide the bbox editing canvas (react-konva editor). The image should display without any overlay.
+    - When `datasetType !== "classification"`: render everything as before (no change).
+
+    **2. Annotation list** (`frontend/src/components/detail/annotation-list.tsx`):
+    - Add `datasetType?: string` prop.
+    - When `datasetType === "classification"`:
+      - Hide the Bounding Box columns (bbox_x, bbox_y, bbox_w, bbox_h) and Area column from the table.
+      - Show: Class, Source, Confidence columns only.
+    - When detection: show all columns as before.
+
+    **3. Stats dashboard** (`frontend/src/components/stats/stats-dashboard.tsx`):
+    - Add `datasetType?: string` prop.
+    - When `datasetType === "classification"`:
+      - Hide the "Evaluation" tab entirely (no IoU-based evaluation for classification in this phase).
+      - Hide the "Error Analysis" sub-panel (detection-specific error categories: TP/FP/FN based on IoU).
+      - Keep: Class Distribution chart, Split Breakdown chart, Summary cards (with relabeled metrics).
+      - Hide: Any bbox area histogram or IoU-related controls.
+    - When detection: show all tabs/panels as before.
+
+    **4. Annotation summary** (`frontend/src/components/stats/annotation-summary.tsx`):
+    - Add `datasetType?: string` prop.
+    - When `datasetType === "classification"`:
+      - Swap summary card labels:
+        - "GT Annotations" -> "Labeled Images"
+        - "Categories" -> "Classes"
+      - Keep "Total Images" and "Predictions" labels as-is.
+    - When detection: show original labels.
+    - Use a conditional card definitions array pattern:
+      ```tsx
+      const cards = datasetType === "classification"
+        ? CLASSIFICATION_CARDS
+        : DETECTION_CARDS;
+      ```
+  </action>
+  <verify>
+    - `cd frontend && npx tsc --noEmit` passes without errors
+    - `cd frontend && npm run build` succeeds
+    - Grep confirms classification branching: `grep -r "classification" frontend/src/components/detail/sample-modal.tsx`
+    - Grep confirms stats adaptation: `grep -r "classification" frontend/src/components/stats/stats-dashboard.tsx`
+    - Grep confirms annotation-summary adaptation: `grep -r "Labeled Images" frontend/src/components/stats/annotation-summary.tsx`
+  </verify>
+  <done>Detail modal shows class label with editable dropdown for classification datasets. Annotation list hides bbox columns for classification. Stats dashboard hides detection-only tabs/panels. Summary cards use classification-appropriate labels. PATCH mutation for category update wired. All TypeScript compiles cleanly.</done>
+</task>
+
+</tasks>
+
+<verification>
+1. Classification dataset grid shows class label badges (no bbox overlays)
+2. Detail modal shows "Class: [dropdown]" for classification, with working category edit
+3. Stats dashboard hides Evaluation tab and Error Analysis for classification
+4. Summary cards show "Labeled Images" and "Classes" for classification
+5. Annotation list hides bbox/area columns for classification
+6. Scan results show "Classification JSONL" format badge
+7. Detection datasets are completely unaffected (no regression)
+8. TypeScript type-check (`tsc --noEmit`) and Next.js build both succeed
+</verification>
+
+<success_criteria>
+- Classification datasets display class badges on grid thumbnails
+- Detail modal has class label display with dropdown editor that persists changes
+- Statistics dashboard shows only classification-relevant metrics
+- Detection workflow is unchanged
+- Frontend builds without errors
+</success_criteria>
+
+<output>
+After completion, create `.planning/phases/15-classification-ingestion-display/15-02-SUMMARY.md`
+</output>

From 9286e8ec5c6291cfcc104f43e04be7c14e8ba270 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 20:56:20 -0500
Subject: [PATCH 10/38] fix(15): revise plans based on checker feedback

---
 .../15-classification-ingestion-display/15-01-PLAN.md       | 6 +++++-
 .../15-classification-ingestion-display/15-02-PLAN.md       | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.planning/phases/15-classification-ingestion-display/15-01-PLAN.md b/.planning/phases/15-classification-ingestion-display/15-01-PLAN.md
index 653a486..be35187 100644
--- a/.planning/phases/15-classification-ingestion-display/15-01-PLAN.md
+++ b/.planning/phases/15-classification-ingestion-display/15-01-PLAN.md
@@ -197,7 +197,11 @@ Output: ClassificationJSONLParser, extended FolderScanner, updated IngestionServ
     - Update the existing INSERT INTO datasets to include `dataset_type` column. The INSERT currently uses positional VALUES -- add `dataset_type` after `prediction_count` (or adjust column list). Be careful to match column order.
 
     **3. Ingestion router** (`app/routers/ingestion.py`):
-    - The `/ingestion/import` endpoint passes `format` through to `ingest_with_progress`. Currently it may not pass format. Ensure the ImportRequest or the stored ScanResult format is threaded through. The simplest approach: add `format: str = "coco"` field to `ImportRequest` model in `app/models/scan.py`, then pass it in the ingestion router's import endpoint to `ingest_with_progress()`.
+    - The `/ingestion/import` endpoint must pass `format` through to `ingest_with_progress`; currently it may not. Ensure the ImportRequest or the stored ScanResult format is threaded through. The simplest approach: add `format: str = "coco"` field to `ImportRequest` model in `app/models/scan.py`.
+    - The router calls `ingest_splits_with_progress()` (not `ingest_with_progress` directly), so the full threading chain is:
+      1. Add `format: str = "coco"` param to `ingest_splits_with_progress()` signature in `app/services/ingestion.py`
+      2. Inside `ingest_splits_with_progress()`, pass `format=format` to each `self.ingest_with_progress(...)` call in the loop (replacing the hardcoded `format="coco"` default)
+      3. In the router's import endpoint, pass `request.format` (or `scan_result.format`) to `ingest_splits_with_progress(format=...)`
 
     **4. Datasets router** (`app/routers/datasets.py`):
     - Ensure the `GET /datasets` and `GET /datasets/{id}` queries include `dataset_type` in SELECT. Currently using `SELECT *` or explicit columns -- add `dataset_type` to the result mapping into `DatasetResponse`.
diff --git a/.planning/phases/15-classification-ingestion-display/15-02-PLAN.md b/.planning/phases/15-classification-ingestion-display/15-02-PLAN.md
index 9e29888..3c9997c 100644
--- a/.planning/phases/15-classification-ingestion-display/15-02-PLAN.md
+++ b/.planning/phases/15-classification-ingestion-display/15-02-PLAN.md
@@ -152,6 +152,7 @@ Output: Updated grid, modal, stats, and scan results components with datasetType
   <action>
     **1. Sample modal** (`frontend/src/components/detail/sample-modal.tsx`):
     - Add `datasetType?: string` prop.
+    - Pass `datasetType` down to child components: `<AnnotationList datasetType={datasetType} />` in the render.
     - When `datasetType === "classification"`:
       - Show a prominent class label section above or instead of the annotation overlay. Display format:
         ```

From 5264e518b470a4c1ff0f87122982432300c5d068 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:15:12 -0500
Subject: [PATCH 11/38] feat(15-01): add ClassificationJSONLParser,
 dataset_type schema migration, and model updates

- Create ClassificationJSONLParser with sentinel bbox values (0.0) for classification annotations
- Add dataset_type column migration to DuckDB schema (default 'detection')
- Add dataset_type field to DatasetResponse Pydantic model
- Update BaseParser.build_image_batches signature with image_dir parameter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 app/ingestion/base_parser.py                 |   2 +-
 app/ingestion/classification_jsonl_parser.py | 220 +++++++++++++++++++
 app/models/dataset.py                        |   1 +
 app/repositories/duckdb_repo.py              |   5 +
 4 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 app/ingestion/classification_jsonl_parser.py

diff --git a/app/ingestion/base_parser.py b/app/ingestion/base_parser.py
index 8386ff1..0629637 100644
--- a/app/ingestion/base_parser.py
+++ b/app/ingestion/base_parser.py
@@ -35,7 +35,7 @@ def parse_categories(self, file_path: Path) -> dict[int, str]:
 
     @abstractmethod
     def build_image_batches(
-        self, file_path: Path, dataset_id: str, split: str | None = None
+        self, file_path: Path, dataset_id: str, split: str | None = None, image_dir: str = ""
     ) -> Iterator[pd.DataFrame]:
         """Yield DataFrames of image/sample records in batches.
 
diff --git a/app/ingestion/classification_jsonl_parser.py b/app/ingestion/classification_jsonl_parser.py
new file mode 100644
index 0000000..cc63ee3
--- /dev/null
+++ b/app/ingestion/classification_jsonl_parser.py
@@ -0,0 +1,220 @@
+"""Streaming Classification JSONL parser with DataFrame batch output.
+
+Parses JSONL files where each line maps an image filename to a
+classification label.  Supports flexible key names for both the
+filename and label fields.
+
+Classification annotations use sentinel bbox values (all zeros)
+since there is no spatial localisation.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from collections.abc import Iterator
+from pathlib import Path
+
+import pandas as pd
+
+from app.ingestion.base_parser import BaseParser
+
+logger = logging.getLogger(__name__)
+
+# Flexible key lookup order for the image filename field.
+_FILENAME_KEYS = ("filename", "file_name", "image", "path")
+
+# Flexible key lookup order for the label field.
+_LABEL_KEYS = ("label", "class", "category", "class_name")
+
+
+def _get_field(record: dict, keys: tuple[str, ...], default: str | None = None) -> str | None:
+    """Return the first matching key's value from *record*."""
+    for k in keys:
+        if k in record:
+            return record[k]
+    return default
+
+
+class ClassificationJSONLParser(BaseParser):
+    """Streaming parser for classification JSONL datasets.
+
+    Each line of the JSONL file is a JSON object with at minimum a
+    filename field and a label field.  The parser produces annotations
+    with sentinel bbox values (``0.0``) since classification has no
+    spatial localisation.
+    """
+
+    @property
+    def format_name(self) -> str:  # noqa: D401
+        """Format identifier."""
+        return "classification_jsonl"
+
+    # ------------------------------------------------------------------
+    # Category extraction
+    # ------------------------------------------------------------------
+
+    def parse_categories(self, file_path: Path) -> dict[int, str]:
+        """Single pass over JSONL to collect unique sorted labels."""
+        labels: set[str] = set()
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        record = json.loads(line)
+                    except json.JSONDecodeError:
+                        continue
+                    raw_label = _get_field(record, _LABEL_KEYS)
+                    if raw_label is None:
+                        labels.add("unknown")
+                    elif isinstance(raw_label, list):
+                        for lbl in raw_label:
+                            labels.add(str(lbl))
+                    else:
+                        labels.add(str(raw_label))
+        except OSError:
+            logger.warning("Could not read categories from %s", file_path)
+        return {i: name for i, name in enumerate(sorted(labels))}
+
+    # ------------------------------------------------------------------
+    # Image batch builder
+    # ------------------------------------------------------------------
+
+    def build_image_batches(
+        self,
+        file_path: Path,
+        dataset_id: str,
+        split: str | None = None,
+        image_dir: str = "",
+    ) -> Iterator[pd.DataFrame]:
+        """Yield DataFrames of image/sample records in batches.
+
+        Column order matches the ``samples`` DuckDB table:
+        ``id, dataset_id, file_name, width, height, thumbnail_path,
+        split, metadata, image_dir``.
+
+        Width and height default to ``0`` -- resolved during thumbnail
+        generation.
+        """
+        batch: list[dict] = []
+        with open(file_path, encoding="utf-8") as f:
+            idx = 0
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    record = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+
+                filename = _get_field(record, _FILENAME_KEYS)
+                if filename is None:
+                    logger.warning("Skipping line %d: no filename field", idx)
+                    idx += 1
+                    continue
+
+                sample_id = f"{split}_{idx}" if split else str(idx)
+                batch.append(
+                    {
+                        "id": sample_id,
+                        "dataset_id": dataset_id,
+                        "file_name": str(filename),
+                        "width": 0,
+                        "height": 0,
+                        "thumbnail_path": None,
+                        "split": split,
+                        "metadata": None,
+                        "image_dir": image_dir,
+                    }
+                )
+                idx += 1
+
+                if len(batch) >= self.batch_size:
+                    yield pd.DataFrame(batch)
+                    batch = []
+
+        if batch:
+            yield pd.DataFrame(batch)
+
+    # ------------------------------------------------------------------
+    # Annotation batch builder
+    # ------------------------------------------------------------------
+
+    def build_annotation_batches(
+        self,
+        file_path: Path,
+        dataset_id: str,
+        categories: dict[int, str],
+        split: str | None = None,
+    ) -> Iterator[pd.DataFrame]:
+        """Yield DataFrames of annotation records with sentinel bbox values.
+
+        Column order matches the ``annotations`` DuckDB table:
+        ``id, dataset_id, sample_id, category_name, bbox_x, bbox_y,
+        bbox_w, bbox_h, area, is_crowd, source, confidence, metadata``.
+
+        If a label is a list (multi-label), one annotation row is
+        emitted per label for the same sample.
+        """
+        batch: list[dict] = []
+        ann_counter = 0
+        with open(file_path, encoding="utf-8") as f:
+            idx = 0
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    record = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+
+                # Skip lines without a filename (same logic as build_image_batches)
+                filename = _get_field(record, _FILENAME_KEYS)
+                if filename is None:
+                    idx += 1
+                    continue
+
+                sample_id = f"{split}_{idx}" if split else str(idx)
+                raw_label = _get_field(record, _LABEL_KEYS)
+
+                if raw_label is None:
+                    labels = ["unknown"]
+                elif isinstance(raw_label, list):
+                    labels = [str(lbl) for lbl in raw_label]
+                else:
+                    labels = [str(raw_label)]
+
+                for lbl in labels:
+                    ann_id = f"{split}_ann_{ann_counter}" if split else f"ann_{ann_counter}"
+                    batch.append(
+                        {
+                            "id": ann_id,
+                            "dataset_id": dataset_id,
+                            "sample_id": sample_id,
+                            "category_name": lbl,
+                            "bbox_x": 0.0,
+                            "bbox_y": 0.0,
+                            "bbox_w": 0.0,
+                            "bbox_h": 0.0,
+                            "area": 0.0,
+                            "is_crowd": False,
+                            "source": "ground_truth",
+                            "confidence": None,
+                            "metadata": None,
+                        }
+                    )
+                    ann_counter += 1
+
+                idx += 1
+
+                if len(batch) >= self.batch_size:
+                    yield pd.DataFrame(batch)
+                    batch = []
+
+        if batch:
+            yield pd.DataFrame(batch)
diff --git a/app/models/dataset.py b/app/models/dataset.py
index 2a12bd2..622f6d9 100644
--- a/app/models/dataset.py
+++ b/app/models/dataset.py
@@ -27,6 +27,7 @@ class DatasetResponse(BaseModel):
     annotation_count: int
     category_count: int
     prediction_count: int = 0
+    dataset_type: str = "detection"
     created_at: datetime
 
 
diff --git a/app/repositories/duckdb_repo.py b/app/repositories/duckdb_repo.py
index 2b20029..ccceff4 100644
--- a/app/repositories/duckdb_repo.py
+++ b/app/repositories/duckdb_repo.py
@@ -126,6 +126,11 @@ def initialize_schema(self) -> None:
             )
         """)
 
+        # Phase 15: Add dataset_type column to datasets (classification vs detection)
+        self.connection.execute(
+            "ALTER TABLE datasets ADD COLUMN IF NOT EXISTS dataset_type VARCHAR DEFAULT 'detection'"
+        )
+
         # Phase 14: Per-annotation triage overrides
         self.connection.execute("""
             CREATE TABLE IF NOT EXISTS annotation_triage (

From 8af8a114f5d7d7f08774e16981c90bb2262aa0dd Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:18:44 -0500
Subject: [PATCH 12/38] feat(15-01): add scanner detection, parser dispatch,
 dataset_type API, and category update endpoint

- Add classification JSONL layout detectors (D: split dirs, E: flat) to FolderScanner
- Add GCS classification detection support
- Dispatch to ClassificationJSONLParser in IngestionService based on format
- Store dataset_type (classification/detection) on dataset INSERT
- Thread format through ImportRequest -> ingest_splits_with_progress -> ingest_with_progress
- Add dataset_type to GET /datasets and GET /datasets/{id} responses
- Add PATCH /annotations/{id}/category endpoint for classification label editing
- Make statistics gt_annotations classification-aware (distinct labeled images)
- Show .jsonl files in browse endpoint

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 app/models/annotation.py       |   6 +
 app/models/scan.py             |   3 +
 app/routers/annotations.py     |  28 +++-
 app/routers/datasets.py        |  10 +-
 app/routers/ingestion.py       |   5 +-
 app/routers/statistics.py      |  37 +++--
 app/services/folder_scanner.py | 245 +++++++++++++++++++++++++++++++--
 app/services/ingestion.py      |  21 ++-
 8 files changed, 316 insertions(+), 39 deletions(-)

diff --git a/app/models/annotation.py b/app/models/annotation.py
index 0929f86..be5369b 100644
--- a/app/models/annotation.py
+++ b/app/models/annotation.py
@@ -54,3 +54,9 @@ class AnnotationCreate(BaseModel):
     bbox_y: float
     bbox_w: float
     bbox_h: float
+
+
+class CategoryUpdateRequest(BaseModel):
+    """Request body for PATCH /annotations/{id}/category -- update classification label."""
+
+    category_name: str
diff --git a/app/models/scan.py b/app/models/scan.py
index 1e96822..6a11595 100644
--- a/app/models/scan.py
+++ b/app/models/scan.py
@@ -70,6 +70,9 @@ class ImportRequest(BaseModel):
     splits: list[ImportSplit]
     """Splits to import (user may have deselected some from scan results)."""
 
+    format: str = "coco"
+    """Annotation format: ``"coco"`` or ``"classification_jsonl"``."""
+
 
 class BrowseRequest(BaseModel):
     """Request body for the ``POST /ingestion/browse`` endpoint."""
diff --git a/app/routers/annotations.py b/app/routers/annotations.py
index 36e6967..b879fe7 100644
--- a/app/routers/annotations.py
+++ b/app/routers/annotations.py
@@ -14,7 +14,7 @@
 from fastapi import APIRouter, Depends, HTTPException
 
 from app.dependencies import get_cursor
-from app.models.annotation import AnnotationCreate, AnnotationUpdate
+from app.models.annotation import AnnotationCreate, AnnotationUpdate, CategoryUpdateRequest
 
 router = APIRouter(prefix="/annotations", tags=["annotations"])
 
@@ -32,6 +32,32 @@ def _update_dataset_counts(
     )
 
 
+@router.patch("/{annotation_id}/category")
+def update_annotation_category(
+    annotation_id: str,
+    body: CategoryUpdateRequest,
+    cursor: duckdb.DuckDBPyConnection = Depends(get_cursor),
+) -> dict:
+    """Update category_name for a ground_truth annotation (classification label editing)."""
+    row = cursor.execute(
+        "UPDATE annotations "
+        "SET category_name = ? "
+        "WHERE id = ? AND source = 'ground_truth' "
+        "RETURNING id, dataset_id",
+        [body.category_name, annotation_id],
+    ).fetchone()
+
+    if row is None:
+        raise HTTPException(
+            status_code=404,
+            detail="Annotation not found or not editable",
+        )
+
+    _update_dataset_counts(cursor, row[1])
+
+    return {"updated": annotation_id, "category_name": body.category_name}
+
+
 @router.put("/{annotation_id}")
 def update_annotation(
     annotation_id: str,
diff --git a/app/routers/datasets.py b/app/routers/datasets.py
index 6a34d53..9c9cba7 100644
--- a/app/routers/datasets.py
+++ b/app/routers/datasets.py
@@ -85,7 +85,7 @@ def list_datasets(db: DuckDBRepo = Depends(get_db)) -> DatasetListResponse:
         rows = cursor.execute(
             "SELECT id, name, format, source_path, image_dir, "
             "image_count, annotation_count, category_count, "
-            "prediction_count, created_at "
+            "prediction_count, dataset_type, created_at "
             "FROM datasets ORDER BY created_at DESC"
         ).fetchall()
     finally:
@@ -102,7 +102,8 @@ def list_datasets(db: DuckDBRepo = Depends(get_db)) -> DatasetListResponse:
             annotation_count=row[6],
             category_count=row[7],
             prediction_count=row[8],
-            created_at=row[9],
+            dataset_type=row[9] or "detection",
+            created_at=row[10],
         )
         for row in rows
     ]
@@ -119,7 +120,7 @@ def get_dataset(
         row = cursor.execute(
             "SELECT id, name, format, source_path, image_dir, "
             "image_count, annotation_count, category_count, "
-            "prediction_count, created_at "
+            "prediction_count, dataset_type, created_at "
             "FROM datasets WHERE id = ?",
             [dataset_id],
         ).fetchone()
@@ -139,7 +140,8 @@ def get_dataset(
         annotation_count=row[6],
         category_count=row[7],
         prediction_count=row[8],
-        created_at=row[9],
+        dataset_type=row[9] or "detection",
+        created_at=row[10],
     )
 
 
diff --git a/app/routers/ingestion.py b/app/routers/ingestion.py
index ced4b4e..797b07b 100644
--- a/app/routers/ingestion.py
+++ b/app/routers/ingestion.py
@@ -57,7 +57,7 @@ def scan_folder(
     if not result.splits:
         raise HTTPException(
             status_code=404,
-            detail="No COCO datasets detected in this directory",
+            detail="No importable datasets detected in this directory",
         )
 
     return result
@@ -80,6 +80,7 @@ def progress_stream():
         for progress in ingestion_service.ingest_splits_with_progress(
             splits=request.splits,
             dataset_name=request.dataset_name,
+            format=request.format,
         ):
             if progress.stage == "split_start":
                 # Extract split name from message for the SSE event.
@@ -108,7 +109,7 @@ def progress_stream():
 
 
 # Allowed file extensions shown in the browser (directories always shown).
-_BROWSE_EXTENSIONS = {".json"}
+_BROWSE_EXTENSIONS = {".json", ".jsonl"}
 
 
 @router.post("/browse", response_model=BrowseResponse)
diff --git a/app/routers/statistics.py b/app/routers/statistics.py
index 88a718d..0d3a978 100644
--- a/app/routers/statistics.py
+++ b/app/routers/statistics.py
@@ -41,12 +41,13 @@ def get_dataset_statistics(
     """
     cursor = db.connection.cursor()
     try:
-        # Verify dataset exists
+        # Verify dataset exists and get dataset_type
         row = cursor.execute(
-            "SELECT id FROM datasets WHERE id = ?", [dataset_id]
+            "SELECT id, dataset_type FROM datasets WHERE id = ?", [dataset_id]
         ).fetchone()
         if row is None:
             raise HTTPException(status_code=404, detail="Dataset not found")
+        dataset_type = row[1] or "detection"
 
         # Class distribution: GT and prediction counts per category
         if split is not None:
@@ -90,25 +91,31 @@ def get_dataset_statistics(
         ]
 
         # Summary counts
+        # For classification datasets, gt_annotations = distinct labeled images
+        if dataset_type == "classification":
+            gt_agg = "COUNT(DISTINCT a.sample_id)"
+        else:
+            gt_agg = "COUNT(*)"
+
         if split is not None:
             summary_row = cursor.execute(
-                "SELECT "
-                "(SELECT COUNT(*) FROM samples WHERE dataset_id = ? AND split = ?) as total_images, "
-                "(SELECT COUNT(*) FROM annotations a JOIN samples s ON a.sample_id = s.id AND a.dataset_id = s.dataset_id "
-                "WHERE a.dataset_id = ? AND a.source = 'ground_truth' AND s.split = ?) as gt_annotations, "
-                "(SELECT COUNT(*) FROM annotations a JOIN samples s ON a.sample_id = s.id AND a.dataset_id = s.dataset_id "
-                "WHERE a.dataset_id = ? AND a.source != 'ground_truth' AND s.split = ?) as pred_annotations, "
-                "(SELECT COUNT(DISTINCT a.category_name) FROM annotations a JOIN samples s ON a.sample_id = s.id AND a.dataset_id = s.dataset_id "
-                "WHERE a.dataset_id = ? AND s.split = ?) as total_categories",
+                f"SELECT "
+                f"(SELECT COUNT(*) FROM samples WHERE dataset_id = ? AND split = ?) as total_images, "
+                f"(SELECT {gt_agg} FROM annotations a JOIN samples s ON a.sample_id = s.id AND a.dataset_id = s.dataset_id "
+                f"WHERE a.dataset_id = ? AND a.source = 'ground_truth' AND s.split = ?) as gt_annotations, "
+                f"(SELECT COUNT(*) FROM annotations a JOIN samples s ON a.sample_id = s.id AND a.dataset_id = s.dataset_id "
+                f"WHERE a.dataset_id = ? AND a.source != 'ground_truth' AND s.split = ?) as pred_annotations, "
+                f"(SELECT COUNT(DISTINCT a.category_name) FROM annotations a JOIN samples s ON a.sample_id = s.id AND a.dataset_id = s.dataset_id "
+                f"WHERE a.dataset_id = ? AND s.split = ?) as total_categories",
                 [dataset_id, split, dataset_id, split, dataset_id, split, dataset_id, split],
             ).fetchone()
         else:
             summary_row = cursor.execute(
-                "SELECT "
-                "(SELECT COUNT(*) FROM samples WHERE dataset_id = ?) as total_images, "
-                "(SELECT COUNT(*) FROM annotations WHERE dataset_id = ? AND source = 'ground_truth') as gt_annotations, "
-                "(SELECT COUNT(*) FROM annotations WHERE dataset_id = ? AND source != 'ground_truth') as pred_annotations, "
-                "(SELECT COUNT(DISTINCT category_name) FROM annotations WHERE dataset_id = ?) as total_categories",
+                f"SELECT "
+                f"(SELECT COUNT(*) FROM samples WHERE dataset_id = ?) as total_images, "
+                f"(SELECT {gt_agg} FROM annotations a WHERE a.dataset_id = ? AND a.source = 'ground_truth') as gt_annotations, "
+                f"(SELECT COUNT(*) FROM annotations WHERE dataset_id = ? AND source != 'ground_truth') as pred_annotations, "
+                f"(SELECT COUNT(DISTINCT category_name) FROM annotations WHERE dataset_id = ?) as total_categories",
                 [dataset_id, dataset_id, dataset_id, dataset_id],
             ).fetchone()
 
diff --git a/app/services/folder_scanner.py b/app/services/folder_scanner.py
index c0095f3..ba3ecd5 100644
--- a/app/services/folder_scanner.py
+++ b/app/services/folder_scanner.py
@@ -1,17 +1,22 @@
-"""Heuristic-based COCO dataset folder scanner.
+"""Heuristic-based dataset folder scanner.
 
-Detects three common COCO layouts:
+Detects COCO and classification JSONL layouts:
 
+- **Layout D (Classification split dirs):** Split directories with JSONL + images.
+- **Layout E (Classification flat):** Flat JSONL at root with images.
 - **Layout B (Roboflow):** Split directories containing both annotation
   JSON and images co-located.
 - **Layout A (Standard COCO):** An ``annotations/`` directory with per-split
   JSON files paired with image directories.
 - **Layout C (Flat):** A single annotation file at root with an ``images/``
   directory or co-located images.
+
+Classification layouts are checked first since JSONL files are never COCO.
 """
 
 from __future__ import annotations
 
+import json
 import logging
 import os
 from pathlib import Path
@@ -67,8 +72,9 @@ def _stem(path: str) -> str:
 
 
 class FolderScanner:
-    """Walk a directory tree and detect importable COCO datasets.
+    """Walk a directory tree and detect importable datasets.
 
+    Supports COCO and classification JSONL formats.
     Supports both local and GCS paths via :class:`StorageBackend`.
 
     Usage::
@@ -109,19 +115,33 @@ def scan(self, root_path: str) -> ScanResult:
         warnings: list[str] = []
 
         if is_gcs:
-            splits = self._scan_gcs(resolved, warnings)
+            splits, fmt = self._scan_gcs(resolved, warnings)
         else:
-            # Use optimised local-only path (os.scandir is faster).
+            # Try classification JSONL layouts first (more specific).
+            splits = self._try_layout_d(Path(resolved), warnings)
+            if not splits:
+                splits = self._try_layout_e(Path(resolved), warnings)
+            if splits:
+                return ScanResult(
+                    root_path=resolved,
+                    dataset_name=_basename(resolved),
+                    format="classification_jsonl",
+                    splits=splits,
+                    warnings=warnings,
+                )
+
+            # Fall back to COCO layouts.
             splits = self._try_layout_b(Path(resolved), warnings)
             if not splits:
                 splits = self._try_layout_a(Path(resolved), warnings)
             if not splits:
                 splits = self._try_layout_c(Path(resolved), warnings)
+            fmt = "coco"
 
         return ScanResult(
             root_path=resolved,
             dataset_name=_basename(resolved),
-            format="coco",
+            format=fmt,
             splits=splits,
             warnings=warnings,
         )
@@ -132,11 +152,20 @@ def scan(self, root_path: str) -> ScanResult:
 
     def _scan_gcs(
         self, root: str, warnings: list[str]
-    ) -> list[DetectedSplit]:
-        """Detect COCO datasets in a GCS prefix using StorageBackend."""
+    ) -> tuple[list[DetectedSplit], str]:
+        """Detect datasets in a GCS prefix using StorageBackend.
+
+        Returns a tuple of (splits, format_string).
+        """
         entries = self.storage.list_dir_detail(root)
         dirs = [e for e in entries if e["type"] == "directory"]
         jsons = [e for e in entries if e["type"] == "file" and e["name"].lower().endswith(".json")]
+        jsonls = [e for e in entries if e["type"] == "file" and e["name"].lower().endswith(".jsonl")]
+
+        # Try classification JSONL layouts first (more specific).
+        cls_splits = self._scan_gcs_classification(root, entries, dirs, jsonls, warnings)
+        if cls_splits:
+            return cls_splits, "classification_jsonl"
 
         # Try Layout B: split directories
         split_dirs: dict[str, str] = {}
@@ -175,7 +204,7 @@ def _scan_gcs(
                     else:
                         warnings.append(f"Found JSON but not valid COCO: {jpath}")
             if splits:
-                return splits
+                return splits, "coco"
 
         # Try Layout A: annotations/ dir
         ann_dir = _join(root, "annotations")
@@ -234,7 +263,7 @@ def _scan_gcs(
                             annotation_file_size=coco_size,
                         ))
                 if splits:
-                    return splits
+                    return splits, "coco"
 
         # Try Layout C: flat JSON at root
         for jentry in sorted(jsons, key=lambda e: e["name"]):
@@ -254,16 +283,16 @@ def _scan_gcs(
                         image_dir=img_dir_path,
                         image_count=img_count,
                         annotation_file_size=jentry.get("size") or 0,
-                    )]
+                    )], "coco"
                 else:
                     warnings.append(
                         f"COCO annotation found ({jentry['name']}) but no images in {img_dir_path}"
                     )
-                    return []
+                    return [], "coco"
             else:
                 warnings.append(f"Found JSON but not valid COCO: {jpath}")
 
-        return []
+        return [], "coco"
 
     def _is_coco_annotation_remote(self, path: str) -> bool:
         """Check if a remote file looks like COCO annotation JSON."""
@@ -294,7 +323,195 @@ def _count_images_remote(self, path: str) -> int:
             return 0
 
     # ------------------------------------------------------------------
-    # Layout detectors (local-only, preserved for performance)
+    # GCS classification detection
+    # ------------------------------------------------------------------
+
+    def _scan_gcs_classification(
+        self,
+        root: str,
+        entries: list[dict],
+        dirs: list[dict],
+        jsonls: list[dict],
+        warnings: list[str],
+    ) -> list[DetectedSplit]:
+        """Detect classification JSONL datasets in a GCS prefix."""
+        # Try split directories with JSONL
+        split_dirs: dict[str, str] = {}
+        for d in dirs:
+            norm = d["name"].lower()
+            if norm in SPLIT_DIR_NAMES:
+                canonical = SPLIT_DIR_NAMES[norm]
+                if canonical not in split_dirs:
+                    split_dirs[canonical] = _join(root, d["name"])
+
+        if split_dirs:
+            splits: list[DetectedSplit] = []
+            for canonical_name, dir_path in sorted(split_dirs.items()):
+                sub_entries = self.storage.list_dir_detail(dir_path)
+                sub_jsonls = sorted(
+                    [e for e in sub_entries if e["type"] == "file" and e["name"].lower().endswith(".jsonl")],
+                    key=lambda e: e["name"],
+                )
+                for jentry in sub_jsonls:
+                    jpath = _join(dir_path, jentry["name"])
+                    if self._is_classification_jsonl_remote(jpath):
+                        img_count = sum(
+                            1 for e in sub_entries
+                            if e["type"] == "file"
+                            and os.path.splitext(e["name"])[1].lower() in IMAGE_EXTENSIONS
+                        )
+                        if img_count > 0:
+                            splits.append(DetectedSplit(
+                                name=canonical_name,
+                                annotation_path=jpath,
+                                image_dir=dir_path,
+                                image_count=img_count,
+                                annotation_file_size=jentry.get("size") or 0,
+                            ))
+                        break
+            if splits:
+                return splits
+
+        # Try flat JSONL at root
+        for jentry in sorted(jsonls, key=lambda e: e["name"]):
+            jpath = _join(root, jentry["name"])
+            if self._is_classification_jsonl_remote(jpath):
+                images_dir = _join(root, "images")
+                if self.storage.isdir(images_dir):
+                    img_count = self._count_images_remote(images_dir)
+                    img_dir_path = images_dir
+                else:
+                    img_count = self._count_images_remote(root)
+                    img_dir_path = root
+                if img_count > 0:
+                    return [DetectedSplit(
+                        name=_basename(root),
+                        annotation_path=jpath,
+                        image_dir=img_dir_path,
+                        image_count=img_count,
+                        annotation_file_size=jentry.get("size") or 0,
+                    )]
+
+        return []
+
+    def _is_classification_jsonl_remote(self, path: str) -> bool:
+        """Check if a remote JSONL file looks like classification data."""
+        try:
+            with self.storage.open(path, "r") as f:
+                lines_checked = 0
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    record = json.loads(line)
+                    has_filename = any(k in record for k in ("filename", "file_name", "image", "path"))
+                    has_label = any(k in record for k in ("label", "class", "category", "class_name"))
+                    has_bbox = "bbox" in record or "annotations" in record
+                    if not (has_filename and has_label and not has_bbox):
+                        return False
+                    lines_checked += 1
+                    if lines_checked >= 5:
+                        break
+                return lines_checked > 0
+        except Exception:
+            return False
+
+    # ------------------------------------------------------------------
+    # Classification layout detectors (local-only)
+    # ------------------------------------------------------------------
+
+    def _try_layout_d(
+        self, root: Path, warnings: list[str]
+    ) -> list[DetectedSplit]:
+        """Layout D: Split directories with co-located JSONL + images."""
+        split_dirs = self._detect_split_dirs(root)
+        if not split_dirs:
+            return []
+
+        splits: list[DetectedSplit] = []
+        for canonical_name, dir_path in sorted(split_dirs.items()):
+            jsonl_files = [
+                f for f in dir_path.iterdir()
+                if f.is_file() and f.suffix.lower() == ".jsonl"
+            ]
+            for jf in sorted(jsonl_files, key=lambda p: p.name):
+                if self._is_classification_jsonl(jf):
+                    img_count = self._count_images(dir_path)
+                    if img_count > 0:
+                        splits.append(
+                            DetectedSplit(
+                                name=canonical_name,
+                                annotation_path=str(jf),
+                                image_dir=str(dir_path),
+                                image_count=img_count,
+                                annotation_file_size=jf.stat().st_size,
+                            )
+                        )
+                    break
+
+        return splits
+
+    def _try_layout_e(
+        self, root: Path, warnings: list[str]
+    ) -> list[DetectedSplit]:
+        """Layout E: Flat JSONL at root with images/ subdir or co-located."""
+        jsonl_files = [
+            f for f in root.iterdir()
+            if f.is_file() and f.suffix.lower() == ".jsonl"
+        ]
+
+        for jf in sorted(jsonl_files, key=lambda p: p.name):
+            if self._is_classification_jsonl(jf):
+                images_dir = root / "images"
+                if images_dir.is_dir():
+                    img_count = self._count_images(images_dir)
+                    img_dir_path = images_dir
+                else:
+                    img_count = self._count_images(root)
+                    img_dir_path = root
+
+                if img_count > 0:
+                    return [
+                        DetectedSplit(
+                            name=root.name,
+                            annotation_path=str(jf),
+                            image_dir=str(img_dir_path),
+                            image_count=img_count,
+                            annotation_file_size=jf.stat().st_size,
+                        )
+                    ]
+
+        return []
+
+    @staticmethod
+    def _is_classification_jsonl(file_path: Path) -> bool:
+        """Return ``True`` if *file_path* looks like a classification JSONL file.
+
+        Reads the first 5 non-empty lines, parses each as JSON, and checks
+        for filename + label keys without bbox/annotations keys.
+        """
+        try:
+            lines_checked = 0
+            with open(file_path, encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    record = json.loads(line)
+                    has_filename = any(k in record for k in ("filename", "file_name", "image", "path"))
+                    has_label = any(k in record for k in ("label", "class", "category", "class_name"))
+                    has_bbox = "bbox" in record or "annotations" in record
+                    if not (has_filename and has_label and not has_bbox):
+                        return False
+                    lines_checked += 1
+                    if lines_checked >= 5:
+                        break
+            return lines_checked > 0
+        except Exception:
+            return False
+
+    # ------------------------------------------------------------------
+    # COCO layout detectors (local-only, preserved for performance)
     # ------------------------------------------------------------------
 
     def _try_layout_b(
diff --git a/app/services/ingestion.py b/app/services/ingestion.py
index 2aeb3e7..dff81cb 100644
--- a/app/services/ingestion.py
+++ b/app/services/ingestion.py
@@ -15,6 +15,7 @@
 
 import pandas as pd
 
+from app.ingestion.classification_jsonl_parser import ClassificationJSONLParser
 from app.ingestion.coco_parser import COCOParser
 from app.plugins.base_plugin import PluginContext
 from app.plugins.hooks import HOOK_INGEST_COMPLETE, HOOK_INGEST_START
@@ -109,7 +110,12 @@ def ingest_with_progress(
         self.plugins.trigger_hook(HOOK_INGEST_START, context=context)
 
         # -- Step 1: Parse categories ----------------------------------------
-        parser = COCOParser(batch_size=1000)
+        if format == "classification_jsonl":
+            parser = ClassificationJSONLParser(batch_size=1000)
+        elif format == "coco":
+            parser = COCOParser(batch_size=1000)
+        else:
+            raise ValueError(f"Unsupported format: {format}")
         categories = parser.parse_categories(Path(annotation_path))
 
         yield IngestionProgress(
@@ -165,9 +171,13 @@ def ingest_with_progress(
             ).fetchone()
 
             if existing is None:
+                dataset_type = "classification" if format == "classification_jsonl" else "detection"
                 cursor.execute(
-                    "INSERT INTO datasets VALUES "
-                    "(?, ?, ?, ?, ?, ?, ?, ?, 0, current_timestamp, NULL)",
+                    "INSERT INTO datasets "
+                    "(id, name, format, source_path, image_dir, image_count, "
+                    "annotation_count, category_count, prediction_count, "
+                    "created_at, metadata, dataset_type) "
+                    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, 0, current_timestamp, NULL, ?)",
                     [
                         dataset_id,
                         name,
@@ -177,6 +187,7 @@ def ingest_with_progress(
                         image_count,
                         ann_count,
                         len(categories),
+                        dataset_type,
                     ],
                 )
             else:
@@ -296,6 +307,7 @@ def ingest_splits_with_progress(
         self,
         splits: list,
         dataset_name: str,
+        format: str = "coco",
     ) -> Iterator[IngestionProgress]:
         """Ingest multiple splits as a single dataset, yielding per-split progress.
 
@@ -306,6 +318,8 @@ def ingest_splits_with_progress(
             ``image_dir`` attributes (e.g. :class:`ImportSplit` instances).
         dataset_name:
             Name for the combined dataset.
+        format:
+            Annotation format (``"coco"`` or ``"classification_jsonl"``).
         """
         dataset_id = str(uuid.uuid4())
 
@@ -321,6 +335,7 @@ def ingest_splits_with_progress(
                 annotation_path=split_config.annotation_path,
                 image_dir=split_config.image_dir,
                 dataset_name=dataset_name,
+                format=format,
                 split=split_config.name,
                 dataset_id=dataset_id,
             )

From 0f9324810b00ba247fc16373a7b42112473c949d Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:19:58 -0500
Subject: [PATCH 13/38] docs(15-01): complete classification ingestion backend
 plan

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/STATE.md                            |  12 +-
 .../15-01-SUMMARY.md                          | 131 ++++++++++++++++++
 2 files changed, 138 insertions(+), 5 deletions(-)
 create mode 100644 .planning/phases/15-classification-ingestion-display/15-01-SUMMARY.md

diff --git a/.planning/STATE.md b/.planning/STATE.md
index f777f60..e5f0774 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -10,11 +10,11 @@ See: .planning/PROJECT.md (updated 2026-02-18)
 ## Current Position
 
 Phase: 15 of 17 (Classification Ingestion & Display)
-Plan: 0 of TBD in current phase
-Status: Ready to plan
-Last activity: 2026-02-18 -- Roadmap created for v1.2 milestone
+Plan: 1 of 2 in current phase
+Status: Executing
+Last activity: 2026-02-18 -- Completed 15-01 (Classification Ingestion Backend)
 
-Progress: [##########################..] 88% (v1.0 + v1.1 complete, v1.2 starting)
+Progress: [##########################..] 90% (v1.0 + v1.1 complete, v1.2 phase 15 plan 1 done)
 
 ## Performance Metrics
 
@@ -39,6 +39,8 @@ Recent decisions affecting current work:
 - Separate classification evaluation function (~50 lines) vs modifying 560-line detection eval
 - Thread `datasetType` prop from page level, branch at component boundaries
 - Parser registry in IngestionService for format dispatch
+- Classification JSONL layouts checked before COCO (more specific first)
+- Classification gt_annotations = COUNT(DISTINCT sample_id) for labeled images
 
 ### Pending Todos
 
@@ -58,5 +60,5 @@ None.
 ## Session Continuity
 
 Last session: 2026-02-18
-Stopped at: Roadmap created for v1.2 milestone
+Stopped at: Completed 15-01-PLAN.md (Classification Ingestion Backend)
 Resume file: None
diff --git a/.planning/phases/15-classification-ingestion-display/15-01-SUMMARY.md b/.planning/phases/15-classification-ingestion-display/15-01-SUMMARY.md
new file mode 100644
index 0000000..e131558
--- /dev/null
+++ b/.planning/phases/15-classification-ingestion-display/15-01-SUMMARY.md
@@ -0,0 +1,131 @@
+---
+phase: 15-classification-ingestion-display
+plan: 01
+subsystem: api, ingestion, database
+tags: [classification, jsonl, parser, duckdb, fastapi, sentinel-bbox]
+
+requires:
+  - phase: 07-evaluation
+    provides: "statistics router and evaluation service"
+  - phase: 02-ingestion
+    provides: "BaseParser, COCOParser, FolderScanner, IngestionService"
+provides:
+  - ClassificationJSONLParser with sentinel bbox values
+  - FolderScanner classification JSONL detection (layouts D and E)
+  - dataset_type column and API field
+  - PATCH /annotations/{id}/category endpoint
+  - Format-based parser dispatch in IngestionService
+  - Classification-aware statistics
+affects: [15-02, 16-classification-evaluation, frontend-classification-display]
+
+tech-stack:
+  added: []
+  patterns: [sentinel-bbox-for-classification, format-based-parser-dispatch, layout-detection-priority]
+
+key-files:
+  created:
+    - app/ingestion/classification_jsonl_parser.py
+  modified:
+    - app/repositories/duckdb_repo.py
+    - app/models/dataset.py
+    - app/models/scan.py
+    - app/models/annotation.py
+    - app/ingestion/base_parser.py
+    - app/services/folder_scanner.py
+    - app/services/ingestion.py
+    - app/routers/ingestion.py
+    - app/routers/datasets.py
+    - app/routers/annotations.py
+    - app/routers/statistics.py
+
+key-decisions:
+  - "Classification JSONL layouts checked before COCO layouts since JSONL is never COCO"
+  - "Sentinel bbox values (all 0.0) for classification annotations to avoid nullable columns"
+  - "Format string threaded through ImportRequest -> ingest_splits_with_progress -> ingest_with_progress"
+  - "Classification gt_annotations stat uses COUNT(DISTINCT sample_id) instead of COUNT(*)"
+
+patterns-established:
+  - "Format dispatch: IngestionService selects parser by format string, extensible for future formats"
+  - "Layout priority: classification-specific layouts tested before generic COCO layouts"
+
+duration: 5min
+completed: 2026-02-18
+---
+
+# Phase 15 Plan 01: Classification Ingestion Backend Summary
+
+**ClassificationJSONLParser with sentinel bbox values, FolderScanner auto-detection of JSONL layouts, format-based parser dispatch, and category update endpoint**
+
+## Performance
+
+- **Duration:** 5 min
+- **Started:** 2026-02-19T02:13:50Z
+- **Completed:** 2026-02-19T02:18:51Z
+- **Tasks:** 2
+- **Files modified:** 12
+
+## Accomplishments
+- ClassificationJSONLParser that produces annotations with sentinel bbox values (0.0) and supports multi-label via array labels
+- FolderScanner detects classification JSONL datasets in split-directory (Layout D) and flat (Layout E) layouts, for both local and GCS paths
+- Format-based parser dispatch in IngestionService with dataset_type stored on dataset record
+- PATCH /annotations/{id}/category endpoint for classification label editing
+- Classification-aware statistics (gt_annotations = distinct labeled images)
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Schema migration, Pydantic models, and ClassificationJSONLParser** - `5264e51` (feat)
+2. **Task 2: FolderScanner detection, IngestionService dispatch, and API endpoints** - `8af8a11` (feat)
+
+## Files Created/Modified
+- `app/ingestion/classification_jsonl_parser.py` - New parser extending BaseParser with sentinel bbox annotations
+- `app/repositories/duckdb_repo.py` - dataset_type column migration
+- `app/models/dataset.py` - dataset_type field on DatasetResponse
+- `app/models/scan.py` - format field on ImportRequest
+- `app/models/annotation.py` - CategoryUpdateRequest model
+- `app/ingestion/base_parser.py` - image_dir parameter on build_image_batches ABC
+- `app/services/folder_scanner.py` - Layout D/E detectors, GCS classification detection, _is_classification_jsonl
+- `app/services/ingestion.py` - Format dispatch, dataset_type on INSERT, format threading
+- `app/routers/ingestion.py` - Format passthrough, .jsonl in browse, updated error message
+- `app/routers/datasets.py` - dataset_type in SELECT and DatasetResponse mapping
+- `app/routers/annotations.py` - PATCH /annotations/{id}/category endpoint
+- `app/routers/statistics.py` - Classification-aware gt_annotations aggregation
+
+## Decisions Made
+- Classification JSONL layouts checked before COCO layouts since JSONL files are never COCO (more specific detection first)
+- Used sentinel bbox values (all 0.0) for classification annotations, matching the project decision to avoid nullable columns
+- gt_annotations stat for classification uses COUNT(DISTINCT sample_id) to represent "labeled images" rather than raw annotation count
+- Added .jsonl to browse endpoint extensions for file navigation
+
+## Deviations from Plan
+
+### Auto-fixed Issues
+
+**1. [Rule 2 - Missing Critical] Added .jsonl to browse endpoint file extensions**
+- **Found during:** Task 2 (API endpoints)
+- **Issue:** Browse endpoint only showed .json files, so users couldn't see .jsonl files when navigating
+- **Fix:** Added ".jsonl" to _BROWSE_EXTENSIONS set
+- **Files modified:** app/routers/ingestion.py
+- **Verification:** Import and app start verified
+- **Committed in:** 8af8a11 (Task 2 commit)
+
+---
+
+**Total deviations:** 1 auto-fixed (1 missing critical)
+**Impact on plan:** Minor addition necessary for classification JSONL usability. No scope creep.
+
+## Issues Encountered
+None
+
+## User Setup Required
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Backend fully supports classification dataset ingestion, ready for frontend display work in Plan 02
+- Parser dispatch is extensible for future formats (YOLO, VOC, etc.)
+- dataset_type field available for frontend to branch display logic
+
+---
+*Phase: 15-classification-ingestion-display*
+*Completed: 2026-02-18*

From b96ce5efc705c3c35fad2bbcf50a106333dec3c7 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:22:49 -0500
Subject: [PATCH 14/38] feat(15-02): add dataset_type threading, grid class
 badges, and scan format badge

- Add dataset_type field to Dataset TypeScript interface
- Thread datasetType prop from dataset page to ImageGrid, SampleModal, StatsDashboard
- Add ClassBadge component to GridCell for classification datasets (replaces bbox overlay)
- Show "Classification JSONL" format badge in scan results for classification_jsonl format
- Add datasetType prop signatures to SampleModal and StatsDashboard for Task 2

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/app/datasets/[datasetId]/page.tsx     |  6 ++--
 .../src/components/detail/sample-modal.tsx    |  4 ++-
 frontend/src/components/grid/grid-cell.tsx    | 34 ++++++++++++++-----
 frontend/src/components/grid/image-grid.tsx   |  4 ++-
 .../src/components/ingest/scan-results.tsx    |  4 ++-
 .../src/components/stats/stats-dashboard.tsx  |  1 +
 frontend/src/types/dataset.ts                 |  1 +
 7 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/frontend/src/app/datasets/[datasetId]/page.tsx b/frontend/src/app/datasets/[datasetId]/page.tsx
index 2a149ad..d764cc4 100644
--- a/frontend/src/app/datasets/[datasetId]/page.tsx
+++ b/frontend/src/app/datasets/[datasetId]/page.tsx
@@ -106,17 +106,17 @@ export default function DatasetPage({
         <div className="flex flex-1 overflow-hidden">
           <FilterSidebar datasetId={datasetId} />
           <div className="flex-1 overflow-hidden">
-            <ImageGrid datasetId={datasetId} />
+            <ImageGrid datasetId={datasetId} datasetType={dataset?.dataset_type} />
           </div>
         </div>
       )}
       {activeTab === "statistics" && (
-        <StatsDashboard datasetId={datasetId} />
+        <StatsDashboard datasetId={datasetId} datasetType={dataset?.dataset_type} />
       )}
       {activeTab === "embeddings" && (
         <EmbeddingPanel datasetId={datasetId} />
       )}
-      <SampleModal datasetId={datasetId} samples={allSamples} />
+      <SampleModal datasetId={datasetId} samples={allSamples} datasetType={dataset?.dataset_type} />
       <PredictionImportDialog
         datasetId={datasetId}
         open={showPredImport}
diff --git a/frontend/src/components/detail/sample-modal.tsx b/frontend/src/components/detail/sample-modal.tsx
index 2e7516b..e75e91f 100644
--- a/frontend/src/components/detail/sample-modal.tsx
+++ b/frontend/src/components/detail/sample-modal.tsx
@@ -64,6 +64,8 @@ interface SampleModalProps {
   datasetId: string;
   /** All loaded samples (from the grid's query cache). */
   samples: Sample[];
+  /** Dataset type -- "classification" shows class labels instead of bbox overlays. */
+  datasetType?: string;
 }
 
 /**
@@ -74,7 +76,7 @@ interface SampleModalProps {
  * Finds the sample from the provided samples array (already in memory
  * from the grid's infinite query cache).
  */
-export function SampleModal({ datasetId, samples }: SampleModalProps) {
+export function SampleModal({ datasetId, samples, datasetType }: SampleModalProps) {
   const dialogRef = useRef<HTMLDialogElement>(null);
 
   const selectedSampleId = useUIStore((s) => s.selectedSampleId);
diff --git a/frontend/src/components/grid/grid-cell.tsx b/frontend/src/components/grid/grid-cell.tsx
index 7ce9607..7120732 100644
--- a/frontend/src/components/grid/grid-cell.tsx
+++ b/frontend/src/components/grid/grid-cell.tsx
@@ -32,6 +32,18 @@ function tagStyle(tag: string): string {
   }
 }
 
+/** Small label badge for classification datasets (replaces bbox overlay). */
+function ClassBadge({ label }: { label?: string }) {
+  if (!label) return null;
+  return (
+    <div className="absolute bottom-1 left-1 z-10">
+      <span className="rounded bg-black/60 px-1.5 py-0.5 text-[10px] font-semibold text-white">
+        {label}
+      </span>
+    </div>
+  );
+}
+
 interface GridCellProps {
   sample: Sample;
   datasetId: string;
@@ -39,9 +51,11 @@ interface GridCellProps {
   annotations: Annotation[];
   /** Whether this cell has keyboard focus (blue ring indicator). */
   isFocused?: boolean;
+  /** Dataset type -- "classification" shows class badges instead of bbox overlays. */
+  datasetType?: string;
 }
 
-export function GridCell({ sample, datasetId, annotations, isFocused }: GridCellProps) {
+export function GridCell({ sample, datasetId, annotations, isFocused, datasetType }: GridCellProps) {
   const openDetailModal = useUIStore((s) => s.openDetailModal);
   const isSelecting = useFilterStore((s) => s.isSelecting);
   const selectedSampleIds = useFilterStore((s) => s.selectedSampleIds);
@@ -83,13 +97,17 @@ export function GridCell({ sample, datasetId, annotations, isFocused }: GridCell
           loading="lazy"
           decoding="async"
         />
-        {annotations.length > 0 && (
-          <AnnotationOverlay
-            annotations={annotations}
-            imageWidth={sample.width}
-            imageHeight={sample.height}
-            aspectMode="slice"
-          />
+        {datasetType === "classification" ? (
+          <ClassBadge label={annotations.find((a) => a.source === "ground_truth")?.category_name} />
+        ) : (
+          annotations.length > 0 && (
+            <AnnotationOverlay
+              annotations={annotations}
+              imageWidth={sample.width}
+              imageHeight={sample.height}
+              aspectMode="slice"
+            />
+          )
         )}
         {/* Selection checkbox overlay */}
         {isSelecting && (
diff --git a/frontend/src/components/grid/image-grid.tsx b/frontend/src/components/grid/image-grid.tsx
index 3d05696..26138d4 100644
--- a/frontend/src/components/grid/image-grid.tsx
+++ b/frontend/src/components/grid/image-grid.tsx
@@ -25,9 +25,10 @@ import { GridCell } from "./grid-cell";
 
 interface ImageGridProps {
   datasetId: string;
+  datasetType?: string;
 }
 
-export function ImageGrid({ datasetId }: ImageGridProps) {
+export function ImageGrid({ datasetId, datasetType }: ImageGridProps) {
   const parentRef = useRef<HTMLDivElement>(null);
   const debounceRef = useRef<ReturnType<typeof setTimeout> | null>(null);
 
@@ -215,6 +216,7 @@ export function ImageGrid({ datasetId }: ImageGridProps) {
                     datasetId={datasetId}
                     annotations={annotationMap?.[sample.id] ?? []}
                     isFocused={focusedGridIndex === sampleIdx}
+                    datasetType={datasetType}
                   />
                 );
               })}
diff --git a/frontend/src/components/ingest/scan-results.tsx b/frontend/src/components/ingest/scan-results.tsx
index 1b6686a..09aaf76 100644
--- a/frontend/src/components/ingest/scan-results.tsx
+++ b/frontend/src/components/ingest/scan-results.tsx
@@ -68,7 +68,9 @@ export default function ScanResults() {
             />
           </div>
           <span className="ml-4 mt-6 inline-flex items-center rounded-full bg-blue-100 px-2.5 py-0.5 text-xs font-semibold text-blue-800 dark:bg-blue-900 dark:text-blue-200">
-            {scanResult.format.toUpperCase()}
+            {scanResult.format === "classification_jsonl"
+              ? "Classification JSONL"
+              : scanResult.format.toUpperCase()}
           </span>
         </div>
         <p className="mt-2 text-xs text-zinc-400 dark:text-zinc-500">
diff --git a/frontend/src/components/stats/stats-dashboard.tsx b/frontend/src/components/stats/stats-dashboard.tsx
index 0f7ffb8..d098927 100644
--- a/frontend/src/components/stats/stats-dashboard.tsx
+++ b/frontend/src/components/stats/stats-dashboard.tsx
@@ -28,6 +28,7 @@ import { IntelligencePanel } from "@/components/stats/intelligence-panel";
 
 interface StatsDashboardProps {
   datasetId: string;
+  datasetType?: string;
 }
 
 type SubTab = "overview" | "evaluation" | "error_analysis" | "worst_images" | "near_duplicates" | "intelligence";
diff --git a/frontend/src/types/dataset.ts b/frontend/src/types/dataset.ts
index 02ebd74..dcc926d 100644
--- a/frontend/src/types/dataset.ts
+++ b/frontend/src/types/dataset.ts
@@ -13,6 +13,7 @@ export interface Dataset {
   annotation_count: number;
   category_count: number;
   prediction_count: number;
+  dataset_type: string;
   created_at: string;
 }
 

From e7ad77686af924a552c99300e946cf65a59139d4 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:25:40 -0500
Subject: [PATCH 15/38] feat(15-02): classification-aware modal, annotation
 list, and statistics

- Add class label dropdown editor in SampleModal for classification datasets
- Wire PATCH /annotations/{id}/category mutation for category updates
- Hide bbox overlay, annotation editor, and triage toolbar for classification
- Show predicted class with confidence in classification modal
- Hide bbox/area columns in AnnotationList for classification datasets
- Hide Evaluation, Error Analysis, Worst Images, Intelligence tabs for classification
- Swap summary card labels: "Labeled Images" and "Classes" for classification

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/components/detail/annotation-list.tsx |  44 ++++--
 .../src/components/detail/sample-modal.tsx    | 138 +++++++++++++-----
 .../components/stats/annotation-summary.tsx   |  13 +-
 .../src/components/stats/stats-dashboard.tsx  | 101 +++++++------
 4 files changed, 197 insertions(+), 99 deletions(-)

diff --git a/frontend/src/components/detail/annotation-list.tsx b/frontend/src/components/detail/annotation-list.tsx
index c0fbc0e..fef728a 100644
--- a/frontend/src/components/detail/annotation-list.tsx
+++ b/frontend/src/components/detail/annotation-list.tsx
@@ -14,6 +14,8 @@ import type { Annotation } from "@/types/annotation";
 interface AnnotationListProps {
   /** Annotations to display in the table. */
   annotations: Annotation[];
+  /** Dataset type -- "classification" hides bbox/area columns. */
+  datasetType?: string;
   /** Optional callback to delete a ground_truth annotation. Shows delete buttons when provided. */
   onDelete?: (annotationId: string) => void;
 }
@@ -24,7 +26,9 @@ interface AnnotationListProps {
  * Each row shows: colored class dot, class name, bbox coordinates,
  * area, source, and confidence (if available).
  */
-export function AnnotationList({ annotations, onDelete }: AnnotationListProps) {
+export function AnnotationList({ annotations, datasetType, onDelete }: AnnotationListProps) {
+  const isClassification = datasetType === "classification";
+
   return (
     <div className="flex flex-col gap-2">
       <h3 className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
@@ -38,12 +42,16 @@ export function AnnotationList({ annotations, onDelete }: AnnotationListProps) {
               <th className="px-2 py-1.5 font-medium text-zinc-600 dark:text-zinc-400">
                 Class
               </th>
-              <th className="px-2 py-1.5 font-medium text-zinc-600 dark:text-zinc-400">
-                Bounding Box
-              </th>
-              <th className="px-2 py-1.5 font-medium text-zinc-600 dark:text-zinc-400 text-right">
-                Area
-              </th>
+              {!isClassification && (
+                <th className="px-2 py-1.5 font-medium text-zinc-600 dark:text-zinc-400">
+                  Bounding Box
+                </th>
+              )}
+              {!isClassification && (
+                <th className="px-2 py-1.5 font-medium text-zinc-600 dark:text-zinc-400 text-right">
+                  Area
+                </th>
+              )}
               <th className="px-2 py-1.5 font-medium text-zinc-600 dark:text-zinc-400">
                 Source
               </th>
@@ -80,15 +88,19 @@ export function AnnotationList({ annotations, onDelete }: AnnotationListProps) {
                       </span>
                     </span>
                   </td>
-                  <td className="px-2 py-1.5 whitespace-nowrap font-mono text-zinc-600 dark:text-zinc-400">
-                    {ann.bbox_x.toFixed(1)}, {ann.bbox_y.toFixed(1)},{" "}
-                    {ann.bbox_w.toFixed(1)} x {ann.bbox_h.toFixed(1)}
-                  </td>
-                  <td className="px-2 py-1.5 text-right tabular-nums text-zinc-600 dark:text-zinc-400">
-                    {ann.area.toLocaleString(undefined, {
-                      maximumFractionDigits: 0,
-                    })}
-                  </td>
+                  {!isClassification && (
+                    <td className="px-2 py-1.5 whitespace-nowrap font-mono text-zinc-600 dark:text-zinc-400">
+                      {ann.bbox_x.toFixed(1)}, {ann.bbox_y.toFixed(1)},{" "}
+                      {ann.bbox_w.toFixed(1)} x {ann.bbox_h.toFixed(1)}
+                    </td>
+                  )}
+                  {!isClassification && (
+                    <td className="px-2 py-1.5 text-right tabular-nums text-zinc-600 dark:text-zinc-400">
+                      {ann.area.toLocaleString(undefined, {
+                        maximumFractionDigits: 0,
+                      })}
+                    </td>
+                  )}
                   <td className="px-2 py-1.5 text-zinc-600 dark:text-zinc-400">
                     {ann.source}
                   </td>
diff --git a/frontend/src/components/detail/sample-modal.tsx b/frontend/src/components/detail/sample-modal.tsx
index e75e91f..8d3b172 100644
--- a/frontend/src/components/detail/sample-modal.tsx
+++ b/frontend/src/components/detail/sample-modal.tsx
@@ -17,7 +17,8 @@ import { useEffect, useRef, useState, useCallback, type MouseEvent } from "react
 import dynamic from "next/dynamic";
 import { useHotkeys } from "react-hotkeys-hook";
 
-import { fullImageUrl } from "@/lib/api";
+import { useMutation, useQueryClient } from "@tanstack/react-query";
+import { apiPatch, fullImageUrl } from "@/lib/api";
 import {
   useAnnotations,
   useUpdateAnnotation,
@@ -101,11 +102,25 @@ export function SampleModal({ datasetId, samples, datasetType }: SampleModalProp
   // Fetch annotations for the selected sample via per-sample endpoint
   const { data: annotations } = useAnnotations(datasetId, selectedSampleId);
 
+  const isClassification = datasetType === "classification";
+
   // Mutation hooks for annotation CRUD
   const updateMutation = useUpdateAnnotation(datasetId, selectedSampleId ?? "");
   const createMutation = useCreateAnnotation(datasetId, selectedSampleId ?? "");
   const deleteMutation = useDeleteAnnotation(datasetId, selectedSampleId ?? "");
 
+  // Category patch mutation for classification label editing
+  const qc = useQueryClient();
+  const patchCategory = useMutation({
+    mutationFn: ({ annotationId, category_name }: { annotationId: string; category_name: string }) =>
+      apiPatch<{ updated: string }>(`/annotations/${annotationId}/category`, { category_name }),
+    onSuccess: () => {
+      qc.invalidateQueries({ queryKey: ["annotations", selectedSampleId] });
+      qc.invalidateQueries({ queryKey: ["annotations-batch"] });
+      qc.invalidateQueries({ queryKey: ["filter-facets", datasetId] });
+    },
+  });
+
   // Get categories from filter facets for the class picker
   const { data: facets } = useFilterFacets(datasetId);
   const categories = facets?.categories?.map((c) => c.name) ?? [];
@@ -339,7 +354,15 @@ export function SampleModal({ datasetId, samples, datasetType }: SampleModalProp
 
           {/* Full-resolution image with annotation overlays or Konva editor */}
           <div className="relative bg-zinc-100 dark:bg-zinc-800">
-            {isEditMode ? (
+            {isClassification ? (
+              /* Classification: plain image, no overlays or editor */
+              <img
+                src={fullImageUrl(datasetId, sample.id)}
+                alt={sample.file_name}
+                className="h-auto w-full"
+                decoding="async"
+              />
+            ) : isEditMode ? (
               <AnnotationEditor
                 imageUrl={fullImageUrl(datasetId, sample.id)}
                 annotations={gtAnnotations}
@@ -397,45 +420,89 @@ export function SampleModal({ datasetId, samples, datasetType }: SampleModalProp
             )}
           </div>
 
-          {/* Edit toolbar */}
-          <div className="flex items-center gap-2 border-b border-zinc-200 px-5 py-2 dark:border-zinc-700">
-            <button
-              onClick={toggleEditMode}
-              className={`rounded px-3 py-1.5 text-sm font-medium transition-colors ${
-                isEditMode
-                  ? "bg-blue-600 text-white hover:bg-blue-700"
-                  : "bg-zinc-100 text-zinc-700 hover:bg-zinc-200 dark:bg-zinc-800 dark:text-zinc-300 dark:hover:bg-zinc-700"
-              }`}
-            >
-              {isEditMode ? "Done" : "Edit Annotations"}
-            </button>
-            {isEditMode && (
+          {/* Classification class label section */}
+          {isClassification && (
+            <div className="flex items-center gap-4 border-b border-zinc-200 px-5 py-3 dark:border-zinc-700">
+              <div className="flex items-center gap-2">
+                <span className="text-sm font-medium text-zinc-700 dark:text-zinc-300">Class:</span>
+                {gtAnnotations.length > 0 ? (
+                  <select
+                    value={gtAnnotations[0].category_name}
+                    onChange={(e) =>
+                      patchCategory.mutate({
+                        annotationId: gtAnnotations[0].id,
+                        category_name: e.target.value,
+                      })
+                    }
+                    className="rounded border border-zinc-300 bg-white px-2 py-1 text-sm text-zinc-900 focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500 dark:border-zinc-600 dark:bg-zinc-800 dark:text-zinc-100"
+                  >
+                    {categories.map((cat) => (
+                      <option key={cat} value={cat}>
+                        {cat}
+                      </option>
+                    ))}
+                  </select>
+                ) : (
+                  <span className="text-sm text-zinc-400">No label</span>
+                )}
+              </div>
+              {predAnnotations.length > 0 && (
+                <div className="flex items-center gap-2">
+                  <span className="text-sm font-medium text-zinc-700 dark:text-zinc-300">Predicted:</span>
+                  <span className="text-sm text-zinc-900 dark:text-zinc-100">
+                    {predAnnotations[0].category_name}
+                  </span>
+                  {predAnnotations[0].confidence !== null && (
+                    <span className="text-xs text-zinc-400">
+                      ({(predAnnotations[0].confidence * 100).toFixed(1)}%)
+                    </span>
+                  )}
+                </div>
+              )}
+            </div>
+          )}
+
+          {/* Edit toolbar (detection only) */}
+          {!isClassification && (
+            <div className="flex items-center gap-2 border-b border-zinc-200 px-5 py-2 dark:border-zinc-700">
               <button
-                onClick={toggleDrawMode}
+                onClick={toggleEditMode}
                 className={`rounded px-3 py-1.5 text-sm font-medium transition-colors ${
-                  isDrawMode
-                    ? "bg-green-600 text-white hover:bg-green-700"
+                  isEditMode
+                    ? "bg-blue-600 text-white hover:bg-blue-700"
                     : "bg-zinc-100 text-zinc-700 hover:bg-zinc-200 dark:bg-zinc-800 dark:text-zinc-300 dark:hover:bg-zinc-700"
                 }`}
               >
-                {isDrawMode ? "Cancel Draw" : "Draw New Box"}
+                {isEditMode ? "Done" : "Edit Annotations"}
               </button>
-            )}
-            {/* Triage filter buttons (always visible, not gated by edit mode) */}
-            <TriageFilterButtons
-              activeFilter={triageFilter}
-              onFilterChange={setTriageFilter}
-            />
-
-            {/* Spacer + edit hint pushed right */}
-            {isEditMode && (
-              <span className="ml-auto text-xs text-zinc-400">
-                {isDrawMode
-                  ? "Click and drag to draw a new box"
-                  : "Click a box to select, drag to move, handles to resize"}
-              </span>
-            )}
-          </div>
+              {isEditMode && (
+                <button
+                  onClick={toggleDrawMode}
+                  className={`rounded px-3 py-1.5 text-sm font-medium transition-colors ${
+                    isDrawMode
+                      ? "bg-green-600 text-white hover:bg-green-700"
+                      : "bg-zinc-100 text-zinc-700 hover:bg-zinc-200 dark:bg-zinc-800 dark:text-zinc-300 dark:hover:bg-zinc-700"
+                  }`}
+                >
+                  {isDrawMode ? "Cancel Draw" : "Draw New Box"}
+                </button>
+              )}
+              {/* Triage filter buttons (always visible, not gated by edit mode) */}
+              <TriageFilterButtons
+                activeFilter={triageFilter}
+                onFilterChange={setTriageFilter}
+              />
+
+              {/* Spacer + edit hint pushed right */}
+              {isEditMode && (
+                <span className="ml-auto text-xs text-zinc-400">
+                  {isDrawMode
+                    ? "Click and drag to draw a new box"
+                    : "Click a box to select, drag to move, handles to resize"}
+                </span>
+              )}
+            </div>
+          )}
 
           {/* Metadata and annotations section */}
           <div className="grid gap-6 p-5 md:grid-cols-[1fr_2fr]">
@@ -513,6 +580,7 @@ export function SampleModal({ datasetId, samples, datasetType }: SampleModalProp
                 annotations.length > 0 ? (
                   <AnnotationList
                     annotations={annotations}
+                    datasetType={datasetType}
                     onDelete={
                       isEditMode
                         ? (id) => deleteMutation.mutate(id)
diff --git a/frontend/src/components/stats/annotation-summary.tsx b/frontend/src/components/stats/annotation-summary.tsx
index c5d88e4..b037dc0 100644
--- a/frontend/src/components/stats/annotation-summary.tsx
+++ b/frontend/src/components/stats/annotation-summary.tsx
@@ -11,16 +11,25 @@ import type { SummaryStats } from "@/types/statistics";
 
 interface AnnotationSummaryProps {
   summary: SummaryStats;
+  datasetType?: string;
 }
 
-const CARDS: { key: keyof SummaryStats; label: string }[] = [
+const DETECTION_CARDS: { key: keyof SummaryStats; label: string }[] = [
   { key: "total_images", label: "Total Images" },
   { key: "gt_annotations", label: "GT Annotations" },
   { key: "pred_annotations", label: "Predictions" },
   { key: "total_categories", label: "Categories" },
 ];
 
-export function AnnotationSummary({ summary }: AnnotationSummaryProps) {
+const CLASSIFICATION_CARDS: { key: keyof SummaryStats; label: string }[] = [
+  { key: "total_images", label: "Total Images" },
+  { key: "gt_annotations", label: "Labeled Images" },
+  { key: "pred_annotations", label: "Predictions" },
+  { key: "total_categories", label: "Classes" },
+];
+
+export function AnnotationSummary({ summary, datasetType }: AnnotationSummaryProps) {
+  const CARDS = datasetType === "classification" ? CLASSIFICATION_CARDS : DETECTION_CARDS;
   return (
     <div className="grid grid-cols-2 lg:grid-cols-4 gap-4">
       {CARDS.map((card) => (
diff --git a/frontend/src/components/stats/stats-dashboard.tsx b/frontend/src/components/stats/stats-dashboard.tsx
index d098927..b2fbf0c 100644
--- a/frontend/src/components/stats/stats-dashboard.tsx
+++ b/frontend/src/components/stats/stats-dashboard.tsx
@@ -50,7 +50,8 @@ function SkeletonChart({ height }: { height: string }) {
   );
 }
 
-export function StatsDashboard({ datasetId }: StatsDashboardProps) {
+export function StatsDashboard({ datasetId, datasetType }: StatsDashboardProps) {
+  const isClassification = datasetType === "classification";
   const split = useSplit();
   const setSplit = useFilterStore((s) => s.setSplit);
   const { data: facets } = useFilterFacets(datasetId);
@@ -175,39 +176,45 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
         >
           Overview
         </button>
-        <button
-          onClick={() => setActiveTab("evaluation")}
-          disabled={!hasPredictions}
-          className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
-            activeTab === "evaluation"
-              ? "border-blue-500 text-blue-600 dark:text-blue-400"
-              : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
-          } disabled:opacity-40 disabled:cursor-not-allowed`}
-        >
-          Evaluation
-        </button>
-        <button
-          onClick={() => setActiveTab("error_analysis")}
-          disabled={!hasPredictions}
-          className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
-            activeTab === "error_analysis"
-              ? "border-blue-500 text-blue-600 dark:text-blue-400"
-              : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
-          } disabled:opacity-40 disabled:cursor-not-allowed`}
-        >
-          Error Analysis
-        </button>
-        <button
-          onClick={() => setActiveTab("worst_images")}
-          disabled={!hasPredictions}
-          className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
-            activeTab === "worst_images"
-              ? "border-blue-500 text-blue-600 dark:text-blue-400"
-              : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
-          } disabled:opacity-40 disabled:cursor-not-allowed`}
-        >
-          Worst Images
-        </button>
+        {!isClassification && (
+          <button
+            onClick={() => setActiveTab("evaluation")}
+            disabled={!hasPredictions}
+            className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
+              activeTab === "evaluation"
+                ? "border-blue-500 text-blue-600 dark:text-blue-400"
+                : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
+            } disabled:opacity-40 disabled:cursor-not-allowed`}
+          >
+            Evaluation
+          </button>
+        )}
+        {!isClassification && (
+          <button
+            onClick={() => setActiveTab("error_analysis")}
+            disabled={!hasPredictions}
+            className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
+              activeTab === "error_analysis"
+                ? "border-blue-500 text-blue-600 dark:text-blue-400"
+                : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
+            } disabled:opacity-40 disabled:cursor-not-allowed`}
+          >
+            Error Analysis
+          </button>
+        )}
+        {!isClassification && (
+          <button
+            onClick={() => setActiveTab("worst_images")}
+            disabled={!hasPredictions}
+            className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
+              activeTab === "worst_images"
+                ? "border-blue-500 text-blue-600 dark:text-blue-400"
+                : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
+            } disabled:opacity-40 disabled:cursor-not-allowed`}
+          >
+            Worst Images
+          </button>
+        )}
         <button
           onClick={() => setActiveTab("near_duplicates")}
           className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
@@ -218,17 +225,19 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
         >
           Near Duplicates
         </button>
-        <button
-          onClick={() => setActiveTab("intelligence")}
-          disabled={!hasPredictions}
-          className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
-            activeTab === "intelligence"
-              ? "border-purple-500 text-purple-600 dark:text-purple-400"
-              : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
-          } disabled:opacity-40 disabled:cursor-not-allowed`}
-        >
-          Intelligence
-        </button>
+        {!isClassification && (
+          <button
+            onClick={() => setActiveTab("intelligence")}
+            disabled={!hasPredictions}
+            className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
+              activeTab === "intelligence"
+                ? "border-purple-500 text-purple-600 dark:text-purple-400"
+                : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
+            } disabled:opacity-40 disabled:cursor-not-allowed`}
+          >
+            Intelligence
+          </button>
+        )}
       </div>
 
       {activeTab === "overview" && (
@@ -246,7 +255,7 @@ export function StatsDashboard({ datasetId }: StatsDashboardProps) {
                 <SkeletonCard />
               </div>
             ) : (
-              <AnnotationSummary summary={filteredStats.summary} />
+              <AnnotationSummary summary={filteredStats.summary} datasetType={datasetType} />
             )}
           </section>
 

From bfac740ecc05712457804589205a09f6f6a488c6 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:28:46 -0500
Subject: [PATCH 16/38] docs(15-02): complete classification frontend display
 plan

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/STATE.md                            |  14 ++-
 .../15-02-SUMMARY.md                          | 111 ++++++++++++++++++
 2 files changed, 119 insertions(+), 6 deletions(-)
 create mode 100644 .planning/phases/15-classification-ingestion-display/15-02-SUMMARY.md

diff --git a/.planning/STATE.md b/.planning/STATE.md
index e5f0774..1f206e1 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -9,12 +9,12 @@ See: .planning/PROJECT.md (updated 2026-02-18)
 
 ## Current Position
 
-Phase: 15 of 17 (Classification Ingestion & Display)
-Plan: 1 of 2 in current phase
-Status: Executing
-Last activity: 2026-02-18 -- Completed 15-01 (Classification Ingestion Backend)
+Phase: 15 of 17 (Classification Ingestion & Display) -- COMPLETE
+Plan: 2 of 2 in current phase
+Status: Phase Complete
+Last activity: 2026-02-18 -- Completed 15-02 (Classification Frontend Display)
 
-Progress: [##########################..] 90% (v1.0 + v1.1 complete, v1.2 phase 15 plan 1 done)
+Progress: [############################] 93% (v1.0 + v1.1 complete, v1.2 phase 15 done)
 
 ## Performance Metrics
 
@@ -41,6 +41,8 @@ Recent decisions affecting current work:
 - Parser registry in IngestionService for format dispatch
 - Classification JSONL layouts checked before COCO (more specific first)
 - Classification gt_annotations = COUNT(DISTINCT sample_id) for labeled images
+- [Phase 15]: Thread datasetType from page level, branch at component boundaries with isClassification flag
+- [Phase 15]: Hide detection-only stats tabs for classification (Evaluation, Error Analysis, Worst Images, Intelligence)
 
 ### Pending Todos
 
@@ -60,5 +62,5 @@ None.
 ## Session Continuity
 
 Last session: 2026-02-18
-Stopped at: Completed 15-01-PLAN.md (Classification Ingestion Backend)
+Stopped at: Completed 15-02-PLAN.md (Classification Frontend Display) -- Phase 15 complete
 Resume file: None
diff --git a/.planning/phases/15-classification-ingestion-display/15-02-SUMMARY.md b/.planning/phases/15-classification-ingestion-display/15-02-SUMMARY.md
new file mode 100644
index 0000000..3127e00
--- /dev/null
+++ b/.planning/phases/15-classification-ingestion-display/15-02-SUMMARY.md
@@ -0,0 +1,111 @@
+---
+phase: 15-classification-ingestion-display
+plan: 02
+subsystem: frontend, ui
+tags: [classification, react, tanstack-query, dataset-type, class-badge, dropdown-editor]
+
+requires:
+  - phase: 15-classification-ingestion-display
+    plan: 01
+    provides: "dataset_type field, PATCH /annotations/{id}/category, classification-aware statistics"
+provides:
+  - ClassBadge grid overlay for classification datasets
+  - Class label dropdown editor in detail modal with PATCH mutation
+  - Classification-aware statistics dashboard (hidden detection tabs)
+  - Classification-appropriate summary card labels
+  - Format badge in scan results for classification JSONL
+affects: [16-classification-evaluation, frontend-polish]
+
+tech-stack:
+  added: []
+  patterns: [datasetType-prop-threading, isClassification-branching-at-component-boundaries]
+
+key-files:
+  created: []
+  modified:
+    - frontend/src/types/dataset.ts
+    - frontend/src/app/datasets/[datasetId]/page.tsx
+    - frontend/src/components/grid/grid-cell.tsx
+    - frontend/src/components/grid/image-grid.tsx
+    - frontend/src/components/ingest/scan-results.tsx
+    - frontend/src/components/detail/sample-modal.tsx
+    - frontend/src/components/detail/annotation-list.tsx
+    - frontend/src/components/stats/stats-dashboard.tsx
+    - frontend/src/components/stats/annotation-summary.tsx
+
+key-decisions:
+  - "Thread datasetType from page level, branch at component boundaries with isClassification flag"
+  - "Hide entire edit toolbar and annotation editor for classification (no bbox editing needed)"
+  - "Hide Evaluation, Error Analysis, Worst Images, and Intelligence tabs for classification (IoU-based)"
+  - "Keep Near Duplicates tab visible for classification (embedding-based, not IoU-dependent)"
+
+patterns-established:
+  - "datasetType prop threading: page fetches dataset, threads type to all children"
+  - "isClassification branching: components check datasetType === 'classification' to show/hide detection UI"
+
+duration: 5min
+completed: 2026-02-18
+---
+
+# Phase 15 Plan 02: Classification Frontend Display Summary
+
+**Classification-aware grid badges, modal class dropdown editor, and detection-tab hiding via datasetType prop threading**
+
+## Performance
+
+- **Duration:** 5 min
+- **Started:** 2026-02-19T02:20:50Z
+- **Completed:** 2026-02-19T02:25:44Z
+- **Tasks:** 2
+- **Files modified:** 9
+
+## Accomplishments
+- Grid shows class label badges instead of bbox overlays for classification datasets
+- Detail modal displays class dropdown editor with PATCH category mutation and predicted class with confidence
+- Statistics dashboard hides detection-only tabs (Evaluation, Error Analysis, Worst Images, Intelligence)
+- Summary cards show "Labeled Images" and "Classes" labels for classification datasets
+- Annotation list hides bbox and area columns for classification
+- Scan results show "Classification JSONL" format badge
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Types, page threading, grid class badges, and scan results format badge** - `b96ce5e` (feat)
+2. **Task 2: Detail modal class label display/edit and classification-aware statistics** - `e7ad776` (feat)
+
+## Files Created/Modified
+- `frontend/src/types/dataset.ts` - Added dataset_type field to Dataset interface
+- `frontend/src/app/datasets/[datasetId]/page.tsx` - Thread datasetType prop to ImageGrid, SampleModal, StatsDashboard
+- `frontend/src/components/grid/grid-cell.tsx` - ClassBadge component, classification branching in overlay
+- `frontend/src/components/grid/image-grid.tsx` - datasetType prop acceptance and passthrough
+- `frontend/src/components/ingest/scan-results.tsx` - "Classification JSONL" friendly format badge
+- `frontend/src/components/detail/sample-modal.tsx` - Class dropdown editor, PATCH mutation, hide bbox editor/toolbar
+- `frontend/src/components/detail/annotation-list.tsx` - Hide bbox/area columns for classification
+- `frontend/src/components/stats/stats-dashboard.tsx` - Hide detection-only tabs for classification
+- `frontend/src/components/stats/annotation-summary.tsx` - Classification card labels (Labeled Images, Classes)
+
+## Decisions Made
+- Thread datasetType from page level, branch at component boundaries -- consistent pattern, easy to test
+- Hide entire edit toolbar and annotation editor for classification (no bounding boxes to edit)
+- Hide Evaluation/Error Analysis/Worst Images/Intelligence tabs for classification (all IoU-based detection features)
+- Keep Near Duplicates tab visible for classification since it uses embeddings, not IoU
+
+## Deviations from Plan
+
+None - plan executed exactly as written.
+
+## Issues Encountered
+None
+
+## User Setup Required
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Frontend fully supports classification dataset display, ready for classification evaluation in Phase 16
+- datasetType prop threading pattern established for any future dataset-type-specific UI
+- PATCH /annotations/{id}/category wired end-to-end for label editing
+
+---
+*Phase: 15-classification-ingestion-display*
+*Completed: 2026-02-18*

From f522fda18a0a6b0bae7f218477df51fe6aca4f54 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:33:02 -0500
Subject: [PATCH 17/38] docs(phase-15): complete phase execution

---
 .planning/REQUIREMENTS.md                     | 32 +++---
 .planning/ROADMAP.md                          |  8 +-
 .../15-VERIFICATION.md                        | 98 +++++++++++++++++++
 3 files changed, 118 insertions(+), 20 deletions(-)
 create mode 100644 .planning/phases/15-classification-ingestion-display/15-VERIFICATION.md

diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md
index 665e268..524809f 100644
--- a/.planning/REQUIREMENTS.md
+++ b/.planning/REQUIREMENTS.md
@@ -9,17 +9,17 @@ Requirements for classification dataset support. Each maps to roadmap phases.
 
 ### Ingestion
 
-- [ ] **INGEST-01**: User can import a classification dataset from a directory containing JSONL annotations and images
-- [ ] **INGEST-02**: System auto-detects dataset type (detection vs classification) from annotation format during import
-- [ ] **INGEST-03**: User can import multi-split classification datasets (train/valid/test) in a single operation
-- [ ] **INGEST-04**: Schema stores dataset_type on the datasets table and handles classification annotations without bbox values
+- [x] **INGEST-01**: User can import a classification dataset from a directory containing JSONL annotations and images
+- [x] **INGEST-02**: System auto-detects dataset type (detection vs classification) from annotation format during import
+- [x] **INGEST-03**: User can import multi-split classification datasets (train/valid/test) in a single operation
+- [x] **INGEST-04**: Schema stores dataset_type on the datasets table and handles classification annotations without bbox values
 
 ### Display
 
-- [ ] **DISP-01**: User sees class label badges on grid thumbnails for classification datasets
-- [ ] **DISP-02**: User sees class label (GT and prediction) prominently in the sample detail modal
-- [ ] **DISP-03**: User can edit the GT class label via dropdown in the detail modal
-- [ ] **DISP-04**: Statistics dashboard shows classification-appropriate metrics (labeled images, class distribution) and hides detection-only elements (bbox area, IoU slider)
+- [x] **DISP-01**: User sees class label badges on grid thumbnails for classification datasets
+- [x] **DISP-02**: User sees class label (GT and prediction) prominently in the sample detail modal
+- [x] **DISP-03**: User can edit the GT class label via dropdown in the detail modal
+- [x] **DISP-04**: Statistics dashboard shows classification-appropriate metrics (labeled images, class distribution) and hides detection-only elements (bbox area, IoU slider)
 
 ### Evaluation
 
@@ -66,14 +66,14 @@ Which phases cover which requirements. Updated during roadmap creation.
 
 | Requirement | Phase | Status |
 |-------------|-------|--------|
-| INGEST-01 | Phase 15 | Pending |
-| INGEST-02 | Phase 15 | Pending |
-| INGEST-03 | Phase 15 | Pending |
-| INGEST-04 | Phase 15 | Pending |
-| DISP-01 | Phase 15 | Pending |
-| DISP-02 | Phase 15 | Pending |
-| DISP-03 | Phase 15 | Pending |
-| DISP-04 | Phase 15 | Pending |
+| INGEST-01 | Phase 15 | Done |
+| INGEST-02 | Phase 15 | Done |
+| INGEST-03 | Phase 15 | Done |
+| INGEST-04 | Phase 15 | Done |
+| DISP-01 | Phase 15 | Done |
+| DISP-02 | Phase 15 | Done |
+| DISP-03 | Phase 15 | Done |
+| DISP-04 | Phase 15 | Done |
 | EVAL-01 | Phase 16 | Pending |
 | EVAL-02 | Phase 16 | Pending |
 | EVAL-03 | Phase 16 | Pending |
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index b084712..aeb64ac 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -88,10 +88,10 @@
   3. User sees class label badges on grid thumbnails instead of bounding box overlays when browsing a classification dataset
   4. User sees GT class label prominently in the sample detail modal and can change it via a dropdown
   5. Statistics dashboard shows classification-appropriate metrics (labeled images count, class distribution) with no detection-only elements visible (no bbox area histogram, no IoU slider)
-**Plans**: 2 plans
+**Plans**: 2 plans (complete)
 Plans:
-- [ ] 15-01-PLAN.md -- Backend: schema migration, ClassificationJSONLParser, FolderScanner detection, IngestionService dispatch, API endpoints
-- [ ] 15-02-PLAN.md -- Frontend: type updates, grid class badges, detail modal class label/dropdown, classification-aware statistics
+- [x] 15-01-PLAN.md -- Backend: schema migration, ClassificationJSONLParser, FolderScanner detection, IngestionService dispatch, API endpoints
+- [x] 15-02-PLAN.md -- Frontend: type updates, grid class badges, detail modal class label/dropdown, classification-aware statistics
 
 #### Phase 16: Classification Evaluation
 **Goal**: Users can import predictions and analyze classification model performance with accuracy, F1, confusion matrix, and error categorization
@@ -134,6 +134,6 @@ Plans:
 | 12. Interactive Viz & Discovery | v1.1 | 3/3 | Complete | 2026-02-13 |
 | 13. Keyboard Shortcuts | v1.1 | 2/2 | Complete | 2026-02-13 |
 | 14. Per-Annotation Triage | v1.1 | 3/3 | Complete | 2026-02-13 |
-| 15. Classification Ingestion & Display | v1.2 | 0/2 | Planned | - |
+| 15. Classification Ingestion & Display | v1.2 | 2/2 | Complete | 2026-02-18 |
 | 16. Classification Evaluation | v1.2 | 0/TBD | Not started | - |
 | 17. Classification Polish | v1.2 | 0/TBD | Not started | - |
diff --git a/.planning/phases/15-classification-ingestion-display/15-VERIFICATION.md b/.planning/phases/15-classification-ingestion-display/15-VERIFICATION.md
new file mode 100644
index 0000000..58773d9
--- /dev/null
+++ b/.planning/phases/15-classification-ingestion-display/15-VERIFICATION.md
@@ -0,0 +1,98 @@
+---
+phase: 15-classification-ingestion-display
+verified: 2026-02-19T02:31:00Z
+status: passed
+score: 5/5 must-haves verified
+re_verification: false
+---
+
+# Phase 15: Classification Ingestion & Display Verification Report
+
+**Phase Goal:** Users can import, browse, and inspect classification datasets with the same ease as detection datasets
+**Verified:** 2026-02-19T02:31:00Z
+**Status:** passed
+**Re-verification:** No — initial verification
+
+## Goal Achievement
+
+### Observable Truths
+
+| #  | Truth | Status | Evidence |
+|----|-------|--------|----------|
+| 1  | User can point the ingestion wizard at a folder with JSONL annotations and images, and the system auto-detects it as a classification dataset | VERIFIED | `FolderScanner._try_layout_d` and `_try_layout_e` detect JSONL layouts before COCO; `_is_classification_jsonl` heuristic reads first 5 lines for filename+label keys. GCS path also supported via `_scan_gcs_classification`. `ScanResult.format="classification_jsonl"` returned. |
+| 2  | User can import multi-split classification datasets (train/valid/test) in a single operation, just like detection datasets | VERIFIED | `ImportRequest.format` field added (default `"coco"`, accepts `"classification_jsonl"`). `ingest_splits_with_progress(format=request.format)` threads format into per-split calls. `IngestionService` dispatches to `ClassificationJSONLParser` by format string. `dataset_type="classification"` stored in INSERT. |
+| 3  | User sees class label badges on grid thumbnails instead of bounding box overlays when browsing a classification dataset | VERIFIED | `GridCell` accepts `datasetType?: string`; when `"classification"` renders `<ClassBadge>` with GT `category_name` instead of `<AnnotationOverlay>`. `ImageGrid` threads `datasetType` through. Page threads `dataset.dataset_type` to `<ImageGrid>`. |
+| 4  | User sees GT class label prominently in the sample detail modal and can change it via a dropdown | VERIFIED | `SampleModal` shows `{isClassification && <div>Class: <select>}` at line 424. Dropdown uses `useFilterFacets` for category list. On change, fires `patchCategory.mutate({ annotationId, category_name })` which calls `PATCH /annotations/{id}/category`. Predicted class with confidence also shown. Bbox editor and edit toolbar are hidden for classification. |
+| 5  | Statistics dashboard shows classification-appropriate metrics (labeled images count, class distribution) with no detection-only elements visible (no bbox area histogram, no IoU slider) | VERIFIED | `StatsDashboard` sets `isClassification = datasetType === "classification"`. Evaluation, Error Analysis, Worst Images, and Intelligence tabs all wrapped in `{!isClassification && ...}`. `AnnotationSummary` uses `CLASSIFICATION_CARDS` with "Labeled Images" / "Classes" labels. Backend `gt_annotations` stat uses `COUNT(DISTINCT sample_id)` for classification. |
+
+**Score:** 5/5 truths verified
+
+### Required Artifacts
+
+| Artifact | Expected | Status | Details |
+|----------|----------|--------|---------|
+| `app/ingestion/classification_jsonl_parser.py` | ClassificationJSONLParser extending BaseParser | VERIFIED | Class exists, `format_name` returns `"classification_jsonl"`, sentinel bbox values (all 0.0), multi-label array support, flexible key lookups |
+| `app/repositories/duckdb_repo.py` | dataset_type column migration | VERIFIED | `ALTER TABLE datasets ADD COLUMN IF NOT EXISTS dataset_type VARCHAR DEFAULT 'detection'` in `initialize_schema()` |
+| `app/services/folder_scanner.py` | Classification JSONL layout detection | VERIFIED | `_try_layout_d`, `_try_layout_e`, `_is_classification_jsonl` static method, GCS equivalent `_scan_gcs_classification`; classification checked before COCO |
+| `frontend/src/types/dataset.ts` | Dataset type with dataset_type field | VERIFIED | `dataset_type: string;` field present in Dataset interface |
+| `frontend/src/components/grid/grid-cell.tsx` | ClassBadge rendering for classification datasets | VERIFIED | `ClassBadge` component defined, branching at line 100: `datasetType === "classification"` shows badge, else shows overlay |
+| `frontend/src/components/detail/sample-modal.tsx` | Class label display and dropdown editor | VERIFIED | `isClassification` flag drives conditional: plain image (no editor), class dropdown with `patchCategory` mutation, predicted class display |
+| `frontend/src/components/stats/stats-dashboard.tsx` | Detection-only tab hiding for classification | VERIFIED | `isClassification` flag; Evaluation, Error Analysis, Worst Images, Intelligence all in `{!isClassification && ...}` blocks |
+
+### Key Link Verification
+
+| From | To | Via | Status | Details |
+|------|----|-----|--------|---------|
+| `app/services/folder_scanner.py` | `app/models/scan.py` | `ScanResult(format="classification_jsonl", ...)` | WIRED | `format="classification_jsonl"` literal in scan() return |
+| `app/services/ingestion.py` | `app/ingestion/classification_jsonl_parser.py` | format-based dispatch | WIRED | `if format == "classification_jsonl": parser = ClassificationJSONLParser(batch_size=1000)` |
+| `app/services/ingestion.py` | `app/repositories/duckdb_repo.py` | stores `dataset_type` on INSERT | WIRED | `INSERT INTO datasets (... dataset_type) VALUES (... ?)` with `dataset_type = "classification" if format == "classification_jsonl" else "detection"` |
+| `frontend/src/app/datasets/[datasetId]/page.tsx` | `frontend/src/components/grid/image-grid.tsx` | `datasetType` prop threading | WIRED | `<ImageGrid datasetId={datasetId} datasetType={dataset?.dataset_type} />` |
+| `frontend/src/components/grid/grid-cell.tsx` | `frontend/src/types/dataset.ts` | `dataset_type` determines badge vs overlay | WIRED | `datasetType === "classification"` branch in render; type sourced from `Dataset.dataset_type` |
+| `frontend/src/components/detail/sample-modal.tsx` | `PATCH /annotations/{id}/category` | `patchCategory` TanStack mutation | WIRED | ``apiPatch(`/annotations/${annotationId}/category`, { category_name })`` in `useMutation`; invalidates annotation queries on success |
+| `app/routers/ingestion.py` | `app/services/ingestion.py` | format passthrough | WIRED | `ingest_splits_with_progress(splits=request.splits, dataset_name=request.dataset_name, format=request.format)` |
+| `app/routers/annotations.py` | DuckDB | `UPDATE annotations SET category_name` | WIRED | `PATCH /{annotation_id}/category` endpoint executes UPDATE and returns `{"updated": annotation_id, "category_name": body.category_name}` |
+
+### Requirements Coverage
+
+All phase goal requirements are satisfied. The REQUIREMENTS.md traceability table maps INGEST-01 through INGEST-04 and DISP-01 through DISP-04 to Phase 15; all eight are marked Done.
+
+### Anti-Patterns Found
+
+None. No TODOs, FIXMEs, placeholder returns, or empty implementations found in any modified files.
+
+### Human Verification Required
+
+The following items cannot be verified programmatically:
+
+#### 1. End-to-end classification import wizard flow
+
+**Test:** Point the ingestion wizard at a folder containing train/valid/test split directories with `.jsonl` annotation files and image files. Complete the import.
+**Expected:** Wizard shows "Classification JSONL" format badge, import completes with `dataset_type="classification"` stored, grid shows class label badges instead of bounding boxes.
+**Why human:** Full UI wizard flow with real files; scanner heuristic requires actual JSONL content on disk.
+
+#### 2. Category dropdown in modal populates all classes
+
+**Test:** Open a sample from a classification dataset in the detail modal. Verify the class dropdown contains all categories from the dataset.
+**Expected:** Dropdown shows all classes; selecting a different class persists the change (badge in grid updates after modal close and refresh).
+**Why human:** Requires live data; involves API round-trip, cache invalidation timing, and visual confirmation.
+
+#### 3. Statistics overview for classification dataset
+
+**Test:** Navigate to the Statistics tab for a classification dataset.
+**Expected:** Shows only Overview and Near Duplicates tabs; summary cards show "Labeled Images" (not "GT Annotations") and "Classes" (not "Categories"); no Evaluation, Error Analysis, Worst Images, or Intelligence tabs visible.
+**Why human:** Visual tab rendering requires browser; also verifies that the IoU slider and bbox area histogram are absent.
+
+### Gaps Summary
+
+No gaps found. All automated checks passed at all three levels (exists, substantive, wired).
+
+**Backend (15-01):** `ClassificationJSONLParser` is fully implemented with sentinel bbox values, flexible key lookups, and multi-label support. `FolderScanner` detects classification JSONL layouts D and E (split-dir and flat) before attempting COCO layouts; GCS is also supported. `IngestionService` dispatches by format string and stores `dataset_type`. `PATCH /annotations/{id}/category` endpoint is real and updates DuckDB. Statistics endpoint uses `COUNT(DISTINCT sample_id)` for classification. All dataset API endpoints return `dataset_type`.
+
+**Frontend (15-02):** `dataset_type` flows from API → `Dataset` type → page → `ImageGrid`/`SampleModal`/`StatsDashboard`. `GridCell` shows `ClassBadge` for classification and `AnnotationOverlay` for detection. Modal shows plain image with class dropdown (backed by real TanStack mutation to `PATCH /annotations/{id}/category`) for classification, and the full bbox editor for detection. Stats hides four detection-only tabs. `AnnotationSummary` uses correct card labels per dataset type. Scan results show "Classification JSONL" badge.
+
+**Commits verified:** 5264e51, 8af8a11, b96ce5e, e7ad776 — all exist in git history.
+
+---
+
+_Verified: 2026-02-19T02:31:00Z_
+_Verifier: Claude (gsd-verifier)_

From e148018d1d9b86e99e3e7f352ecf71049917d7fa Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:53:26 -0500
Subject: [PATCH 18/38] docs(16): research phase domain

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../16-RESEARCH.md                            | 439 ++++++++++++++++++
 1 file changed, 439 insertions(+)
 create mode 100644 .planning/phases/16-classification-evaluation/16-RESEARCH.md

diff --git a/.planning/phases/16-classification-evaluation/16-RESEARCH.md b/.planning/phases/16-classification-evaluation/16-RESEARCH.md
new file mode 100644
index 0000000..aa3271b
--- /dev/null
+++ b/.planning/phases/16-classification-evaluation/16-RESEARCH.md
@@ -0,0 +1,439 @@
+# Phase 16: Classification Evaluation - Research
+
+**Researched:** 2026-02-18
+**Domain:** Classification metrics computation, prediction import, confusion matrix, frontend evaluation UI
+**Confidence:** HIGH (internal codebase extension using established patterns, no new libraries)
+
+## Summary
+
+Phase 16 adds classification model evaluation to a codebase that already has a mature detection evaluation pipeline. The existing detection evaluation (`app/services/evaluation.py`, ~560 lines) computes IoU-based PR curves, mAP via supervision, and detection confusion matrices -- all irrelevant for classification. Classification evaluation is fundamentally simpler: no spatial matching, no IoU thresholds, just per-sample label comparison between ground truth and predicted class.
+
+The work spans five areas: (1) classification prediction import -- a new JSONL parser and format option for the existing prediction import dialog, (2) a new `compute_classification_evaluation` service (~50-80 lines) that computes accuracy, F1, per-class precision/recall, and confusion matrix from DuckDB queries, (3) a new `classify_errors` function for error analysis (correct/misclassified/missing), (4) frontend evaluation panel that shows classification-appropriate metrics instead of detection mAP/PR curves, and (5) GT vs predicted label display on grid thumbnails and the detail modal.
+
+The codebase already has all infrastructure needed: the `annotations` table stores classification GT with sentinel bbox values (0.0), the `source` column distinguishes GT from predictions, `dataset_type` is stored on the dataset, and the frontend threads `datasetType` through components. Phase 15 already hid the detection-only Evaluation/Error Analysis tabs for classification -- this phase un-hides them with classification-specific implementations.
+
+**Primary recommendation:** Create a separate `compute_classification_evaluation` function (not modify the detection one), a `ClassificationPredictionParser` for JSONL prediction import, and classification-specific frontend components/views that coexist alongside the detection evaluation pipeline. Route between them based on `dataset_type` at the API and component levels.
+
+## Standard Stack
+
+### Core (already in use -- no new dependencies)
+
+| Library | Purpose | Status |
+|---------|---------|--------|
+| DuckDB | SQL queries for metric computation, confusion matrix | In use |
+| FastAPI | API endpoints | In use |
+| Pydantic | Response models | In use |
+| NumPy | Metric calculation (F1, precision, recall) | In use |
+| Python `json` | JSONL prediction parsing | In use |
+| Next.js + React | Frontend framework | In use |
+| TanStack Query | Data fetching hooks | In use |
+| Recharts | Charts (class distribution bars, confusion matrix) | In use |
+| Zustand | State management (filter store for cell click) | In use |
+
+### Supporting (no new libraries needed)
+
+Classification metrics (accuracy, F1, precision, recall, confusion matrix) are simple enough to compute with NumPy or pure Python from DuckDB query results. scikit-learn would be a natural choice for `classification_report` and `confusion_matrix`, but it is NOT in the current dependency tree and would be overkill for what amounts to ~30 lines of counting logic. The existing supervision library is detection-focused and does not provide classification metrics.
+
+### Alternatives Considered
+
+| Instead of | Could Use | Tradeoff |
+|------------|-----------|----------|
+| Custom NumPy classification metrics | scikit-learn `classification_report` | sklearn is ~50MB, adds a heavy dependency for 30 lines of logic. Custom is simple enough. |
+| Separate classification eval function | Modify existing `compute_evaluation` | Detection eval is 560 lines of IoU matching. Adding classification branches would pollute it. Separate function is cleaner. |
+| New `/classification-evaluation` endpoint | Reuse `/evaluation` endpoint with routing | Reusing existing endpoint keeps frontend simpler -- same hook, different response shape. Router can dispatch based on `dataset_type`. |
+| Separate confusion matrix component | Reuse existing `ConfusionMatrix` component | Existing component already works generically (labels + matrix). No "background" class for classification, but the component handles any label set. Reuse directly. |
+
+## Architecture Patterns
+
+### Recommended Change Map
+
+```
+Backend:
+  app/ingestion/
+    classification_prediction_parser.py  # NEW: parse JSONL predictions with confidence
+  app/services/
+    classification_evaluation.py         # NEW: accuracy, F1, confusion matrix, per-class P/R/F1
+    classification_error_analysis.py     # NEW: correct/misclassified/missing categorization
+  app/models/
+    classification_evaluation.py         # NEW: Pydantic response models
+    prediction.py                        # MODIFY: add "classification_jsonl" format option
+  app/routers/
+    datasets.py                          # MODIFY: add classification_jsonl prediction import
+    statistics.py                        # MODIFY: route evaluation endpoint by dataset_type
+
+Frontend:
+  types/
+    evaluation.ts                        # MODIFY: add ClassificationEvaluationResponse type
+    prediction.ts                        # MODIFY: add "classification_jsonl" format
+    error-analysis.ts                    # MODIFY: add classification error types
+  hooks/
+    use-evaluation.ts                    # MODIFY: or create use-classification-evaluation.ts
+    use-error-analysis.ts               # MODIFY: or create classification variant
+  components/stats/
+    stats-dashboard.tsx                  # MODIFY: un-hide Evaluation/ErrorAnalysis for classification
+    evaluation-panel.tsx                 # MODIFY: branch on datasetType, render classification metrics
+    classification-metrics-cards.tsx     # NEW: accuracy, macro F1, weighted F1 cards
+    classification-per-class-table.tsx   # NEW: per-class P/R/F1 table
+    error-analysis-panel.tsx            # MODIFY: branch on datasetType
+  components/grid/
+    grid-cell.tsx                        # MODIFY: show predicted label badge alongside GT
+  components/detail/
+    sample-modal.tsx                     # ALREADY shows GT vs predicted for classification
+    prediction-import-dialog.tsx         # MODIFY: add classification_jsonl format option
+```
+
+### Pattern 1: Classification Prediction JSONL Format
+
+**What:** Classification predictions as JSONL, one record per image, with a filename, a predicted label, and a confidence score.
+**When to use:** Importing classification model outputs.
+
+```jsonl
+{"filename": "img_001.jpg", "label": "cat", "confidence": 0.95}
+{"filename": "img_002.jpg", "label": "dog", "confidence": 0.87}
+{"filename": "img_003.jpg", "label": "bird", "confidence": 0.72}
+```
+
+The parser reuses the same flexible key lookup from `ClassificationJSONLParser`:
+- Filename keys: `filename`, `file_name`, `image`, `path`
+- Label keys: `label`, `class`, `category`, `class_name`, `predicted_label`, `prediction`
+- Confidence keys: `confidence`, `score`, `probability`, `prob`
+
+Produces annotation rows with sentinel bbox values (0.0), `source = run_name`, and confidence score.
+
+### Pattern 2: Classification Evaluation Backend
+
+**What:** Pure SQL + minimal Python for classification metrics.
+**Why:** No IoU matching, no spatial reasoning -- classification eval is just label comparison.
+
+```python
+# Pseudocode for compute_classification_evaluation
+def compute_classification_evaluation(cursor, dataset_id, source, conf_threshold, split):
+    # 1. Query GT and prediction labels per sample
+    #    SELECT s.id, gt.category_name as gt_label, pred.category_name as pred_label, pred.confidence
+    #    FROM samples s
+    #    LEFT JOIN annotations gt ON gt.sample_id = s.id AND gt.source = 'ground_truth'
+    #    LEFT JOIN annotations pred ON pred.sample_id = s.id AND pred.source = ? AND pred.confidence >= ?
+    #    WHERE s.dataset_id = ?  (threshold lives in ON, not WHERE, so samples lacking a qualifying prediction are kept)
+
+    # 2. Compute confusion matrix: NxN array where N = len(unique_classes)
+    # 3. Derive from confusion matrix:
+    #    - Accuracy = trace(CM) / sum(CM)
+    #    - Per-class precision = CM[i,i] / sum(CM[:,i])
+    #    - Per-class recall = CM[i,i] / sum(CM[i,:])
+    #    - Per-class F1 = 2*P*R/(P+R)
+    #    - Macro F1 = mean(per_class_F1)
+    #    - Weighted F1 = weighted mean by support
+    # 4. Return ClassificationEvaluationResponse
+```
+
+### Pattern 3: Endpoint Routing by Dataset Type
+
+**What:** The existing `/evaluation` endpoint checks `dataset_type` and dispatches to the appropriate evaluation function.
+**Why:** Frontend uses the same `useEvaluation` hook but gets a response shaped for the dataset type.
+
+Two options:
+
+**Option A: Same endpoint, different response** -- The router checks `dataset_type` and calls either `compute_evaluation` or `compute_classification_evaluation`. The frontend receives a discriminated union type. **Downside:** TypeScript type narrowing is more complex.
+
+**Option B: Same endpoint, superset response** -- Return a response with optional fields. Classification omits `pr_curves` and uses `accuracy/f1` instead of `mAP`. **Downside:** Many optional fields.
+
+**Recommendation: Option A** with an `evaluation_type` discriminant field. The frontend `useEvaluation` hook returns `EvaluationResponse | ClassificationEvaluationResponse`, and components branch at the panel level (not per-widget). This is the same pattern used for `datasetType` branching elsewhere.
+
+### Pattern 4: Classification Confusion Matrix (No Background Class)
+
+**What:** Classification confusion matrix is simpler than detection: N x N where N = number of classes (no "background" row/col).
+**Why:** In detection, unmatched predictions and GTs map to "background". In classification, every sample has exactly one GT label and at most one predicted label -- there's no spatial mismatch.
+
+The existing `ConfusionMatrix` frontend component takes `matrix: number[][]` and `labels: string[]` -- it works unchanged. The difference is the labels array won't include "background".
+
+For click-to-filter: classification confusion cell samples are trivial -- just query samples where `gt_label = X AND pred_label = Y`. No IoU re-matching needed.
+
+### Pattern 5: Classification Error Analysis Categories
+
+**What:** For classification, error categories are:
+- **Correct** -- GT label matches predicted label
+- **Misclassified** -- GT label differs from predicted label (with confidence above threshold)
+- **Missing prediction** -- Sample has GT but no prediction (or prediction below threshold)
+
+**Why:** Detection error analysis uses TP/Hard FP/Label Error/FN based on IoU matching. Classification has no spatial dimension, so categories simplify to match/mismatch/absent.
+
+### Pattern 6: GT vs Predicted Badge on Grid Thumbnails
+
+**What:** For classification datasets with predictions, grid thumbnails show both GT and predicted labels.
+**Current state:** `grid-cell.tsx` shows a `ClassBadge` for GT only (line 101). Need to also show predicted label.
+
+```tsx
+// Current: only GT badge
+<ClassBadge label={annotations.find(a => a.source === "ground_truth")?.category_name} />
+
+// New: GT + Predicted, with visual differentiation
+<ClassBadge label={gtLabel} />
+{predLabel && (
+  <PredBadge label={predLabel} isCorrect={gtLabel === predLabel} />
+)}
+```
+
+Color coding: green border/bg when predicted matches GT, red when mismatch.
+
+### Anti-Patterns to Avoid
+
+- **Modifying existing detection evaluation code:** The detection eval is 560 lines of IoU-based matching. Do NOT add classification branches inside it. Write a separate function.
+- **Using IoU threshold for classification:** Classification has no spatial matching. The evaluation controls should hide the IoU slider for classification datasets.
+- **Importing predictions as detection annotations:** Classification predictions should use sentinel bbox (0.0) just like GT. They should NOT have bbox values.
+- **Separate /classification-evaluation endpoint:** This would require new frontend hooks. Better to reuse the existing endpoint and dispatch by dataset_type in the router.
+
+## Don't Hand-Roll
+
+| Problem | Don't Build | Use Instead | Why |
+|---------|-------------|-------------|-----|
+| Confusion matrix computation | Manual nested loop counting | NumPy 2D histogram or simple dict counting | Off-by-one errors, class index mapping bugs |
+| F1 score from P/R | Manual formula in each place | Single utility function `compute_f1(p, r)` | Avoid div-by-zero in multiple places |
+| Sample-to-label join | Python-side iteration | DuckDB JOIN query | Let the DB engine do the join, return results |
+
+**Key insight:** Classification eval is genuinely simple -- the danger is overcomplicating it, not undercomplicating it. A ~50-80 line Python function with a handful of NumPy operations is all that's needed.
+
+## Common Pitfalls
+
+### Pitfall 1: Division by Zero in Metrics
+**What goes wrong:** Per-class precision/recall/F1 can have zero denominators when a class has no predictions or no GT samples.
+**Why it happens:** Edge case: a class exists in GT but model never predicts it, or vice versa.
+**How to avoid:** Guard every division: `precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0`. Same for recall and F1.
+**Warning signs:** NaN values in the frontend metrics table.
+
+### Pitfall 2: Multi-Label vs Single-Label Confusion
+**What goes wrong:** Classification JSONL parser from Phase 15 supports multi-label (list of labels per image). If a sample has multiple GT labels, which one do you compare against the prediction?
+**Why it happens:** The parser emits one annotation row per label for multi-label images.
+**How to avoid:** For evaluation, assume single-label classification. If a sample has multiple GT annotations, use the first one (or the one with highest confidence). Document this limitation.
+**Warning signs:** Inflated confusion matrix counts (same sample counted multiple times).
+
+### Pitfall 3: Sample ID Mismatch Between GT and Predictions
+**What goes wrong:** Prediction JSONL identifies images by filename, but sample IDs are derived from the GT JSONL line index, not from the filename. Predictions therefore need to be matched to samples by filename; a sample_id cannot be read off the prediction record directly.
+**Why it happens:** ClassificationJSONLParser generates sample IDs as `"{split}_{idx}"` or `"{idx}"`, not from filenames.
+**How to avoid:** The prediction parser must look up sample_id by filename, just like detection prediction import does. Build a `filename -> sample_id` lookup from the samples table.
+**Warning signs:** Zero predictions imported, or predictions attached to wrong samples.
+
+### Pitfall 4: Confidence Threshold with No Slider
+**What goes wrong:** If the confidence slider is hidden along with the IoU slider for classification, the default conf_threshold (0.25) still applies and silently filters out low-confidence predictions, with no way for the user to adjust it.
+**Why it happens:** Detection evaluation defaults to conf_threshold=0.25. Classification users might not realize predictions below 0.25 confidence are being excluded.
+**How to avoid:** Keep the confidence slider visible for classification. Only hide the IoU slider.
+**Warning signs:** "Missing prediction" count unexpectedly high.
+
+### Pitfall 5: Existing Tab Hiding Logic
+**What goes wrong:** Phase 15 hides Evaluation/Error Analysis/Worst Images/Intelligence tabs for classification datasets (`!isClassification` guard in stats-dashboard.tsx). This phase needs to un-hide Evaluation and Error Analysis with classification-specific content.
+**Why it happens:** Phase 15 correctly hid detection-only tabs. Phase 16 needs to selectively re-enable them.
+**How to avoid:** Change the guard from `!isClassification` to a more nuanced check: always show Evaluation and Error Analysis tabs, but render different content based on datasetType. Keep Worst Images and Intelligence hidden for now (they're detection-specific).
+**Warning signs:** Evaluation tab still hidden after Phase 16 changes.
+
+## Code Examples
+
+### Classification Evaluation Response Model
+
+```python
+# app/models/classification_evaluation.py
+from pydantic import BaseModel
+
+class ClassificationPerClassMetrics(BaseModel):
+    class_name: str
+    precision: float
+    recall: float
+    f1: float
+    support: int  # number of GT samples for this class
+
+class ClassificationEvaluationResponse(BaseModel):
+    accuracy: float
+    macro_f1: float
+    weighted_f1: float
+    per_class_metrics: list[ClassificationPerClassMetrics]
+    confusion_matrix: list[list[int]]
+    confusion_matrix_labels: list[str]
+    conf_threshold: float
+    # Discriminant field for frontend type narrowing
+    evaluation_type: str = "classification"
+```
+
+### Classification Confusion Cell Samples Query
+
+```python
+# Much simpler than detection -- no IoU re-matching needed
+def get_classification_confusion_cell_samples(
+    cursor, dataset_id, source, actual_class, predicted_class, conf_threshold, split=None
+):
+    """Return sample IDs where GT=actual_class and pred=predicted_class."""
+    # Build query with optional split filter
+    query = """
+        SELECT gt.sample_id
+        FROM annotations gt
+        JOIN annotations pred ON gt.sample_id = pred.sample_id AND gt.dataset_id = pred.dataset_id
+        WHERE gt.dataset_id = ? AND gt.source = 'ground_truth'
+        AND pred.source = ? AND pred.confidence >= ?
+        AND gt.category_name = ? AND pred.category_name = ?
+    """
+    params = [dataset_id, source, conf_threshold, actual_class, predicted_class]
+
+    if split:
+        query += " AND gt.sample_id IN (SELECT id FROM samples WHERE dataset_id = ? AND split = ?)"
+        params.extend([dataset_id, split])
+
+    rows = cursor.execute(query, params).fetchall()
+    return [r[0] for r in rows]
+```
+
+### Classification Prediction Parser
+
+```python
+# app/ingestion/classification_prediction_parser.py
+class ClassificationPredictionParser:
+    """Parse classification JSONL predictions into annotation rows."""
+
+    _LABEL_KEYS = ("label", "class", "category", "class_name", "predicted_label", "prediction")
+    _CONFIDENCE_KEYS = ("confidence", "score", "probability", "prob")
+    _FILENAME_KEYS = ("filename", "file_name", "image", "path")
+
+    def parse_streaming(self, file_path, sample_lookup, dataset_id, source="prediction", batch_size=5000):
+        """Yield DataFrames of prediction annotation rows.
+
+        sample_lookup: dict[filename, sample_id] from samples table
+        """
+        batch = []
+        for line in open(file_path, encoding="utf-8"):
+            record = json.loads(line.strip())
+            filename = _get_field(record, self._FILENAME_KEYS)
+            sample_id = sample_lookup.get(filename)
+            if not sample_id:
+                continue  # skip predictions for unknown files
+
+            label = _get_field(record, self._LABEL_KEYS)
+            confidence = _get_field(record, self._CONFIDENCE_KEYS)
+
+            batch.append({
+                "id": str(uuid.uuid4()),
+                "dataset_id": dataset_id,
+                "sample_id": sample_id,
+                "category_name": str(label),
+                "bbox_x": 0.0, "bbox_y": 0.0, "bbox_w": 0.0, "bbox_h": 0.0,
+                "area": 0.0, "is_crowd": False,
+                "source": source,
+                "confidence": float(confidence) if confidence is not None else None,
+                "metadata": None,
+            })
+
+            if len(batch) >= batch_size:
+                yield pd.DataFrame(batch)
+                batch = []
+
+        if batch:
+            yield pd.DataFrame(batch)
+```
+
+### Frontend Classification Evaluation Panel
+
+```tsx
+// Key pattern: branch at the panel level based on datasetType
+// in evaluation-panel.tsx or a wrapper component
+
+function ClassificationEvaluation({ data }: { data: ClassificationEvaluationResponse }) {
+  return (
+    <div className="space-y-6">
+      {/* Metric cards: Accuracy, Macro F1, Weighted F1 */}
+      <div className="grid grid-cols-3 gap-4">
+        <MetricCard label="Accuracy" value={data.accuracy} />
+        <MetricCard label="Macro F1" value={data.macro_f1} />
+        <MetricCard label="Weighted F1" value={data.weighted_f1} />
+      </div>
+
+      {/* Confusion matrix (reuse existing component -- no background class) */}
+      <ConfusionMatrix
+        matrix={data.confusion_matrix}
+        labels={data.confusion_matrix_labels}
+        onCellClick={handleCellClick}
+      />
+
+      {/* Per-class P/R/F1 table */}
+      <ClassificationPerClassTable metrics={data.per_class_metrics} />
+    </div>
+  );
+}
+```
+
+### GT vs Predicted Badge on Grid
+
+```tsx
+// In grid-cell.tsx, extend the classification branch
+{datasetType === "classification" ? (
+  <>
+    <ClassBadge label={gtLabel} />
+    {predLabel && (
+      <div className={`absolute bottom-1 right-1 z-10 rounded px-1.5 py-0.5 text-[10px] font-semibold ${
+        predLabel === gtLabel
+          ? "bg-green-500/80 text-white"
+          : "bg-red-500/80 text-white"
+      }`}>
+        {predLabel}
+      </div>
+    )}
+  </>
+) : (
+  // existing detection overlay
+)}
+```
+
+## State of the Art
+
+| Old Approach | Current Approach | When Changed | Impact |
+|--------------|------------------|--------------|--------|
+| Detection eval only | Detection + Classification eval side-by-side | Phase 16 | Users can evaluate both dataset types |
+| Hidden eval tabs for classification | Classification-specific eval UI | Phase 16 | Full evaluation experience for classification |
+
+**Deprecated/outdated:**
+- Nothing deprecated. This is net-new functionality.
+
+## Open Questions
+
+1. **Multi-label classification evaluation**
+   - What we know: The ClassificationJSONLParser supports multi-label (list of labels per image). Evaluation metrics for multi-label (Hamming loss, subset accuracy) differ significantly from single-label.
+   - What's unclear: Does the user need multi-label evaluation now?
+   - Recommendation: Scope Phase 16 to single-label classification only. If a sample has multiple GT labels, use the first one. Add multi-label support in a future phase if needed.
+
+2. **Top-K accuracy display**
+   - What we know: Classification models often report top-1 and top-5 accuracy. Current JSONL format only has one predicted label per sample.
+   - What's unclear: Should the JSONL prediction format support multiple predicted labels with ranked confidence?
+   - Recommendation: Keep simple -- one prediction per sample for Phase 16. The `confidence` field already provides signal. Top-K can be added later by supporting prediction arrays.
+
+3. **PR curves for classification**
+   - What we know: PR curves are meaningful for classification (varying confidence threshold to trace precision/recall). Detection evaluation already has PR curve infrastructure.
+   - What's unclear: Are PR curves needed for Phase 16 or can they be deferred?
+   - Recommendation: Defer PR curves for classification. The requirements specify accuracy, F1, confusion matrix, and per-class P/R/F1 -- no PR curve charts. This significantly simplifies the phase.
+
+## Sources
+
+### Primary (HIGH confidence)
+- Codebase inspection: `app/services/evaluation.py` (560 lines, detection-only evaluation with IoU matching)
+- Codebase inspection: `app/services/error_analysis.py` (detection-specific error categorization)
+- Codebase inspection: `app/ingestion/classification_jsonl_parser.py` (Phase 15 classification GT parser)
+- Codebase inspection: `app/ingestion/prediction_parser.py` (COCO detection prediction parser)
+- Codebase inspection: `app/routers/statistics.py` (evaluation and error-analysis endpoints)
+- Codebase inspection: `app/routers/datasets.py` (prediction import endpoint, format dispatch)
+- Codebase inspection: `frontend/src/components/stats/stats-dashboard.tsx` (tab hiding logic)
+- Codebase inspection: `frontend/src/components/stats/evaluation-panel.tsx` (detection eval UI)
+- Codebase inspection: `frontend/src/components/stats/confusion-matrix.tsx` (reusable component)
+- Codebase inspection: `frontend/src/components/grid/grid-cell.tsx` (ClassBadge for GT label)
+- Codebase inspection: `frontend/src/components/detail/sample-modal.tsx` (already shows GT vs predicted for classification)
+- Codebase inspection: `frontend/src/hooks/use-evaluation.ts` (TanStack Query hook)
+- Codebase inspection: `frontend/src/hooks/use-confusion-cell.ts` (imperative fetch for cell click)
+- Codebase inspection: `frontend/src/types/evaluation.ts` (TypeScript response types)
+
+### Secondary (MEDIUM confidence)
+- Classification metrics formulas (accuracy, F1, macro/weighted averaging) are well-established ML fundamentals, not library-specific.
+
+### Tertiary (LOW confidence)
+- None. All findings are from direct codebase inspection.
+
+## Metadata
+
+**Confidence breakdown:**
+- Standard stack: HIGH - No new dependencies needed, all existing libraries sufficient
+- Architecture: HIGH - Clear separation pattern (separate classification eval function, existing component reuse, dataset_type routing)
+- Pitfalls: HIGH - Identified from direct codebase inspection (multi-label, sample ID matching, tab hiding, div-by-zero)
+
+**Research date:** 2026-02-18
+**Valid until:** 2026-03-18 (internal codebase patterns, stable)

From d6eef423815a702a08034e6ebf2048958d1c56a7 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 21:57:52 -0500
Subject: [PATCH 19/38] docs(16): create phase plan

---
 .planning/ROADMAP.md                          |   5 +-
 .../16-01-PLAN.md                             | 214 +++++++++++++++++
 .../16-02-PLAN.md                             | 217 ++++++++++++++++++
 3 files changed, 435 insertions(+), 1 deletion(-)
 create mode 100644 .planning/phases/16-classification-evaluation/16-01-PLAN.md
 create mode 100644 .planning/phases/16-classification-evaluation/16-02-PLAN.md

diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index aeb64ac..5a2643f 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -103,7 +103,10 @@ Plans:
   3. User sees a confusion matrix and can click any cell to filter the grid to images with that GT/predicted class pair
   4. User sees each image categorized as correct, misclassified, or missing prediction in the error analysis view
   5. User sees GT vs predicted label comparison on grid thumbnails and in the detail modal
-**Plans**: TBD
+**Plans**: 2 plans
+Plans:
+- [ ] 16-01-PLAN.md -- Backend: classification prediction parser, evaluation service, error analysis service, endpoint routing
+- [ ] 16-02-PLAN.md -- Frontend: types, hooks, prediction import dialog, evaluation panel, error analysis panel, grid badges
 
 #### Phase 17: Classification Polish
 **Goal**: Classification workflows are production-ready for high-cardinality datasets (43+ classes) with visual aids that surface actionable insights
diff --git a/.planning/phases/16-classification-evaluation/16-01-PLAN.md b/.planning/phases/16-classification-evaluation/16-01-PLAN.md
new file mode 100644
index 0000000..65c8603
--- /dev/null
+++ b/.planning/phases/16-classification-evaluation/16-01-PLAN.md
@@ -0,0 +1,214 @@
+---
+phase: 16-classification-evaluation
+plan: 01
+type: execute
+wave: 1
+depends_on: []
+files_modified:
+  - app/ingestion/classification_prediction_parser.py
+  - app/services/classification_evaluation.py
+  - app/services/classification_error_analysis.py
+  - app/models/classification_evaluation.py
+  - app/models/prediction.py
+  - app/routers/datasets.py
+  - app/routers/statistics.py
+  - app/routers/_run_name.py
+autonomous: true
+
+must_haves:
+  truths:
+    - "Classification predictions can be imported via POST /datasets/{id}/predictions with format=classification_jsonl"
+    - "GET /datasets/{id}/evaluation returns accuracy, F1, confusion matrix, per-class P/R/F1 for classification datasets"
+    - "GET /datasets/{id}/confusion-cell-samples returns sample IDs for a (gt_class, pred_class) pair without IoU matching"
+    - "GET /datasets/{id}/error-analysis returns correct/misclassified/missing counts for classification datasets"
+  artifacts:
+    - path: "app/ingestion/classification_prediction_parser.py"
+      provides: "JSONL prediction parser with sentinel bbox, filename-to-sample_id lookup"
+    - path: "app/services/classification_evaluation.py"
+      provides: "compute_classification_evaluation function returning accuracy, F1, confusion matrix, per-class metrics"
+    - path: "app/services/classification_error_analysis.py"
+      provides: "classify_errors function returning correct/misclassified/missing per sample"
+    - path: "app/models/classification_evaluation.py"
+      provides: "ClassificationEvaluationResponse, ClassificationPerClassMetrics Pydantic models"
+  key_links:
+    - from: "app/routers/datasets.py"
+      to: "app/ingestion/classification_prediction_parser.py"
+      via: "format == 'classification_jsonl' branch in import_predictions"
+      pattern: "classification_jsonl"
+    - from: "app/routers/statistics.py"
+      to: "app/services/classification_evaluation.py"
+      via: "dataset_type == 'classification' check in get_evaluation"
+      pattern: "compute_classification_evaluation"
+    - from: "app/routers/statistics.py"
+      to: "app/services/classification_error_analysis.py"
+      via: "dataset_type == 'classification' check in get_error_analysis"
+      pattern: "classify_errors"
+---
+
+<objective>
+Build classification prediction import, evaluation metrics, confusion matrix, and error analysis backend services.
+
+Purpose: Enable classification model evaluation by providing prediction import, metric computation (accuracy, F1, confusion matrix, per-class P/R/F1), and error categorization (correct/misclassified/missing) -- all routed from existing endpoints based on dataset_type.
+
+Output: Classification prediction parser, evaluation service, error analysis service, Pydantic response models, and updated API routing.
+</objective>
+
+<execution_context>
+@/Users/ortizeg/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/ortizeg/.claude/get-shit-done/templates/summary.md
+</execution_context>
+
+<context>
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/16-classification-evaluation/16-RESEARCH.md
+@.planning/phases/15-classification-ingestion-display/15-01-SUMMARY.md
+
+Key existing files to reference:
+@app/ingestion/classification_jsonl_parser.py  -- GT parser pattern to follow
+@app/ingestion/prediction_parser.py            -- COCO prediction parser for pattern reference
+@app/services/evaluation.py                    -- Detection evaluation (do NOT modify)
+@app/services/error_analysis.py                -- Detection error analysis (do NOT modify)
+@app/models/evaluation.py                      -- Detection evaluation response models
+@app/models/error_analysis.py                  -- Detection error analysis response models
+@app/models/prediction.py                      -- PredictionImportRequest (add format option)
+@app/routers/datasets.py                       -- import_predictions endpoint (add classification branch)
+@app/routers/statistics.py                     -- evaluation + error-analysis endpoints (add routing)
+@app/routers/_run_name.py                      -- run_name derivation (add classification_jsonl)
+</context>
+
+<tasks>
+
+<task type="auto">
+  <name>Task 1: Classification prediction parser and import endpoint</name>
+  <files>
+    app/ingestion/classification_prediction_parser.py
+    app/models/prediction.py
+    app/routers/datasets.py
+    app/routers/_run_name.py
+  </files>
+  <action>
+    1. Create `app/ingestion/classification_prediction_parser.py`:
+       - Class `ClassificationPredictionParser` following the same pattern as `ClassificationJSONLParser` (from Phase 15).
+       - Flexible key lookup: filename keys (`filename`, `file_name`, `image`, `path`), label keys (`label`, `class`, `category`, `class_name`, `predicted_label`, `prediction`), confidence keys (`confidence`, `score`, `probability`, `prob`).
+       - `parse_streaming(file_path, sample_lookup, dataset_id, source, batch_size=5000)` method that yields `pd.DataFrame` batches.
+       - `sample_lookup` is a `dict[str, str]` mapping filename -> sample_id, built by the caller from the samples table.
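+       - Each row: `id=uuid4`, `dataset_id`, `sample_id` from lookup, `category_name=label`, sentinel bbox values (0.0 for all), `area=0.0`, `is_crowd=False`, `source=source`, `confidence=float(confidence)` (or `None` when the record has no confidence key; a confidence of `0.0` is valid and must be stored as `0.0`, not treated as missing), `metadata=None`.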
+       - Skip lines where filename has no match in `sample_lookup` (silently).
+       - Use a helper `_get_field(record, keys)` like the GT parser does.
+
+    2. Update `app/models/prediction.py`:
+       - Add `"classification_jsonl"` to the `format` Literal type: `Literal["coco", "detection_annotation", "classification_jsonl"]`
+
+    3. Update `app/routers/datasets.py` `import_predictions`:
+       - Add an `elif request.format == "classification_jsonl":` branch after the existing `detection_annotation` branch.
+       - Build `sample_lookup: dict[str, str]` from `SELECT id, file_name FROM samples WHERE dataset_id = ?`.
+       - Instantiate `ClassificationPredictionParser()`.
+       - Call `parse_streaming(prediction_path, sample_lookup, dataset_id, source=run_name)`.
+       - Insert each batch with `INSERT INTO annotations SELECT * FROM batch_df`.
+       - Track `total_inserted` and `total_skipped` (the number of prediction lines in the file minus the number of rows inserted).
+       - Validate prediction_path exists and is a file (not a directory); raise 400 if not.
+
+    4. Update `app/routers/_run_name.py`:
+       - Add `if fmt == "classification_jsonl": return _from_coco(prediction_path)` (use file stem as run name, same as COCO).
+       - Place this check before the existing `if fmt == "detection_annotation":` check.
+
+    Important: Do NOT modify the existing detection prediction parsers or evaluation services.
+  </action>
+  <verify>
+    - `python -c "from app.ingestion.classification_prediction_parser import ClassificationPredictionParser; print('OK')"` succeeds
+    - `python -c "from app.models.prediction import PredictionImportRequest; r = PredictionImportRequest(prediction_path='/tmp/test.jsonl', format='classification_jsonl'); print(r.format)"` prints `classification_jsonl`
+    - App starts without import errors (run from the repository root so the `app` package resolves): `python -c "from app.routers.datasets import router; print('OK')"`
+  </verify>
+  <done>
+    - ClassificationPredictionParser parses JSONL predictions with flexible keys and sentinel bbox values
+    - PredictionImportRequest accepts "classification_jsonl" format
+    - import_predictions endpoint routes classification_jsonl format to the new parser
+    - Run name derivation handles classification_jsonl format
+  </done>
+</task>
+
+<task type="auto">
+  <name>Task 2: Classification evaluation and error analysis services with endpoint routing</name>
+  <files>
+    app/models/classification_evaluation.py
+    app/services/classification_evaluation.py
+    app/services/classification_error_analysis.py
+    app/routers/statistics.py
+  </files>
+  <action>
+    1. Create `app/models/classification_evaluation.py`:
+       - `ClassificationPerClassMetrics(BaseModel)`: `class_name: str`, `precision: float`, `recall: float`, `f1: float`, `support: int`
+       - `ClassificationEvaluationResponse(BaseModel)`: `accuracy: float`, `macro_f1: float`, `weighted_f1: float`, `per_class_metrics: list[ClassificationPerClassMetrics]`, `confusion_matrix: list[list[int]]`, `confusion_matrix_labels: list[str]`, `conf_threshold: float`, `evaluation_type: str = "classification"`
+
+    2. Create `app/services/classification_evaluation.py`:
+       - `compute_classification_evaluation(cursor, dataset_id, source, conf_threshold, split=None)` returning `ClassificationEvaluationResponse`.
+       - Query GT and prediction labels per sample via JOIN:
+         ```sql
+         SELECT s.id, gt.category_name as gt_label, pred.category_name as pred_label
+         FROM samples s
+         JOIN annotations gt ON gt.sample_id = s.id AND gt.dataset_id = s.dataset_id AND gt.source = 'ground_truth'
+         LEFT JOIN annotations pred ON pred.sample_id = s.id AND pred.dataset_id = s.dataset_id AND pred.source = ? AND pred.confidence >= ?
+         WHERE s.dataset_id = ?
+         ```
+         (Add `AND s.split = ?` if split is provided.)
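+       - For multi-label GT: if a sample has multiple GT annotations, collapse them to a single deterministic label (GROUP BY sample_id and take MIN(gt.category_name), i.e. the alphabetically first label, or use a DISTINCT ON equivalent). "First" here means first in sort order, since SQL gives no insertion-order guarantee without an explicit ORDER BY.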
+       - Build confusion matrix as dict-of-dicts, derive labels from sorted unique classes (both GT and predicted).
+       - Compute from confusion matrix: accuracy = trace/sum, per-class precision/recall/F1 with div-by-zero guards (return 0.0), macro F1 = mean of per-class F1, weighted F1 = sum(f1_i * support_i) / total_support.
+       - Use numpy for confusion matrix construction if convenient, or pure Python dict counting (either is fine for the scale).
+
+       - `get_classification_confusion_cell_samples(cursor, dataset_id, source, actual_class, predicted_class, conf_threshold, split=None)` returning `list[str]` of sample_ids.
+       - Simple query: JOIN gt and pred annotations WHERE gt.category_name = actual_class AND pred.category_name = predicted_class (with optional split filter).
+
+    3. Create `app/services/classification_error_analysis.py`:
+       - `classify_errors(cursor, dataset_id, source, conf_threshold, split=None)` returning `ErrorAnalysisResponse` (reuse the existing model from `app/models/error_analysis.py`).
+       - Query same GT+pred join as evaluation.
+       - Categories: "correct" (gt == pred), "misclassified" (gt != pred and pred is not None), "missing_prediction" (pred is None).
+       - Build `ErrorSummary` with: `true_positives` = correct count, `hard_false_positives` = 0 (not applicable), `label_errors` = misclassified count, `false_negatives` = missing_prediction count.
+       - Build `per_class` list as `PerClassErrors` for each class: tp = correct for that class, label_error = misclassified for that class, fn = missing for that class, hard_fp = 0.
+       - Build `samples_by_type` dict: keys "correct", "misclassified", "missing_prediction", values are lists of `ErrorSample` with sample_id, error_type, category_name (GT class), confidence.
+
+    4. Update `app/routers/statistics.py`:
+       - Import the new classification services and models.
+       - In `get_evaluation`: after verifying dataset exists, fetch `dataset_type` from the datasets table (add it to the existing SELECT). If `dataset_type == "classification"`, call `compute_classification_evaluation(cursor, dataset_id, source, conf_threshold, split=split)` instead of the detection one. Update the response_model to `EvaluationResponse | ClassificationEvaluationResponse` or remove the response_model constraint and let FastAPI infer.
+       - In `get_confusion_cell_samples_endpoint`: similarly check dataset_type. If classification, call `get_classification_confusion_cell_samples` (no iou_threshold needed). Return same `ConfusionCellSamplesResponse`.
+       - In `get_error_analysis`: check dataset_type. If classification, call `classify_errors` from the classification module (not the detection one). No iou_threshold needed.
+       - Important: The IoU threshold parameter is still accepted but ignored for classification datasets (backward compatible).
+  </action>
+  <verify>
+    - `python -c "from app.services.classification_evaluation import compute_classification_evaluation; print('OK')"` succeeds
+    - `python -c "from app.services.classification_error_analysis import classify_errors; print('OK')"` succeeds
+    - `python -c "from app.models.classification_evaluation import ClassificationEvaluationResponse; print('OK')"` succeeds
+    - App starts without import errors (run from the repository root so the `app` package resolves): `python -c "from app.routers.statistics import router; print('OK')"`
+  </verify>
+  <done>
+    - Classification evaluation returns accuracy, macro F1, weighted F1, per-class P/R/F1, confusion matrix
+    - Classification confusion cell samples returns sample IDs without IoU matching
+    - Classification error analysis returns correct/misclassified/missing categorization
+    - Existing endpoints route to classification services when dataset_type is "classification"
+    - Detection evaluation is completely untouched
+  </done>
+</task>
+
+</tasks>
+
+<verification>
+1. Import classification predictions: POST /datasets/{id}/predictions with format=classification_jsonl should insert annotation rows with sentinel bbox values
+2. Evaluate classification: GET /datasets/{id}/evaluation (for a classification dataset) should return accuracy, F1, confusion_matrix, per_class_metrics with evaluation_type="classification"
+3. Confusion cell drill-down: GET /datasets/{id}/confusion-cell-samples should return sample IDs for classification without IoU
+4. Error analysis: GET /datasets/{id}/error-analysis (for classification) should return correct/misclassified/missing categories
+5. Detection datasets: All existing detection evaluation endpoints should work exactly as before
+</verification>
+
+<success_criteria>
+- Classification predictions importable via existing endpoint with new format option
+- Evaluation endpoint returns classification-specific response for classification datasets
+- Confusion cell samples work without IoU matching for classification
+- Error analysis uses correct/misclassified/missing categories for classification
+- Zero changes to detection evaluation code paths
+</success_criteria>
+
+<output>
+After completion, create `.planning/phases/16-classification-evaluation/16-01-SUMMARY.md`
+</output>
diff --git a/.planning/phases/16-classification-evaluation/16-02-PLAN.md b/.planning/phases/16-classification-evaluation/16-02-PLAN.md
new file mode 100644
index 0000000..2c18861
--- /dev/null
+++ b/.planning/phases/16-classification-evaluation/16-02-PLAN.md
@@ -0,0 +1,217 @@
+---
+phase: 16-classification-evaluation
+plan: 02
+type: execute
+wave: 2
+depends_on: ["16-01"]
+files_modified:
+  - frontend/src/types/evaluation.ts
+  - frontend/src/types/prediction.ts
+  - frontend/src/types/error-analysis.ts
+  - frontend/src/hooks/use-evaluation.ts
+  - frontend/src/hooks/use-confusion-cell.ts
+  - frontend/src/components/detail/prediction-import-dialog.tsx
+  - frontend/src/components/stats/stats-dashboard.tsx
+  - frontend/src/components/stats/evaluation-panel.tsx
+  - frontend/src/components/stats/error-analysis-panel.tsx
+  - frontend/src/components/grid/grid-cell.tsx
+autonomous: true
+
+must_haves:
+  truths:
+    - "User can select 'Classification JSONL' format in the prediction import dialog and import predictions"
+    - "User sees Evaluation and Error Analysis tabs for classification datasets (un-hidden from Phase 15)"
+    - "User sees accuracy, macro F1, weighted F1 metric cards in the evaluation panel for classification"
+    - "User sees confusion matrix with click-to-filter for classification"
+    - "User sees per-class precision/recall/F1 table for classification"
+    - "User sees correct/misclassified/missing error categories in error analysis for classification"
+    - "User sees GT and predicted class badges on grid thumbnails for classification datasets with predictions"
+  artifacts:
+    - path: "frontend/src/types/evaluation.ts"
+      provides: "ClassificationEvaluationResponse type with discriminant field"
+    - path: "frontend/src/components/stats/evaluation-panel.tsx"
+      provides: "Classification evaluation rendering with metric cards, confusion matrix, per-class table"
+    - path: "frontend/src/components/stats/error-analysis-panel.tsx"
+      provides: "Classification error analysis with correct/misclassified/missing categories"
+    - path: "frontend/src/components/grid/grid-cell.tsx"
+      provides: "Predicted class badge alongside GT badge for classification"
+  key_links:
+    - from: "frontend/src/components/stats/evaluation-panel.tsx"
+      to: "/datasets/{id}/evaluation"
+      via: "useEvaluation hook"
+      pattern: "evaluation_type.*classification"
+    - from: "frontend/src/components/stats/stats-dashboard.tsx"
+      to: "evaluation-panel.tsx"
+      via: "Evaluation tab now visible for classification datasets"
+      pattern: "isClassification.*evaluation"
+    - from: "frontend/src/components/grid/grid-cell.tsx"
+      to: "annotations"
+      via: "Finding prediction annotation to display predicted label"
+      pattern: "source.*ground_truth.*prediction"
+---
+
+<objective>
+Build frontend classification evaluation UI: prediction import format option, evaluation panel with metrics/confusion matrix/per-class table, error analysis panel, and GT vs predicted badges on grid.
+
+Purpose: Give users a complete classification evaluation experience -- importing predictions, viewing metrics (accuracy, F1), exploring the confusion matrix, analyzing errors, and seeing GT vs predicted labels on thumbnails.
+
+Output: Updated prediction import dialog, classification evaluation panel, classification error analysis, and grid cell predicted label badges.
+</objective>
+
+<execution_context>
+@/Users/ortizeg/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/ortizeg/.claude/get-shit-done/templates/summary.md
+</execution_context>
+
+<context>
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/16-classification-evaluation/16-RESEARCH.md
+@.planning/phases/16-classification-evaluation/16-01-SUMMARY.md
+@.planning/phases/15-classification-ingestion-display/15-02-SUMMARY.md
+
+Key existing files to reference:
+@frontend/src/types/evaluation.ts              -- Add classification response type
+@frontend/src/types/prediction.ts              -- Add classification_jsonl format
+@frontend/src/hooks/use-evaluation.ts          -- Evaluation query hook
+@frontend/src/hooks/use-confusion-cell.ts      -- Confusion cell click handler
+@frontend/src/components/detail/prediction-import-dialog.tsx  -- Add format option
+@frontend/src/components/stats/stats-dashboard.tsx  -- Un-hide tabs for classification
+@frontend/src/components/stats/evaluation-panel.tsx -- Branch for classification
+@frontend/src/components/stats/error-analysis-panel.tsx  -- Branch for classification
+@frontend/src/components/stats/confusion-matrix.tsx      -- Reusable, no changes needed
+@frontend/src/components/stats/metrics-cards.tsx         -- Reusable for metric cards
+@frontend/src/components/stats/per-class-table.tsx       -- Detection version, reference
+@frontend/src/components/grid/grid-cell.tsx              -- Add predicted label badge
+</context>
+
+<tasks>
+
+<task type="auto">
+  <name>Task 1: Types, hooks, prediction import dialog, and grid predicted label badge</name>
+  <files>
+    frontend/src/types/evaluation.ts
+    frontend/src/types/prediction.ts
+    frontend/src/hooks/use-evaluation.ts
+    frontend/src/hooks/use-confusion-cell.ts
+    frontend/src/components/detail/prediction-import-dialog.tsx
+    frontend/src/components/grid/grid-cell.tsx
+  </files>
+  <action>
+    1. Update `frontend/src/types/evaluation.ts`:
+       - Add `ClassificationPerClassMetrics` interface: `class_name: string`, `precision: number`, `recall: number`, `f1: number`, `support: number`.
+       - Add `ClassificationEvaluationResponse` interface: `accuracy: number`, `macro_f1: number`, `weighted_f1: number`, `per_class_metrics: ClassificationPerClassMetrics[]`, `confusion_matrix: number[][]`, `confusion_matrix_labels: string[]`, `conf_threshold: number`, `evaluation_type: "classification"`.
+       - Add `evaluation_type?: "detection"` to the existing `EvaluationResponse` for type discrimination.
+       - Export a union type: `export type AnyEvaluationResponse = EvaluationResponse | ClassificationEvaluationResponse;`
+
+    2. Update `frontend/src/types/prediction.ts`:
+       - Add `"classification_jsonl"` to the `format` union: `format: "coco" | "detection_annotation" | "classification_jsonl"`
+
+    3. Update `frontend/src/hooks/use-evaluation.ts`:
+       - Change the generic type from `EvaluationResponse` to `AnyEvaluationResponse` in the `apiFetch` call.
+       - This allows the hook to return either detection or classification evaluation data transparently.
+
+    4. Update `frontend/src/hooks/use-confusion-cell.ts`:
+       - No code change is expected here: the hook may keep passing `iou_threshold`, because the backend accepts but ignores it for classification datasets. Just verify that the confusion-cell fetch still works for a classification dataset.
+
+    5. Update `frontend/src/components/detail/prediction-import-dialog.tsx`:
+       - Add `{ value: "classification_jsonl", label: "Classification JSONL" }` to `FORMAT_OPTIONS`.
+       - Update the input label from "Prediction Directory" to dynamically change based on format: if `format === "classification_jsonl"` show "Prediction File", else show "Prediction Path" (classification JSONL is a single file, not a directory).
+
+    6. Update `frontend/src/components/grid/grid-cell.tsx`:
+       - In the classification branch (where `ClassBadge` is rendered), also find the first prediction annotation: `annotations.find(a => a.source !== "ground_truth")`.
+       - If a prediction annotation exists, render a small badge at bottom-right of the thumbnail:
+         - Green background (`bg-green-500/80 text-white`) if predicted label matches GT label.
+         - Red background (`bg-red-500/80 text-white`) if predicted label differs from GT label.
+         - Show the predicted class name in the badge.
+         - Style: `absolute bottom-1 right-1 z-10 rounded px-1.5 py-0.5 text-[10px] font-semibold`.
+  </action>
+  <verify>
+    - `cd frontend && npx tsc --noEmit` passes with no type errors
+    - Prediction import dialog renders 3 format options (visual check via dev server)
+  </verify>
+  <done>
+    - ClassificationEvaluationResponse type exists with all required fields
+    - PredictionImportRequest accepts classification_jsonl format
+    - useEvaluation hook returns AnyEvaluationResponse (works for both dataset types)
+    - Prediction import dialog shows "Classification JSONL" format option
+    - Grid cells show predicted class badge (green=correct, red=mismatch) alongside GT badge
+  </done>
+</task>
+
+<task type="auto">
+  <name>Task 2: Classification evaluation panel and error analysis panel</name>
+  <files>
+    frontend/src/components/stats/stats-dashboard.tsx
+    frontend/src/components/stats/evaluation-panel.tsx
+    frontend/src/components/stats/error-analysis-panel.tsx
+  </files>
+  <action>
+    1. Update `frontend/src/components/stats/stats-dashboard.tsx`:
+       - Remove the `{!isClassification && (` wrapper around the Evaluation tab so the tab is always shown. Keep the `disabled={!hasPredictions}` check.
+       - Change the Error Analysis tab guard similarly: always show, keep `disabled={!hasPredictions}`.
+       - Keep Worst Images and Intelligence tabs hidden for classification (`!isClassification`).
+       - Pass `datasetType` prop to `EvaluationPanel` and `ErrorAnalysisPanel`:
+         - `<EvaluationPanel datasetId={datasetId} split={split} excludedClasses={excludedClasses} datasetType={datasetType} />`
+         - `<ErrorAnalysisPanel datasetId={datasetId} split={split} datasetType={datasetType} />`
+
+    2. Update `frontend/src/components/stats/evaluation-panel.tsx`:
+       - Accept `datasetType?: string` prop.
+       - Add `const isClassification = datasetType === "classification";`
+       - **Controls section:** Hide the IoU slider when `isClassification` (classification has no IoU). Keep the confidence slider and source dropdown visible.
+       - **Data rendering:** After fetching evaluation data, check `evaluation_type`:
+         - If `evaluation_type === "classification"` (or `isClassification`), render classification layout:
+           - **Metric cards row (3 cards):** Accuracy, Macro F1, Weighted F1. Use the existing `MetricsCards` component pattern or render 3 simple stat cards inline: `<div className="grid grid-cols-3 gap-4">` with each card showing label + percentage value formatted to 1 decimal.
+           - **Confusion matrix:** Reuse existing `<ConfusionMatrix>` component unchanged -- it takes `matrix` and `labels` props generically. Wire `onCellClick` the same way as detection (calls `fetchConfusionCellSamples` -> sets filter store sample IDs).
+           - **Per-class table:** Render a table with columns: Class, Precision, Recall, F1, Support. Map over `per_class_metrics`. Use the same styling as the existing `PerClassTable` component. Either reuse `PerClassTable` with a wrapper or render the table inline (the classification fields differ from detection: there are no AP columns).
+         - If detection (no `evaluation_type` or `evaluation_type === "detection"`), render existing detection layout unchanged.
+
+    3. Update `frontend/src/components/stats/error-analysis-panel.tsx`:
+       - Accept `datasetType?: string` prop.
+       - Add `const isClassification = datasetType === "classification";`
+       - **Controls:** Hide IoU slider for classification. Keep confidence slider and source dropdown.
+       - **Summary cards:** When classification, show 3 cards: Correct (green), Misclassified (red/amber), Missing Prediction (orange). Map from the error response: `true_positives` = correct count, `label_errors` = misclassified count, `false_negatives` = missing prediction count.
+       - **Bar chart:** When classification, show bars for Correct/Misclassified/Missing instead of TP/Hard FP/Label Error/FN. Use same `Recharts` bar chart pattern.
+       - **Sample grid:** When classification, show error samples grouped by type (correct, misclassified, missing_prediction) using same thumbnail grid pattern as detection error analysis.
+       - For detection datasets, render everything exactly as before (no changes to existing behavior).
+  </action>
+  <verify>
+    - `cd frontend && npx tsc --noEmit` passes with no type errors
+    - `cd frontend && npm run build` succeeds
+  </verify>
+  <done>
+    - Evaluation tab visible for classification datasets (it was hidden in Phase 15)
+    - Error Analysis tab visible for classification datasets (it was hidden in Phase 15)
+    - Worst Images and Intelligence tabs remain hidden for classification
+    - Classification evaluation panel shows accuracy, macro F1, weighted F1 cards + confusion matrix + per-class table
+    - IoU slider hidden for classification in both evaluation and error analysis panels
+    - Classification error analysis shows correct/misclassified/missing categories
+    - Detection evaluation and error analysis completely unchanged
+  </done>
+</task>
+
+</tasks>
+
+<verification>
+1. Import predictions: User selects "Classification JSONL" in import dialog, browses to a .jsonl file, imports successfully
+2. Evaluation tab: Shows for classification datasets with predictions; displays accuracy, F1, confusion matrix, per-class table
+3. Confusion matrix click: Clicking a cell filters the grid to images with that GT/predicted pair
+4. Error analysis: Shows correct/misclassified/missing categories with summary cards and sample grid
+5. Grid badges: Classification thumbnails show GT badge (top) and predicted badge (bottom-right, green/red)
+6. Detection regression: All detection evaluation/error analysis features work exactly as before
+7. No IoU slider: Classification evaluation and error analysis panels hide the IoU threshold slider
+</verification>
+
+<success_criteria>
+- Classification prediction import dialog works with new format option
+- Evaluation panel renders classification-specific metrics (accuracy, F1, confusion matrix, per-class P/R/F1)
+- Error analysis panel renders classification-specific categories (correct, misclassified, missing)
+- Confusion matrix click-to-filter works for classification
+- Grid shows GT + predicted badges for classification with color coding
+- Detection dataset evaluation is completely unaffected
+</success_criteria>
+
+<output>
+After completion, create `.planning/phases/16-classification-evaluation/16-02-SUMMARY.md`
+</output>

From 76bda2a24a18c424a9078cbb87d1f2ee83675cb8 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:26:21 -0500
Subject: [PATCH 20/38] feat(16-01): add classification prediction parser and
 import endpoint

- Create ClassificationPredictionParser with flexible key lookup for filename, label, confidence
- Add classification_jsonl format to PredictionImportRequest
- Route classification_jsonl predictions through new parser in import_predictions
- Handle classification_jsonl in run_name derivation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../classification_prediction_parser.py       | 152 ++++++++++++++++++
 app/models/prediction.py                      |   2 +-
 app/routers/_run_name.py                      |   2 +
 app/routers/datasets.py                       |  35 +++-
 4 files changed, 189 insertions(+), 2 deletions(-)
 create mode 100644 app/ingestion/classification_prediction_parser.py

diff --git a/app/ingestion/classification_prediction_parser.py b/app/ingestion/classification_prediction_parser.py
new file mode 100644
index 0000000..96a3a26
--- /dev/null
+++ b/app/ingestion/classification_prediction_parser.py
@@ -0,0 +1,152 @@
+"""Streaming Classification JSONL prediction parser.
+
+Parses JSONL files where each line maps an image filename to a
+predicted classification label with confidence score.  Uses sentinel
+bbox values (all zeros) since classification has no spatial localisation.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import uuid
+from collections.abc import Iterator
+from pathlib import Path
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+# Flexible key lookup order for the image filename field.
+_FILENAME_KEYS = ("filename", "file_name", "image", "path")
+
+# Flexible key lookup order for the predicted label field.
+_LABEL_KEYS = (
+    "label",
+    "class",
+    "category",
+    "class_name",
+    "predicted_label",
+    "prediction",
+)
+
+# Flexible key lookup order for the confidence/score field.
+_CONFIDENCE_KEYS = ("confidence", "score", "probability", "prob")
+
+
+def _get_field(
+    record: dict, keys: tuple[str, ...], default: str | float | None = None
+) -> str | float | None:
+    """Return the first matching key's value from *record*."""
+    for k in keys:
+        if k in record:
+            return record[k]
+    return default
+
+
+class ClassificationPredictionParser:
+    """Stream-parse classification JSONL predictions and yield annotation DataFrames.
+
+    Each line of the JSONL file is a JSON object with at minimum a
+    filename field, a label field, and an optional confidence field.
+    """
+
+    def __init__(self, batch_size: int = 5000) -> None:
+        self.batch_size = batch_size
+
+    def parse_streaming(
+        self,
+        file_path: Path,
+        sample_lookup: dict[str, str],
+        dataset_id: str,
+        source: str = "prediction",
+        batch_size: int | None = None,
+    ) -> Iterator[pd.DataFrame]:
+        """Yield DataFrames of prediction rows matching the annotations schema.
+
+        Parameters
+        ----------
+        file_path:
+            Path to a classification JSONL predictions file.
+        sample_lookup:
+            Mapping of ``filename`` -> ``sample_id`` built from the samples table.
+        dataset_id:
+            The dataset these predictions belong to.
+        source:
+            The run name / source label for these predictions.
+        batch_size:
+            Override instance batch_size if provided.
+
+        Yields
+        ------
+        pd.DataFrame
+            Batches with columns matching the annotations table:
+            ``id, dataset_id, sample_id, category_name, bbox_x, bbox_y,
+            bbox_w, bbox_h, area, is_crowd, source, confidence, metadata``.
+        """
+        effective_batch_size = batch_size or self.batch_size
+        batch: list[dict] = []
+        skipped = 0
+
+        with open(file_path, encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    record = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+
+                filename = _get_field(record, _FILENAME_KEYS)
+                if filename is None:
+                    skipped += 1
+                    continue
+
+                sample_id = sample_lookup.get(str(filename))
+                if sample_id is None:
+                    skipped += 1
+                    continue
+
+                label = _get_field(record, _LABEL_KEYS)
+                if label is None:
+                    skipped += 1
+                    continue
+
+                confidence = _get_field(record, _CONFIDENCE_KEYS)
+                try:
+                    confidence = float(confidence) if confidence is not None else None
+                except (ValueError, TypeError):
+                    confidence = None
+
+                batch.append(
+                    {
+                        "id": str(uuid.uuid4()),
+                        "dataset_id": dataset_id,
+                        "sample_id": sample_id,
+                        "category_name": str(label),
+                        "bbox_x": 0.0,
+                        "bbox_y": 0.0,
+                        "bbox_w": 0.0,
+                        "bbox_h": 0.0,
+                        "area": 0.0,
+                        "is_crowd": False,
+                        "source": source,
+                        "confidence": confidence,
+                        "metadata": None,
+                    }
+                )
+
+                if len(batch) >= effective_batch_size:
+                    yield pd.DataFrame(batch)
+                    batch = []
+
+        if batch:
+            yield pd.DataFrame(batch)
+
+        if skipped > 0:
+            logger.info(
+                "Classification prediction import: skipped %d lines "
+                "(no filename, no sample match, or no label)",
+                skipped,
+            )
diff --git a/app/models/prediction.py b/app/models/prediction.py
index ebc2933..c159795 100644
--- a/app/models/prediction.py
+++ b/app/models/prediction.py
@@ -23,7 +23,7 @@ class PredictionImportRequest(BaseModel):
     """
 
     prediction_path: str
-    format: Literal["coco", "detection_annotation"] = "coco"
+    format: Literal["coco", "detection_annotation", "classification_jsonl"] = "coco"
     run_name: str | None = None
 
 
diff --git a/app/routers/_run_name.py b/app/routers/_run_name.py
index c5d7dad..30ee3d1 100644
--- a/app/routers/_run_name.py
+++ b/app/routers/_run_name.py
@@ -22,6 +22,8 @@ def derive_run_name(prediction_path: Path, fmt: str) -> str:
 
     Falls back to ``"prediction"`` when metadata cannot be extracted.
     """
+    if fmt == "classification_jsonl":
+        return _from_coco(prediction_path)
     if fmt == "detection_annotation":
         return _from_detection_annotation(prediction_path)
     return _from_coco(prediction_path)
diff --git a/app/routers/datasets.py b/app/routers/datasets.py
index 9c9cba7..99ae72f 100644
--- a/app/routers/datasets.py
+++ b/app/routers/datasets.py
@@ -18,6 +18,7 @@
 from fastapi.responses import StreamingResponse
 
 from app.dependencies import get_db, get_image_service, get_ingestion_service, get_similarity_service
+from app.ingestion.classification_prediction_parser import ClassificationPredictionParser
 from app.ingestion.detection_annotation_parser import DetectionAnnotationParser
 from app.ingestion.prediction_parser import PredictionParser
 from app.models.dataset import (
@@ -187,7 +188,39 @@ def import_predictions(
         total_inserted = 0
         total_skipped = 0
 
-        if request.format == "detection_annotation":
+        if request.format == "classification_jsonl":
+            # --- Classification JSONL predictions (one label per image) ---
+            if not prediction_path.is_file():
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Expected a JSONL file for classification_jsonl format: {request.prediction_path}",
+                )
+
+            # Build sample lookup: filename -> sample_id
+            sample_rows = cursor.execute(
+                "SELECT id, file_name FROM samples WHERE dataset_id = ?",
+                [dataset_id],
+            ).fetchall()
+            sample_lookup: dict[str, str] = {r[1]: r[0] for r in sample_rows}
+
+            parser_cls = ClassificationPredictionParser()
+            for batch_df in parser_cls.parse_streaming(
+                file_path=prediction_path,
+                sample_lookup=sample_lookup,
+                dataset_id=dataset_id,
+                source=run_name,
+            ):
+                cursor.execute(
+                    "INSERT INTO annotations SELECT * FROM batch_df"
+                )
+                total_inserted += len(batch_df)
+
+            # Count total lines to compute skipped
+            with open(prediction_path, encoding="utf-8") as f:
+                file_total = sum(1 for line in f if line.strip())
+            total_skipped = file_total - total_inserted
+
+        elif request.format == "detection_annotation":
             # --- DetectionAnnotation format (directory of per-image JSONs) ---
             if not prediction_path.is_dir():
                 raise HTTPException(

From 9f38741001c24a3f571cc95e620d204bb60c412b Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:28:09 -0500
Subject: [PATCH 21/38] feat(16-01): add classification evaluation, confusion
 cell, and error analysis services

- Create ClassificationEvaluationResponse and ClassificationPerClassMetrics models
- Implement compute_classification_evaluation with accuracy, F1, confusion matrix
- Implement get_classification_confusion_cell_samples without IoU matching
- Implement classify_errors for correct/misclassified/missing categorization
- Route evaluation, confusion-cell-samples, and error-analysis endpoints by dataset_type
- Detection evaluation paths remain completely untouched

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 app/models/classification_evaluation.py       |  30 +++
 app/routers/statistics.py                     |  64 ++++--
 app/services/classification_error_analysis.py | 161 ++++++++++++++
 app/services/classification_evaluation.py     | 204 ++++++++++++++++++
 4 files changed, 442 insertions(+), 17 deletions(-)
 create mode 100644 app/models/classification_evaluation.py
 create mode 100644 app/services/classification_error_analysis.py
 create mode 100644 app/services/classification_evaluation.py

diff --git a/app/models/classification_evaluation.py b/app/models/classification_evaluation.py
new file mode 100644
index 0000000..16c98e0
--- /dev/null
+++ b/app/models/classification_evaluation.py
@@ -0,0 +1,30 @@
+"""Response models for the classification evaluation endpoint."""
+
+from pydantic import BaseModel
+
+
+class ClassificationPerClassMetrics(BaseModel):
+    """Per-class precision, recall, F1, and support for classification."""
+
+    class_name: str
+    precision: float
+    recall: float
+    f1: float
+    support: int
+
+
+class ClassificationEvaluationResponse(BaseModel):
+    """Full classification evaluation payload.
+
+    Returned by GET /datasets/{id}/evaluation when dataset_type is
+    ``classification``.
+    """
+
+    accuracy: float
+    macro_f1: float
+    weighted_f1: float
+    per_class_metrics: list[ClassificationPerClassMetrics]
+    confusion_matrix: list[list[int]]
+    confusion_matrix_labels: list[str]
+    conf_threshold: float
+    evaluation_type: str = "classification"
diff --git a/app/routers/statistics.py b/app/routers/statistics.py
index 0d3a978..8a6e4e7 100644
--- a/app/routers/statistics.py
+++ b/app/routers/statistics.py
@@ -11,8 +11,14 @@
 from fastapi import APIRouter, Depends, HTTPException, Query
 
 from app.dependencies import get_db
+from app.models.classification_evaluation import ClassificationEvaluationResponse
 from app.models.error_analysis import ErrorAnalysisResponse
 from app.models.evaluation import ConfusionCellSamplesResponse, EvaluationResponse
+from app.services.classification_error_analysis import classify_errors as classify_classification_errors
+from app.services.classification_evaluation import (
+    compute_classification_evaluation,
+    get_classification_confusion_cell_samples,
+)
 from app.models.statistics import (
     ClassDistribution,
     DatasetStatistics,
@@ -136,7 +142,7 @@ def get_dataset_statistics(
     )
 
 
-@router.get("/{dataset_id}/evaluation", response_model=EvaluationResponse)
+@router.get("/{dataset_id}/evaluation")
 def get_evaluation(
     dataset_id: str,
     source: str = Query("prediction"),
@@ -144,19 +150,20 @@ def get_evaluation(
     conf_threshold: float = Query(0.25, ge=0.0, le=1.0),
     split: str | None = Query(None),
     db: DuckDBRepo = Depends(get_db),
-) -> EvaluationResponse:
+) -> EvaluationResponse | ClassificationEvaluationResponse:
     """Return evaluation metrics comparing predictions to ground truth.
 
-    Computes PR curves, mAP@50/75/50:95, confusion matrix, and per-class
-    precision/recall at the given IoU and confidence thresholds.
+    For detection datasets: PR curves, mAP@50/75/50:95, confusion matrix.
+    For classification datasets: accuracy, F1, confusion matrix, per-class P/R/F1.
     """
     cursor = db.connection.cursor()
     try:
         row = cursor.execute(
-            "SELECT id FROM datasets WHERE id = ?", [dataset_id]
+            "SELECT id, dataset_type FROM datasets WHERE id = ?", [dataset_id]
         ).fetchone()
         if row is None:
             raise HTTPException(status_code=404, detail="Dataset not found")
+        dataset_type = row[1] or "detection"
 
         # Verify that the requested source has annotations
         source_count = cursor.execute(
@@ -169,6 +176,11 @@ def get_evaluation(
                 detail=f"No annotations found for source '{source}'",
             )
 
+        if dataset_type == "classification":
+            return compute_classification_evaluation(
+                cursor, dataset_id, source, conf_threshold, split=split
+            )
+
         return compute_evaluation(
             cursor, dataset_id, source, iou_threshold, conf_threshold, split=split
         )
@@ -198,21 +210,33 @@ def get_confusion_cell_samples_endpoint(
     cursor = db.connection.cursor()
     try:
         row = cursor.execute(
-            "SELECT id FROM datasets WHERE id = ?", [dataset_id]
+            "SELECT id, dataset_type FROM datasets WHERE id = ?", [dataset_id]
         ).fetchone()
         if row is None:
             raise HTTPException(status_code=404, detail="Dataset not found")
+        dataset_type = row[1] or "detection"
 
-        sample_ids = get_confusion_cell_samples(
-            cursor,
-            dataset_id,
-            source,
-            actual_class,
-            predicted_class,
-            iou_threshold,
-            conf_threshold,
-            split=split,
-        )
+        if dataset_type == "classification":
+            sample_ids = get_classification_confusion_cell_samples(
+                cursor,
+                dataset_id,
+                source,
+                actual_class,
+                predicted_class,
+                conf_threshold,
+                split=split,
+            )
+        else:
+            sample_ids = get_confusion_cell_samples(
+                cursor,
+                dataset_id,
+                source,
+                actual_class,
+                predicted_class,
+                iou_threshold,
+                conf_threshold,
+                split=split,
+            )
 
         return ConfusionCellSamplesResponse(
             actual_class=actual_class,
@@ -241,10 +265,11 @@ def get_error_analysis(
     cursor = db.connection.cursor()
     try:
         row = cursor.execute(
-            "SELECT id FROM datasets WHERE id = ?", [dataset_id]
+            "SELECT id, dataset_type FROM datasets WHERE id = ?", [dataset_id]
         ).fetchone()
         if row is None:
             raise HTTPException(status_code=404, detail="Dataset not found")
+        dataset_type = row[1] or "detection"
 
         # Verify that the requested source has annotations
         source_count = cursor.execute(
@@ -257,6 +282,11 @@ def get_error_analysis(
                 detail=f"No annotations found for source '{source}'",
             )
 
+        if dataset_type == "classification":
+            return classify_classification_errors(
+                cursor, dataset_id, source, conf_threshold, split=split
+            )
+
         return categorize_errors(
             cursor, dataset_id, source, iou_threshold, conf_threshold, split=split
         )
diff --git a/app/services/classification_error_analysis.py b/app/services/classification_error_analysis.py
new file mode 100644
index 0000000..2d0dc96
--- /dev/null
+++ b/app/services/classification_error_analysis.py
@@ -0,0 +1,161 @@
+"""Classification error analysis service.
+
+Categorises each sample as correct, misclassified, or missing_prediction
+by comparing ground-truth labels to predicted labels.  Reuses the existing
+``ErrorAnalysisResponse`` model from the detection pipeline.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+
+from app.models.error_analysis import (
+    ErrorAnalysisResponse,
+    ErrorSample,
+    ErrorSummary,
+    PerClassErrors,
+)
+
+
+def classify_errors(
+    cursor,
+    dataset_id: str,
+    source: str,
+    conf_threshold: float,
+    split: str | None = None,
+) -> ErrorAnalysisResponse:
+    """Categorise classification predictions as correct/misclassified/missing.
+
+    Parameters
+    ----------
+    cursor:
+        DuckDB cursor.
+    dataset_id:
+        The dataset to analyse.
+    source:
+        Prediction source (run name).
+    conf_threshold:
+        Minimum confidence for predictions.
+    split:
+        Optional split filter.
+
+    Returns
+    -------
+    ErrorAnalysisResponse
+        Summary counts, per-class breakdown, and samples grouped by error type.
+    """
+    split_clause = "AND s.split = ?" if split else ""
+    params: list = [source, conf_threshold, dataset_id]
+    if split:
+        params.append(split)
+
+    query = f"""
+        SELECT
+            s.id as sample_id,
+            MIN(gt.category_name) as gt_label,
+            pred.category_name as pred_label,
+            pred.confidence as pred_confidence
+        FROM samples s
+        JOIN annotations gt
+            ON gt.sample_id = s.id
+            AND gt.dataset_id = s.dataset_id
+            AND gt.source = 'ground_truth'
+        LEFT JOIN annotations pred
+            ON pred.sample_id = s.id
+            AND pred.dataset_id = s.dataset_id
+            AND pred.source = ?
+            AND (pred.confidence >= ? OR pred.confidence IS NULL)
+        WHERE s.dataset_id = ?
+        {split_clause}
+        GROUP BY s.id, pred.category_name, pred.confidence
+    """
+
+    rows = cursor.execute(query, params).fetchall()
+
+    # Categorise each sample
+    correct_count = 0
+    misclassified_count = 0
+    missing_count = 0
+
+    # Per-class tracking
+    class_tp: dict[str, int] = defaultdict(int)
+    class_label_error: dict[str, int] = defaultdict(int)
+    class_fn: dict[str, int] = defaultdict(int)
+    all_classes: set[str] = set()
+
+    # Samples by error type
+    samples_correct: list[ErrorSample] = []
+    samples_misclassified: list[ErrorSample] = []
+    samples_missing: list[ErrorSample] = []
+
+    for sample_id, gt_label, pred_label, pred_confidence in rows:
+        all_classes.add(gt_label)
+
+        if pred_label is None:
+            # No prediction for this sample
+            missing_count += 1
+            class_fn[gt_label] += 1
+            samples_missing.append(
+                ErrorSample(
+                    sample_id=sample_id,
+                    error_type="missing_prediction",
+                    category_name=gt_label,
+                    confidence=None,
+                )
+            )
+        elif gt_label == pred_label:
+            # Correct prediction
+            correct_count += 1
+            class_tp[gt_label] += 1
+            all_classes.add(pred_label)
+            samples_correct.append(
+                ErrorSample(
+                    sample_id=sample_id,
+                    error_type="correct",
+                    category_name=gt_label,
+                    confidence=pred_confidence,
+                )
+            )
+        else:
+            # Misclassified
+            misclassified_count += 1
+            class_label_error[gt_label] += 1
+            all_classes.add(pred_label)
+            samples_misclassified.append(
+                ErrorSample(
+                    sample_id=sample_id,
+                    error_type="misclassified",
+                    category_name=gt_label,
+                    confidence=pred_confidence,
+                )
+            )
+
+    summary = ErrorSummary(
+        true_positives=correct_count,
+        hard_false_positives=0,
+        label_errors=misclassified_count,
+        false_negatives=missing_count,
+    )
+
+    per_class = [
+        PerClassErrors(
+            class_name=cls,
+            tp=class_tp.get(cls, 0),
+            hard_fp=0,
+            label_error=class_label_error.get(cls, 0),
+            fn=class_fn.get(cls, 0),
+        )
+        for cls in sorted(all_classes)
+    ]
+
+    samples_by_type = {
+        "correct": samples_correct,
+        "misclassified": samples_misclassified,
+        "missing_prediction": samples_missing,
+    }
+
+    return ErrorAnalysisResponse(
+        summary=summary,
+        per_class=per_class,
+        samples_by_type=samples_by_type,
+    )
diff --git a/app/services/classification_evaluation.py b/app/services/classification_evaluation.py
new file mode 100644
index 0000000..86036a8
--- /dev/null
+++ b/app/services/classification_evaluation.py
@@ -0,0 +1,204 @@
+"""Classification evaluation service.
+
+Computes accuracy, F1, confusion matrix, and per-class precision/recall/F1
+by comparing ground-truth labels to predicted labels per sample.
+No IoU matching is needed -- classification is a direct label comparison.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+
+from app.models.classification_evaluation import (
+    ClassificationEvaluationResponse,
+    ClassificationPerClassMetrics,
+)
+from app.models.evaluation import ConfusionCellSamplesResponse
+
+
+def compute_classification_evaluation(
+    cursor,
+    dataset_id: str,
+    source: str,
+    conf_threshold: float,
+    split: str | None = None,
+) -> ClassificationEvaluationResponse:
+    """Compute classification evaluation metrics.
+
+    Parameters
+    ----------
+    cursor:
+        DuckDB cursor.
+    dataset_id:
+        The dataset to evaluate.
+    source:
+        Prediction source (run name).
+    conf_threshold:
+        Minimum confidence for predictions.
+    split:
+        Optional split filter (train/val/test).
+
+    Returns
+    -------
+    ClassificationEvaluationResponse
+        Accuracy, F1 scores, per-class metrics, and confusion matrix.
+    """
+    # Query GT and prediction labels per sample.
+    # For multi-label GT, take MIN(gt.category_name) to get one label per sample.
+    split_clause = "AND s.split = ?" if split else ""
+    params: list = [source, conf_threshold, dataset_id]
+    if split:
+        params.append(split)
+
+    query = f"""
+        SELECT
+            s.id as sample_id,
+            MIN(gt.category_name) as gt_label,
+            pred.category_name as pred_label,
+            pred.confidence as pred_confidence
+        FROM samples s
+        JOIN annotations gt
+            ON gt.sample_id = s.id
+            AND gt.dataset_id = s.dataset_id
+            AND gt.source = 'ground_truth'
+        LEFT JOIN annotations pred
+            ON pred.sample_id = s.id
+            AND pred.dataset_id = s.dataset_id
+            AND pred.source = ?
+            AND (pred.confidence >= ? OR pred.confidence IS NULL)
+        WHERE s.dataset_id = ?
+        {split_clause}
+        GROUP BY s.id, pred.category_name, pred.confidence
+    """
+
+    rows = cursor.execute(query, params).fetchall()
+
+    # Build confusion counts: (gt_label, pred_label) -> count
+    confusion_counts: dict[tuple[str, str | None], int] = defaultdict(int)
+    all_classes: set[str] = set()
+
+    for _sample_id, gt_label, pred_label, _confidence in rows:
+        all_classes.add(gt_label)
+        if pred_label is not None:
+            all_classes.add(pred_label)
+        confusion_counts[(gt_label, pred_label)] += 1
+
+    labels = sorted(all_classes)
+    label_to_idx = {lbl: i for i, lbl in enumerate(labels)}
+    n = len(labels)
+
+    # Build confusion matrix (rows=actual, cols=predicted)
+    matrix = [[0] * n for _ in range(n)]
+    # Track missing predictions (pred_label is None)
+    missing_per_class: dict[str, int] = defaultdict(int)
+
+    for (gt_label, pred_label), count in confusion_counts.items():
+        if pred_label is None:
+            missing_per_class[gt_label] += count
+        else:
+            gt_idx = label_to_idx[gt_label]
+            pred_idx = label_to_idx[pred_label]
+            matrix[gt_idx][pred_idx] += count
+
+    # Compute metrics from confusion matrix
+    total = sum(sum(row) for row in matrix)
+    correct = sum(matrix[i][i] for i in range(n))
+    accuracy = correct / total if total > 0 else 0.0
+
+    per_class: list[ClassificationPerClassMetrics] = []
+    f1_scores: list[float] = []
+    supports: list[int] = []
+
+    for i, class_name in enumerate(labels):
+        tp = matrix[i][i]
+        # Support = total GT samples for this class (row sum + missing)
+        row_sum = sum(matrix[i])
+        support = row_sum + missing_per_class.get(class_name, 0)
+
+        # Precision: tp / (tp + fp), where fp = column sum - tp
+        col_sum = sum(matrix[r][i] for r in range(n))
+        precision = tp / col_sum if col_sum > 0 else 0.0
+
+        # Recall: tp / (tp + fn), where fn = row_sum - tp + missing
+        total_actual = row_sum + missing_per_class.get(class_name, 0)
+        recall = tp / total_actual if total_actual > 0 else 0.0
+
+        # F1
+        f1 = (
+            2 * precision * recall / (precision + recall)
+            if (precision + recall) > 0
+            else 0.0
+        )
+
+        per_class.append(
+            ClassificationPerClassMetrics(
+                class_name=class_name,
+                precision=round(precision, 4),
+                recall=round(recall, 4),
+                f1=round(f1, 4),
+                support=support,
+            )
+        )
+        f1_scores.append(f1)
+        supports.append(support)
+
+    # Macro F1: unweighted mean of per-class F1
+    macro_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
+
+    # Weighted F1: weighted by support
+    total_support = sum(supports)
+    weighted_f1 = (
+        sum(f * s for f, s in zip(f1_scores, supports)) / total_support
+        if total_support > 0
+        else 0.0
+    )
+
+    return ClassificationEvaluationResponse(
+        accuracy=round(accuracy, 4),
+        macro_f1=round(macro_f1, 4),
+        weighted_f1=round(weighted_f1, 4),
+        per_class_metrics=per_class,
+        confusion_matrix=matrix,
+        confusion_matrix_labels=labels,
+        conf_threshold=conf_threshold,
+    )
+
+
+def get_classification_confusion_cell_samples(
+    cursor,
+    dataset_id: str,
+    source: str,
+    actual_class: str,
+    predicted_class: str,
+    conf_threshold: float,
+    split: str | None = None,
+) -> list[str]:
+    """Return sample IDs for a (gt_class, pred_class) confusion matrix cell.
+
+    No IoU matching needed -- simple label comparison.
+    """
+    split_clause = "AND s.split = ?" if split else ""
+    params: list = [source, conf_threshold, actual_class, predicted_class, dataset_id]
+    if split:
+        params.append(split)
+
+    query = f"""
+        SELECT DISTINCT s.id
+        FROM samples s
+        JOIN annotations gt
+            ON gt.sample_id = s.id
+            AND gt.dataset_id = s.dataset_id
+            AND gt.source = 'ground_truth'
+        JOIN annotations pred
+            ON pred.sample_id = s.id
+            AND pred.dataset_id = s.dataset_id
+            AND pred.source = ?
+            AND (pred.confidence >= ? OR pred.confidence IS NULL)
+        WHERE gt.category_name = ?
+            AND pred.category_name = ?
+            AND s.dataset_id = ?
+            {split_clause}
+    """
+
+    rows = cursor.execute(query, params).fetchall()
+    return [r[0] for r in rows]

From 5d6c2eed780ff3009cf299b6f8a75a3b45e7f7fe Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:29:18 -0500
Subject: [PATCH 22/38] docs(16-01): complete classification evaluation backend
 plan

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/STATE.md                            |  17 +--
 .../16-01-SUMMARY.md                          | 107 ++++++++++++++++++
 2 files changed, 117 insertions(+), 7 deletions(-)
 create mode 100644 .planning/phases/16-classification-evaluation/16-01-SUMMARY.md

diff --git a/.planning/STATE.md b/.planning/STATE.md
index 1f206e1..03332cf 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -5,16 +5,16 @@
 See: .planning/PROJECT.md (updated 2026-02-18)
 
 **Core value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
-**Current focus:** Phase 15 - Classification Ingestion & Display
+**Current focus:** Phase 16 - Classification Evaluation
 
 ## Current Position
 
-Phase: 15 of 17 (Classification Ingestion & Display) -- COMPLETE
-Plan: 2 of 2 in current phase
-Status: Phase Complete
-Last activity: 2026-02-18 -- Completed 15-02 (Classification Frontend Display)
+Phase: 16 of 17 (Classification Evaluation)
+Plan: 1 of 2 in current phase
+Status: In Progress
+Last activity: 2026-02-18 -- Completed 16-01 (Classification Evaluation Backend)
 
-Progress: [############################] 93% (v1.0 + v1.1 complete, v1.2 phase 15 done)
+Progress: [#############################] 95% (v1.0 + v1.1 complete, v1.2 phase 16 plan 1 done)
 
 ## Performance Metrics
 
@@ -43,6 +43,9 @@ Recent decisions affecting current work:
 - Classification gt_annotations = COUNT(DISTINCT sample_id) for labeled images
 - [Phase 15]: Thread datasetType from page level, branch at component boundaries with isClassification flag
 - [Phase 15]: Hide detection-only stats tabs for classification (Evaluation, Error Analysis, Worst Images, Intelligence)
+- [Phase 16]: Reuse ErrorAnalysisResponse model from detection for classification error analysis
+- [Phase 16]: Route by dataset_type at endpoint level, keeping classification/detection services separate
+- [Phase 16]: Remove response_model on evaluation endpoint for union return type support
 
 ### Pending Todos
 
@@ -62,5 +65,5 @@ None.
 ## Session Continuity
 
 Last session: 2026-02-18
-Stopped at: Completed 15-02-PLAN.md (Classification Frontend Display) -- Phase 15 complete
+Stopped at: Completed 16-01-PLAN.md (Classification Evaluation Backend)
 Resume file: None
diff --git a/.planning/phases/16-classification-evaluation/16-01-SUMMARY.md b/.planning/phases/16-classification-evaluation/16-01-SUMMARY.md
new file mode 100644
index 0000000..71138f5
--- /dev/null
+++ b/.planning/phases/16-classification-evaluation/16-01-SUMMARY.md
@@ -0,0 +1,107 @@
+---
+phase: 16-classification-evaluation
+plan: 01
+subsystem: api
+tags: [classification, evaluation, confusion-matrix, f1, error-analysis, jsonl]
+
+requires:
+  - phase: 15-classification-ingestion-display
+    provides: classification JSONL GT parser, sentinel bbox pattern, dataset_type column
+provides:
+  - Classification prediction JSONL import via existing POST /datasets/{id}/predictions endpoint
+  - Classification evaluation service (accuracy, F1, confusion matrix, per-class P/R/F1)
+  - Classification confusion cell sample drill-down without IoU matching
+  - Classification error analysis (correct/misclassified/missing categorization)
+  - Dataset-type-aware routing in evaluation, confusion-cell, and error-analysis endpoints
+affects: [16-02, frontend-evaluation-tabs]
+
+tech-stack:
+  added: []
+  patterns: [dataset-type routing in statistics endpoints, sentinel bbox for classification predictions]
+
+key-files:
+  created:
+    - app/ingestion/classification_prediction_parser.py
+    - app/models/classification_evaluation.py
+    - app/services/classification_evaluation.py
+    - app/services/classification_error_analysis.py
+  modified:
+    - app/models/prediction.py
+    - app/routers/datasets.py
+    - app/routers/_run_name.py
+    - app/routers/statistics.py
+
+key-decisions:
+  - "Reuse ErrorAnalysisResponse model from detection for classification error analysis (same shape works)"
+  - "Route by dataset_type at endpoint level rather than service level -- keeps services focused"
+  - "Remove response_model constraint on evaluation endpoint to support union return type"
+
+patterns-established:
+  - "Dataset-type routing: fetch dataset_type in endpoint, branch to classification vs detection service"
+  - "Classification prediction parser: flexible key lookup matching GT parser pattern"
+
+duration: 3min
+completed: 2026-02-18
+---
+
+# Phase 16 Plan 01: Classification Evaluation Backend Summary
+
+**Classification prediction import, evaluation metrics (accuracy/F1/confusion matrix), and error analysis services with dataset-type-aware endpoint routing**
+
+## Performance
+
+- **Duration:** 3 min
+- **Started:** 2026-02-19T03:25:01Z
+- **Completed:** 2026-02-19T03:28:16Z
+- **Tasks:** 2
+- **Files modified:** 8
+
+## Accomplishments
+- Classification predictions importable via POST /datasets/{id}/predictions with format=classification_jsonl
+- Evaluation endpoint returns accuracy, macro/weighted F1, per-class P/R/F1, and confusion matrix for classification datasets
+- Confusion cell drill-down works without IoU matching for classification
+- Error analysis categorizes samples as correct/misclassified/missing_prediction
+- All existing detection evaluation paths remain completely untouched
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Classification prediction parser and import endpoint** - `76bda2a` (feat)
+2. **Task 2: Classification evaluation and error analysis services with endpoint routing** - `9f38741` (feat)
+
+## Files Created/Modified
+- `app/ingestion/classification_prediction_parser.py` - Streaming JSONL prediction parser with flexible key lookup and sentinel bbox values
+- `app/models/classification_evaluation.py` - ClassificationEvaluationResponse and ClassificationPerClassMetrics Pydantic models
+- `app/services/classification_evaluation.py` - Accuracy, F1, confusion matrix computation and confusion cell sample lookup
+- `app/services/classification_error_analysis.py` - Correct/misclassified/missing categorization reusing ErrorAnalysisResponse model
+- `app/models/prediction.py` - Added classification_jsonl to format Literal type
+- `app/routers/datasets.py` - Added classification_jsonl branch in import_predictions
+- `app/routers/_run_name.py` - Added classification_jsonl run name derivation
+- `app/routers/statistics.py` - Dataset-type routing for evaluation, confusion-cell-samples, and error-analysis endpoints
+
+## Decisions Made
+- Reused ErrorAnalysisResponse model from detection for classification error analysis -- same shape (summary + per_class + samples_by_type) works for both
+- Route by dataset_type at endpoint level rather than service level -- keeps classification and detection services cleanly separated
+- Removed response_model constraint on evaluation endpoint to support EvaluationResponse | ClassificationEvaluationResponse union return
+
+## Deviations from Plan
+
+None - plan executed exactly as written.
+
+## Issues Encountered
+
+None.
+
+## User Setup Required
+
+None - no external service configuration required.
+
+## Next Phase Readiness
+- All backend services ready for classification evaluation frontend (Plan 16-02)
+- Endpoints return correct response shapes for classification datasets
+- Detection datasets continue working exactly as before
+
+---
+*Phase: 16-classification-evaluation*
+*Completed: 2026-02-18*

From 7dca67ef25cc197b3c7f2ba1e905c136de2d0045 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:31:14 -0500
Subject: [PATCH 23/38] feat(16-02): add classification types, prediction
 import format, and grid predicted label badge

- Add ClassificationEvaluationResponse and AnyEvaluationResponse union type
- Add classification_jsonl format to PredictionImportRequest
- Update useEvaluation hook to return AnyEvaluationResponse
- Update useFilteredEvaluation to pass through classification responses
- Add Classification JSONL option to prediction import dialog
- Add predicted class badge (green=correct, red=mismatch) on grid cells

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../detail/prediction-import-dialog.tsx       |  3 ++-
 frontend/src/components/grid/grid-cell.tsx    | 17 +++++++++++++-
 frontend/src/hooks/use-evaluation.ts          |  4 ++--
 frontend/src/hooks/use-filtered-evaluation.ts | 11 ++++++----
 frontend/src/types/evaluation.ts              | 22 +++++++++++++++++++
 frontend/src/types/prediction.ts              |  2 +-
 6 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/frontend/src/components/detail/prediction-import-dialog.tsx b/frontend/src/components/detail/prediction-import-dialog.tsx
index 9892db7..c3839da 100644
--- a/frontend/src/components/detail/prediction-import-dialog.tsx
+++ b/frontend/src/components/detail/prediction-import-dialog.tsx
@@ -17,6 +17,7 @@ interface PredictionImportDialogProps {
 const FORMAT_OPTIONS: { value: Format; label: string }[] = [
   { value: "detection_annotation", label: "Detection Annotation" },
   { value: "coco", label: "COCO Results" },
+  { value: "classification_jsonl", label: "Classification JSONL" },
 ];
 
 export function PredictionImportDialog({
@@ -74,7 +75,7 @@ export function PredictionImportDialog({
           {/* Path input */}
           <div>
             <label className="mb-1 block text-xs font-medium text-zinc-700 dark:text-zinc-300">
-              Prediction Directory
+              {format === "classification_jsonl" ? "Prediction File" : "Prediction Path"}
             </label>
             <div className="flex gap-2">
               <input
diff --git a/frontend/src/components/grid/grid-cell.tsx b/frontend/src/components/grid/grid-cell.tsx
index 7120732..a6860f6 100644
--- a/frontend/src/components/grid/grid-cell.tsx
+++ b/frontend/src/components/grid/grid-cell.tsx
@@ -98,7 +98,22 @@ export function GridCell({ sample, datasetId, annotations, isFocused, datasetTyp
           decoding="async"
         />
         {datasetType === "classification" ? (
-          <ClassBadge label={annotations.find((a) => a.source === "ground_truth")?.category_name} />
+          <>
+            <ClassBadge label={annotations.find((a) => a.source === "ground_truth")?.category_name} />
+            {(() => {
+              const gt = annotations.find((a) => a.source === "ground_truth");
+              const pred = annotations.find((a) => a.source !== "ground_truth");
+              if (!pred) return null;
+              const isCorrect = gt?.category_name === pred.category_name;
+              return (
+                <div className={`absolute bottom-1 right-1 z-10 rounded px-1.5 py-0.5 text-[10px] font-semibold ${
+                  isCorrect ? "bg-green-500/80 text-white" : "bg-red-500/80 text-white"
+                }`}>
+                  {pred.category_name}
+                </div>
+              );
+            })()}
+          </>
         ) : (
           annotations.length > 0 && (
             <AnnotationOverlay
diff --git a/frontend/src/hooks/use-evaluation.ts b/frontend/src/hooks/use-evaluation.ts
index 2b40cb6..425dfa8 100644
--- a/frontend/src/hooks/use-evaluation.ts
+++ b/frontend/src/hooks/use-evaluation.ts
@@ -8,7 +8,7 @@
 import { keepPreviousData, useQuery } from "@tanstack/react-query";
 
 import { apiFetch } from "@/lib/api";
-import type { EvaluationResponse } from "@/types/evaluation";
+import type { AnyEvaluationResponse } from "@/types/evaluation";
 
 export function useEvaluation(
   datasetId: string,
@@ -28,7 +28,7 @@ export function useEvaluation(
       split,
     ],
     queryFn: () =>
-      apiFetch<EvaluationResponse>(
+      apiFetch<AnyEvaluationResponse>(
         `/datasets/${datasetId}/evaluation?source=${encodeURIComponent(source)}&iou_threshold=${iouThreshold}&conf_threshold=${confThreshold}${splitParam}`,
       ),
     staleTime: 10 * 60 * 1000, // 10 min -- each (source, iou, conf, split) combo cached
diff --git a/frontend/src/hooks/use-filtered-evaluation.ts b/frontend/src/hooks/use-filtered-evaluation.ts
index 7496be7..da3edae 100644
--- a/frontend/src/hooks/use-filtered-evaluation.ts
+++ b/frontend/src/hooks/use-filtered-evaluation.ts
@@ -13,6 +13,7 @@ import { useRef, useMemo } from "react";
 
 import type {
   EvaluationResponse,
+  AnyEvaluationResponse,
   APMetrics,
   PRCurve,
 } from "@/types/evaluation";
@@ -144,16 +145,18 @@ function filterEvaluation(
  * lifetime of the raw `data` reference (cache resets when server data changes).
  */
 export function useFilteredEvaluation(
-  data: EvaluationResponse | undefined,
+  data: AnyEvaluationResponse | undefined,
   excludedClasses: Set<string>,
-): EvaluationResponse | undefined {
+): AnyEvaluationResponse | undefined {
   const cacheRef = useRef<{
-    sourceData: EvaluationResponse | undefined;
+    sourceData: AnyEvaluationResponse | undefined;
     results: Map<string, EvaluationResponse>;
   }>({ sourceData: undefined, results: new Map() });
 
   return useMemo(() => {
     if (!data) return undefined;
+    // Classification responses don't need class filtering (no PR curves / AP)
+    if (data.evaluation_type === "classification") return data;
     if (excludedClasses.size === 0) return data;
 
     // Reset cache when upstream data changes
@@ -165,7 +168,7 @@ export function useFilteredEvaluation(
     const cached = cacheRef.current.results.get(key);
     if (cached) return cached;
 
-    const result = filterEvaluation(data, excludedClasses);
+    const result = filterEvaluation(data as EvaluationResponse, excludedClasses);
     cacheRef.current.results.set(key, result);
     return result;
   }, [data, excludedClasses]);
diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts
index b8197ab..52dbf55 100644
--- a/frontend/src/types/evaluation.ts
+++ b/frontend/src/types/evaluation.ts
@@ -32,6 +32,7 @@ export interface PerClassMetrics {
 }
 
 export interface EvaluationResponse {
+  evaluation_type?: "detection";
   pr_curves: PRCurve[];
   ap_metrics: APMetrics;
   per_class_metrics: PerClassMetrics[];
@@ -40,3 +41,24 @@ export interface EvaluationResponse {
   iou_threshold: number;
   conf_threshold: number;
 }
+
+export interface ClassificationPerClassMetrics {
+  class_name: string;
+  precision: number;
+  recall: number;
+  f1: number;
+  support: number;
+}
+
+export interface ClassificationEvaluationResponse {
+  evaluation_type: "classification";
+  accuracy: number;
+  macro_f1: number;
+  weighted_f1: number;
+  per_class_metrics: ClassificationPerClassMetrics[];
+  confusion_matrix: number[][];
+  confusion_matrix_labels: string[];
+  conf_threshold: number;
+}
+
+export type AnyEvaluationResponse = EvaluationResponse | ClassificationEvaluationResponse;
diff --git a/frontend/src/types/prediction.ts b/frontend/src/types/prediction.ts
index 1bd2fd5..a1e65ae 100644
--- a/frontend/src/types/prediction.ts
+++ b/frontend/src/types/prediction.ts
@@ -1,6 +1,6 @@
 export interface PredictionImportRequest {
   prediction_path: string;
-  format: "coco" | "detection_annotation";
+  format: "coco" | "detection_annotation" | "classification_jsonl";
   run_name?: string;
 }
 

From 5d1433e916b3e89b1586ad87598a20167f41fc60 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:33:30 -0500
Subject: [PATCH 24/38] feat(16-02): classification evaluation panel and error
 analysis panel

- Un-hide Evaluation and Error Analysis tabs for classification datasets
- Classification evaluation shows accuracy, macro F1, weighted F1 metric cards
- Classification evaluation renders confusion matrix with click-to-filter
- Classification evaluation renders per-class P/R/F1/Support table
- Classification error analysis shows correct/misclassified/missing categories
- IoU slider hidden for classification in both panels
- Detection evaluation and error analysis completely unchanged

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../components/stats/error-analysis-panel.tsx | 227 +++++++++++++++++-
 .../src/components/stats/evaluation-panel.tsx | 202 +++++++++++++++-
 .../src/components/stats/stats-dashboard.tsx  |  52 ++--
 3 files changed, 432 insertions(+), 49 deletions(-)

diff --git a/frontend/src/components/stats/error-analysis-panel.tsx b/frontend/src/components/stats/error-analysis-panel.tsx
index 9e1ed4f..6c4315e 100644
--- a/frontend/src/components/stats/error-analysis-panel.tsx
+++ b/frontend/src/components/stats/error-analysis-panel.tsx
@@ -4,8 +4,10 @@
  * Error Analysis panel with controls, summary cards, stacked bar chart,
  * and error sample grids.
  *
- * Classifies predictions into True Positive, Hard False Positive,
+ * For detection: classifies predictions into True Positive, Hard False Positive,
  * Label Error, and False Negative categories using IoU matching.
+ *
+ * For classification: shows Correct, Misclassified, and Missing Prediction categories.
  */
 
 import { useState, useEffect, useMemo } from "react";
@@ -27,6 +29,7 @@ import { ErrorSamplesGrid } from "@/components/stats/error-samples-grid";
 interface ErrorAnalysisPanelProps {
   datasetId: string;
   split: string | null;
+  datasetType?: string;
 }
 
 function useDebouncedValue<T>(value: T, delay: number): T {
@@ -55,15 +58,23 @@ function SkeletonChart({ height }: { height: string }) {
   );
 }
 
-/** Color palette for error categories */
-const COLORS = {
+/** Color palette for detection error categories */
+const DETECTION_COLORS = {
   tp: "#22c55e", // green-500
   hard_fp: "#ef4444", // red-500
   label_error: "#f59e0b", // amber-500
   fn: "#f97316", // orange-500
 } as const;
 
-export function ErrorAnalysisPanel({ datasetId, split }: ErrorAnalysisPanelProps) {
+/** Color palette for classification error categories */
+const CLASSIFICATION_COLORS = {
+  correct: "#22c55e", // green-500
+  misclassified: "#ef4444", // red-500
+  missing: "#f97316", // orange-500
+} as const;
+
+export function ErrorAnalysisPanel({ datasetId, split, datasetType }: ErrorAnalysisPanelProps) {
+  const isClassification = datasetType === "classification";
   const { data: facets } = useFilterFacets(datasetId);
 
   // Available prediction sources (exclude ground_truth)
@@ -97,7 +108,199 @@ export function ErrorAnalysisPanel({ datasetId, split }: ErrorAnalysisPanelProps
     split,
   );
 
-  // Compute totals for percentage display
+  // Classification layout
+  if (isClassification) {
+    // Map detection error fields to classification categories:
+    // true_positives = correct, label_errors = misclassified, false_negatives = missing prediction
+    const correctCount = data?.summary.true_positives ?? 0;
+    const misclassifiedCount = data?.summary.label_errors ?? 0;
+    const missingCount = data?.summary.false_negatives ?? 0;
+    const total = correctCount + misclassifiedCount + missingCount;
+    const pct = (count: number) =>
+      total > 0 ? ((count / total) * 100).toFixed(1) : "0.0";
+
+    // Remap per_class data for classification bar chart
+    const classChartData = data?.per_class.map((c) => ({
+      class_name: c.class_name,
+      correct: c.tp,
+      misclassified: c.label_error,
+      missing: c.fn,
+    })) ?? [];
+
+    const chartHeight = data
+      ? Math.max(300, data.per_class.length * 40)
+      : 300;
+
+    return (
+      <div className="space-y-6">
+        {/* Controls Bar -- no IoU slider for classification */}
+        <div className="flex flex-wrap items-center gap-6 rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
+          {/* Source dropdown */}
+          <div className="flex items-center gap-2">
+            <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
+              Source:
+            </label>
+            <select
+              value={source}
+              onChange={(e) => setSource(e.target.value)}
+              className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-sm px-2 py-1 text-zinc-900 dark:text-zinc-100"
+            >
+              {predSources.map((s) => (
+                <option key={s} value={s}>
+                  {s}
+                </option>
+              ))}
+            </select>
+          </div>
+
+          {/* Confidence slider */}
+          <div className="flex items-center gap-2">
+            <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
+              Conf:
+            </label>
+            <input
+              type="range"
+              min={0.0}
+              max={1.0}
+              step={0.05}
+              value={confThreshold}
+              onChange={(e) => setConfThreshold(parseFloat(e.target.value))}
+              className="w-28 accent-blue-500"
+            />
+            <span className="text-sm font-mono text-zinc-600 dark:text-zinc-400 w-10">
+              {confThreshold.toFixed(2)}
+            </span>
+          </div>
+        </div>
+
+        {/* Summary Cards */}
+        {isLoading || !data ? (
+          <div className="grid grid-cols-3 gap-4">
+            <SkeletonCard />
+            <SkeletonCard />
+            <SkeletonCard />
+          </div>
+        ) : (
+          <div className="grid grid-cols-3 gap-4">
+            <div className="rounded-lg border border-green-200 dark:border-green-800 bg-green-50 dark:bg-green-950/30 p-4">
+              <p className="text-2xl font-bold text-green-700 dark:text-green-400">
+                {correctCount.toLocaleString()}
+              </p>
+              <p className="text-sm text-green-600 dark:text-green-500">
+                Correct
+              </p>
+              <p className="text-xs text-green-500 dark:text-green-600 mt-1">
+                {pct(correctCount)}%
+              </p>
+            </div>
+
+            <div className="rounded-lg border border-red-200 dark:border-red-800 bg-red-50 dark:bg-red-950/30 p-4">
+              <p className="text-2xl font-bold text-red-700 dark:text-red-400">
+                {misclassifiedCount.toLocaleString()}
+              </p>
+              <p className="text-sm text-red-600 dark:text-red-500">
+                Misclassified
+              </p>
+              <p className="text-xs text-red-500 dark:text-red-600 mt-1">
+                {pct(misclassifiedCount)}%
+              </p>
+            </div>
+
+            <div className="rounded-lg border border-orange-200 dark:border-orange-800 bg-orange-50 dark:bg-orange-950/30 p-4">
+              <p className="text-2xl font-bold text-orange-700 dark:text-orange-400">
+                {missingCount.toLocaleString()}
+              </p>
+              <p className="text-sm text-orange-600 dark:text-orange-500">
+                Missing Prediction
+              </p>
+              <p className="text-xs text-orange-500 dark:text-orange-600 mt-1">
+                {pct(missingCount)}%
+              </p>
+            </div>
+          </div>
+        )}
+
+        {/* Per-class Error Distribution Stacked Bar Chart */}
+        <section>
+          <h2 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
+            Per-Class Error Distribution
+          </h2>
+          {isLoading || !data ? (
+            <SkeletonChart height="h-[350px]" />
+          ) : classChartData.length === 0 ? (
+            <p className="text-sm text-zinc-500 dark:text-zinc-400 py-8 text-center">
+              No error data available
+            </p>
+          ) : (
+            <ResponsiveContainer width="100%" height={chartHeight}>
+              <BarChart
+                layout="vertical"
+                data={classChartData}
+                margin={{ left: 20, right: 20 }}
+              >
+                <CartesianGrid strokeDasharray="3 3" />
+                <XAxis type="number" />
+                <YAxis
+                  type="category"
+                  dataKey="class_name"
+                  width={140}
+                  tick={{ fontSize: 12 }}
+                />
+                <Tooltip />
+                <Legend />
+                <Bar
+                  dataKey="correct"
+                  name="Correct"
+                  stackId="errors"
+                  fill={CLASSIFICATION_COLORS.correct}
+                  radius={[0, 0, 0, 0]}
+                />
+                <Bar
+                  dataKey="misclassified"
+                  name="Misclassified"
+                  stackId="errors"
+                  fill={CLASSIFICATION_COLORS.misclassified}
+                  radius={[0, 0, 0, 0]}
+                />
+                <Bar
+                  dataKey="missing"
+                  name="Missing Prediction"
+                  stackId="errors"
+                  fill={CLASSIFICATION_COLORS.missing}
+                  radius={[0, 2, 2, 0]}
+                />
+              </BarChart>
+            </ResponsiveContainer>
+          )}
+        </section>
+
+        {/* Error Samples Grids */}
+        {!isLoading && data && (
+          <section className="space-y-6">
+            <h2 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300">
+              Error Samples
+            </h2>
+            <ErrorSamplesGrid
+              title="Misclassified"
+              errorType="label_error"
+              samples={data.samples_by_type.label_error ?? []}
+              datasetId={datasetId}
+              color={CLASSIFICATION_COLORS.misclassified}
+            />
+            <ErrorSamplesGrid
+              title="Missing Prediction"
+              errorType="missing_prediction"
+              samples={data.samples_by_type.missing_prediction ?? []}
+              datasetId={datasetId}
+              color={CLASSIFICATION_COLORS.missing}
+            />
+          </section>
+        )}
+      </div>
+    );
+  }
+
+  // Detection layout (unchanged from original)
   const total = data
     ? data.summary.true_positives +
       data.summary.hard_false_positives +
@@ -266,28 +469,28 @@ export function ErrorAnalysisPanel({ datasetId, split }: ErrorAnalysisPanelProps
                 dataKey="tp"
                 name="True Positive"
                 stackId="errors"
-                fill={COLORS.tp}
+                fill={DETECTION_COLORS.tp}
                 radius={[0, 0, 0, 0]}
               />
               <Bar
                 dataKey="hard_fp"
                 name="Hard False Positive"
                 stackId="errors"
-                fill={COLORS.hard_fp}
+                fill={DETECTION_COLORS.hard_fp}
                 radius={[0, 0, 0, 0]}
               />
               <Bar
                 dataKey="label_error"
                 name="Label Error"
                 stackId="errors"
-                fill={COLORS.label_error}
+                fill={DETECTION_COLORS.label_error}
                 radius={[0, 0, 0, 0]}
               />
               <Bar
                 dataKey="fn"
                 name="False Negative"
                 stackId="errors"
-                fill={COLORS.fn}
+                fill={DETECTION_COLORS.fn}
                 radius={[0, 2, 2, 0]}
               />
             </BarChart>
@@ -306,21 +509,21 @@ export function ErrorAnalysisPanel({ datasetId, split }: ErrorAnalysisPanelProps
             errorType="hard_fp"
             samples={data.samples_by_type.hard_fp ?? []}
             datasetId={datasetId}
-            color={COLORS.hard_fp}
+            color={DETECTION_COLORS.hard_fp}
           />
           <ErrorSamplesGrid
             title="Label Errors"
             errorType="label_error"
             samples={data.samples_by_type.label_error ?? []}
             datasetId={datasetId}
-            color={COLORS.label_error}
+            color={DETECTION_COLORS.label_error}
           />
           <ErrorSamplesGrid
             title="False Negatives"
             errorType="false_negative"
             samples={data.samples_by_type.false_negative ?? []}
             datasetId={datasetId}
-            color={COLORS.fn}
+            color={DETECTION_COLORS.fn}
           />
         </section>
       )}
diff --git a/frontend/src/components/stats/evaluation-panel.tsx b/frontend/src/components/stats/evaluation-panel.tsx
index d38e90e..d8128fb 100644
--- a/frontend/src/components/stats/evaluation-panel.tsx
+++ b/frontend/src/components/stats/evaluation-panel.tsx
@@ -5,6 +5,8 @@
  *
  * Contains source dropdown, IoU/confidence sliders (debounced 300ms),
  * and renders MetricsCards, PRCurveChart, ConfusionMatrix, PerClassTable.
+ *
+ * Branches between detection and classification layouts based on datasetType.
  */
 
 import { useState, useEffect, useMemo, useCallback } from "react";
@@ -19,11 +21,13 @@ import { MetricsCards } from "@/components/stats/metrics-cards";
 import { PRCurveChart } from "@/components/stats/pr-curve-chart";
 import { ConfusionMatrix } from "@/components/stats/confusion-matrix";
 import { PerClassTable } from "@/components/stats/per-class-table";
+import type { EvaluationResponse, ClassificationEvaluationResponse } from "@/types/evaluation";
 
 interface EvaluationPanelProps {
   datasetId: string;
   split: string | null;
   excludedClasses: Set<string>;
+  datasetType?: string;
 }
 
 function useDebouncedValue<T>(value: T, delay: number): T {
@@ -52,7 +56,107 @@ function SkeletonChart({ height }: { height: string }) {
   );
 }
 
-export function EvaluationPanel({ datasetId, split, excludedClasses }: EvaluationPanelProps) {
+/** Classification metric cards: Accuracy, Macro F1, Weighted F1, each rendered as a percentage with one decimal. */
+function ClassificationMetricsCards({ data }: { data: ClassificationEvaluationResponse }) {
+  const cards = [
+    { label: "Accuracy", value: data.accuracy },
+    { label: "Macro F1", value: data.macro_f1 },
+    { label: "Weighted F1", value: data.weighted_f1 },
+  ];
+  return (
+    <div className="grid grid-cols-3 gap-4">
+      {cards.map((card) => (
+        <div
+          key={card.label}
+          className="rounded-lg border border-zinc-200 dark:border-zinc-700 p-4 bg-white dark:bg-zinc-900"
+        >
+          <p className="text-2xl font-bold text-zinc-900 dark:text-zinc-100">
+            {(card.value * 100).toFixed(1)}%
+          </p>
+          <p className="text-sm text-zinc-500 dark:text-zinc-400 mt-1">
+            {card.label}
+          </p>
+        </div>
+      ))}
+    </div>
+  );
+}
+
+/** Per-class classification table (Class, Precision, Recall, F1, Support), sorted by F1 descending; shows a placeholder when empty. */
+function ClassificationPerClassTable({ metrics }: { metrics: ClassificationEvaluationResponse["per_class_metrics"] }) {
+  const sorted = useMemo(
+    () => [...metrics].sort((a, b) => b.f1 - a.f1),
+    [metrics],
+  );
+
+  if (sorted.length === 0) {
+    return (
+      <div className="rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
+        <p className="text-sm text-zinc-500 dark:text-zinc-400 text-center py-4">
+          No per-class metrics available
+        </p>
+      </div>
+    );
+  }
+
+  return (
+    <div className="rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
+      <h3 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
+        Per-Class Metrics
+      </h3>
+      <div className="overflow-x-auto">
+        <table className="w-full text-sm">
+          <thead>
+            <tr className="border-b border-zinc-200 dark:border-zinc-700">
+              <th className="text-left py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                Class
+              </th>
+              <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                Precision
+              </th>
+              <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                Recall
+              </th>
+              <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                F1
+              </th>
+              <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                Support
+              </th>
+            </tr>
+          </thead>
+          <tbody>
+            {sorted.map((m) => (
+              <tr
+                key={m.class_name}
+                className="border-b border-zinc-100 dark:border-zinc-800"
+              >
+                <td className="py-2 px-3 text-zinc-900 dark:text-zinc-100 font-medium">
+                  {m.class_name}
+                </td>
+                <td className="py-2 px-3 text-right font-mono text-zinc-700 dark:text-zinc-300">
+                  {m.precision.toFixed(3)}
+                </td>
+                <td className="py-2 px-3 text-right font-mono text-zinc-700 dark:text-zinc-300">
+                  {m.recall.toFixed(3)}
+                </td>
+                <td className="py-2 px-3 text-right font-mono text-zinc-700 dark:text-zinc-300">
+                  {m.f1.toFixed(3)}
+                </td>
+                <td className="py-2 px-3 text-right font-mono text-zinc-700 dark:text-zinc-300">
+                  {m.support.toLocaleString()}
+                </td>
+              </tr>
+            ))}
+          </tbody>
+        </table>
+      </div>
+    </div>
+  );
+}
+
+export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType }: EvaluationPanelProps) {
+  const isClassification = datasetType === "classification";
   const { data: facets } = useFilterFacets(datasetId);
 
   // Available prediction sources (exclude ground_truth)
@@ -108,6 +212,86 @@ export function EvaluationPanel({ datasetId, split, excludedClasses }: Evaluatio
     [datasetId, source, debouncedIou, debouncedConf, split],
   );
 
+  // Classification evaluation layout
+  if (isClassification) {
+    const classData = data as ClassificationEvaluationResponse | undefined;
+    return (
+      <div className="space-y-6">
+        {/* Controls Bar -- no IoU slider for classification */}
+        <div className="flex flex-wrap items-center gap-6 rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
+          {/* Source dropdown */}
+          <div className="flex items-center gap-2">
+            <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
+              Source:
+            </label>
+            <select
+              value={source}
+              onChange={(e) => setSource(e.target.value)}
+              className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-sm px-2 py-1 text-zinc-900 dark:text-zinc-100"
+            >
+              {predSources.map((s) => (
+                <option key={s} value={s}>
+                  {s}
+                </option>
+              ))}
+            </select>
+          </div>
+
+          {/* Confidence slider */}
+          <div className="flex items-center gap-2">
+            <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
+              Conf:
+            </label>
+            <input
+              type="range"
+              min={0.0}
+              max={1.0}
+              step={0.05}
+              value={confThreshold}
+              onChange={(e) => setConfThreshold(parseFloat(e.target.value))}
+              className="w-28 accent-blue-500"
+            />
+            <span className="text-sm font-mono text-zinc-600 dark:text-zinc-400 w-10">
+              {confThreshold.toFixed(2)}
+            </span>
+          </div>
+        </div>
+
+        {/* Classification Metric Cards */}
+        {isLoading || !classData ? (
+          <div className="grid grid-cols-3 gap-4">
+            <SkeletonCard />
+            <SkeletonCard />
+            <SkeletonCard />
+          </div>
+        ) : (
+          <ClassificationMetricsCards data={classData} />
+        )}
+
+        {/* Confusion Matrix */}
+        {isLoading || !classData ? (
+          <SkeletonChart height="h-[350px]" />
+        ) : (
+          <ConfusionMatrix
+            matrix={classData.confusion_matrix}
+            labels={classData.confusion_matrix_labels}
+            onCellClick={handleCellClick}
+          />
+        )}
+
+        {/* Per-Class Table */}
+        {isLoading || !classData ? (
+          <SkeletonChart height="h-[200px]" />
+        ) : (
+          <ClassificationPerClassTable metrics={classData.per_class_metrics} />
+        )}
+      </div>
+    );
+  }
+
+  // Detection evaluation layout (unchanged)
+  const detData = data as EvaluationResponse | undefined;
+
   return (
     <div className="space-y-6">
       {/* Controls Bar */}
@@ -170,19 +354,19 @@ export function EvaluationPanel({ datasetId, split, excludedClasses }: Evaluatio
       </div>
 
       {/* Metric Cards */}
-      {isLoading || !data ? (
+      {isLoading || !detData ? (
         <div className="grid grid-cols-3 gap-4">
           <SkeletonCard />
           <SkeletonCard />
           <SkeletonCard />
         </div>
       ) : (
-        <MetricsCards metrics={data.ap_metrics} />
+        <MetricsCards metrics={detData.ap_metrics} />
       )}
 
       {/* Charts: PR Curve + Confusion Matrix */}
       <div className="grid grid-cols-1 lg:grid-cols-2 gap-6">
-        {isLoading || !data ? (
+        {isLoading || !detData ? (
           <>
             <SkeletonChart height="h-[350px]" />
             <SkeletonChart height="h-[350px]" />
@@ -190,12 +374,12 @@ export function EvaluationPanel({ datasetId, split, excludedClasses }: Evaluatio
         ) : (
           <>
             <PRCurveChart
-              curves={data.pr_curves}
+              curves={detData.pr_curves}
               confThreshold={confThreshold}
             />
             <ConfusionMatrix
-              matrix={data.confusion_matrix}
-              labels={data.confusion_matrix_labels}
+              matrix={detData.confusion_matrix}
+              labels={detData.confusion_matrix_labels}
               onCellClick={handleCellClick}
             />
           </>
@@ -203,10 +387,10 @@ export function EvaluationPanel({ datasetId, split, excludedClasses }: Evaluatio
       </div>
 
       {/* Per-Class Table */}
-      {isLoading || !data ? (
+      {isLoading || !detData ? (
         <SkeletonChart height="h-[200px]" />
       ) : (
-        <PerClassTable metrics={data.per_class_metrics} />
+        <PerClassTable metrics={detData.per_class_metrics} />
       )}
     </div>
   );
diff --git a/frontend/src/components/stats/stats-dashboard.tsx b/frontend/src/components/stats/stats-dashboard.tsx
index b2fbf0c..39e2deb 100644
--- a/frontend/src/components/stats/stats-dashboard.tsx
+++ b/frontend/src/components/stats/stats-dashboard.tsx
@@ -176,32 +176,28 @@ export function StatsDashboard({ datasetId, datasetType }: StatsDashboardProps)
         >
           Overview
         </button>
-        {!isClassification && (
-          <button
-            onClick={() => setActiveTab("evaluation")}
-            disabled={!hasPredictions}
-            className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
-              activeTab === "evaluation"
-                ? "border-blue-500 text-blue-600 dark:text-blue-400"
-                : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
-            } disabled:opacity-40 disabled:cursor-not-allowed`}
-          >
-            Evaluation
-          </button>
-        )}
-        {!isClassification && (
-          <button
-            onClick={() => setActiveTab("error_analysis")}
-            disabled={!hasPredictions}
-            className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
-              activeTab === "error_analysis"
-                ? "border-blue-500 text-blue-600 dark:text-blue-400"
-                : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
-            } disabled:opacity-40 disabled:cursor-not-allowed`}
-          >
-            Error Analysis
-          </button>
-        )}
+        <button
+          onClick={() => setActiveTab("evaluation")}
+          disabled={!hasPredictions}
+          className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
+            activeTab === "evaluation"
+              ? "border-blue-500 text-blue-600 dark:text-blue-400"
+              : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
+          } disabled:opacity-40 disabled:cursor-not-allowed`}
+        >
+          Evaluation
+        </button>
+        <button
+          onClick={() => setActiveTab("error_analysis")}
+          disabled={!hasPredictions}
+          className={`px-4 py-2 text-sm font-medium border-b-2 transition-colors ${
+            activeTab === "error_analysis"
+              ? "border-blue-500 text-blue-600 dark:text-blue-400"
+              : "border-transparent text-zinc-500 dark:text-zinc-400 hover:text-zinc-700 dark:hover:text-zinc-300"
+          } disabled:opacity-40 disabled:cursor-not-allowed`}
+        >
+          Error Analysis
+        </button>
         {!isClassification && (
           <button
             onClick={() => setActiveTab("worst_images")}
@@ -291,11 +287,11 @@ export function StatsDashboard({ datasetId, datasetType }: StatsDashboardProps)
       )}
 
       {activeTab === "evaluation" && hasPredictions && (
-        <EvaluationPanel datasetId={datasetId} split={split} excludedClasses={excludedClasses} />
+        <EvaluationPanel datasetId={datasetId} split={split} excludedClasses={excludedClasses} datasetType={datasetType} />
       )}
 
       {activeTab === "error_analysis" && hasPredictions && (
-        <ErrorAnalysisPanel datasetId={datasetId} split={split} />
+        <ErrorAnalysisPanel datasetId={datasetId} split={split} datasetType={datasetType} />
       )}
 
       {activeTab === "worst_images" && hasPredictions && (

From 1bce1724ead3eaa43cdd99161d400c878a2087d7 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:34:40 -0500
Subject: [PATCH 25/38] docs(16-02): complete classification evaluation
 frontend plan

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/STATE.md                            |  12 +-
 .../16-02-SUMMARY.md                          | 120 ++++++++++++++++++
 2 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 .planning/phases/16-classification-evaluation/16-02-SUMMARY.md

diff --git a/.planning/STATE.md b/.planning/STATE.md
index 03332cf..e6b9dc8 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -10,11 +10,11 @@ See: .planning/PROJECT.md (updated 2026-02-18)
 ## Current Position
 
 Phase: 16 of 17 (Classification Evaluation)
-Plan: 1 of 2 in current phase
-Status: In Progress
-Last activity: 2026-02-18 -- Completed 16-01 (Classification Evaluation Backend)
+Plan: 2 of 2 in current phase (COMPLETE)
+Status: Phase 16 Complete
+Last activity: 2026-02-18 -- Completed 16-02 (Classification Evaluation Frontend)
 
-Progress: [#############################] 95% (v1.0 + v1.1 complete, v1.2 phase 16 plan 1 done)
+Progress: [##############################] 97% (v1.0 + v1.1 complete, v1.2 phase 16 complete)
 
 ## Performance Metrics
 
@@ -46,6 +46,8 @@ Recent decisions affecting current work:
 - [Phase 16]: Reuse ErrorAnalysisResponse model from detection for classification error analysis
 - [Phase 16]: Route by dataset_type at endpoint level, keeping classification/detection services separate
 - [Phase 16]: Remove response_model on evaluation endpoint for union return type support
+- [Phase 16]: Classification metric cards inline rather than reusing MetricsCards (different data shape)
+- [Phase 16]: Map backend error fields to classification labels: true_positives=correct, label_errors=misclassified
 
 ### Pending Todos
 
@@ -65,5 +67,5 @@ None.
 ## Session Continuity
 
 Last session: 2026-02-18
-Stopped at: Completed 16-01-PLAN.md (Classification Evaluation Backend)
+Stopped at: Completed 16-02-PLAN.md (Classification Evaluation Frontend)
 Resume file: None
diff --git a/.planning/phases/16-classification-evaluation/16-02-SUMMARY.md b/.planning/phases/16-classification-evaluation/16-02-SUMMARY.md
new file mode 100644
index 0000000..2a8a527
--- /dev/null
+++ b/.planning/phases/16-classification-evaluation/16-02-SUMMARY.md
@@ -0,0 +1,120 @@
+---
+phase: 16-classification-evaluation
+plan: 02
+subsystem: frontend
+tags: [classification, evaluation, confusion-matrix, f1, error-analysis, prediction-import, grid-badge]
+
+requires:
+  - phase: 16-classification-evaluation
+    plan: 01
+    provides: Classification evaluation backend endpoints (evaluation, confusion-cell, error-analysis)
+provides:
+  - Classification evaluation UI with accuracy/F1 metric cards, confusion matrix, per-class table
+  - Classification error analysis UI with correct/misclassified/missing categories
+  - Classification JSONL prediction import format option
+  - Predicted class badge on grid thumbnails (green=correct, red=mismatch)
+affects: [frontend-evaluation-tabs, grid-cell-badges]
+
+tech-stack:
+  added: []
+  patterns: [dataset-type branching at component level, classification vs detection layout switching]
+
+key-files:
+  created: []
+  modified:
+    - frontend/src/types/evaluation.ts
+    - frontend/src/types/prediction.ts
+    - frontend/src/hooks/use-evaluation.ts
+    - frontend/src/hooks/use-filtered-evaluation.ts
+    - frontend/src/components/detail/prediction-import-dialog.tsx
+    - frontend/src/components/grid/grid-cell.tsx
+    - frontend/src/components/stats/stats-dashboard.tsx
+    - frontend/src/components/stats/evaluation-panel.tsx
+    - frontend/src/components/stats/error-analysis-panel.tsx
+
+key-decisions:
+  - "Classification metric cards inline rather than reusing MetricsCards component (different data shape)"
+  - "Classification per-class table inline rather than extending PerClassTable (no AP columns)"
+  - "Map backend error fields to classification labels: true_positives=correct, label_errors=misclassified, false_negatives=missing"
+
+patterns-established:
+  - "Early return pattern: isClassification branch returns full JSX before detection code runs"
+
+duration: 3min
+completed: 2026-02-18
+---
+
+# Phase 16 Plan 02: Classification Evaluation Frontend Summary
+
+**Classification evaluation UI with accuracy/F1/confusion matrix, error analysis with correct/misclassified/missing categories, prediction import format, and grid predicted class badges**
+
+## Performance
+
+- **Duration:** 3 min
+- **Started:** 2026-02-19T03:30:08Z
+- **Completed:** 2026-02-19T03:33:34Z
+- **Tasks:** 2
+- **Files modified:** 9
+
+## Accomplishments
+- ClassificationEvaluationResponse type with discriminant field and AnyEvaluationResponse union
+- Classification JSONL format option in prediction import dialog with dynamic label ("Prediction File")
+- Evaluation tab un-hidden for classification datasets with accuracy, macro F1, weighted F1 metric cards
+- Confusion matrix with click-to-filter works for classification (reuses existing ConfusionMatrix component)
+- Per-class table shows Precision, Recall, F1, Support for classification (sorted by F1)
+- Error analysis shows Correct/Misclassified/Missing Prediction summary cards and stacked bar chart
+- Grid thumbnails show predicted class badge (green=correct, red=mismatch) alongside GT badge
+- IoU slider hidden for classification in both evaluation and error analysis panels
+- Detection evaluation and error analysis completely unchanged
+- useFilteredEvaluation passes through classification responses without filtering
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Types, hooks, prediction import dialog, and grid predicted label badge** - `7dca67e` (feat)
+2. **Task 2: Classification evaluation panel and error analysis panel** - `5d1433e` (feat)
+
+## Files Modified
+- `frontend/src/types/evaluation.ts` - ClassificationEvaluationResponse, ClassificationPerClassMetrics, AnyEvaluationResponse union
+- `frontend/src/types/prediction.ts` - Added classification_jsonl to format union
+- `frontend/src/hooks/use-evaluation.ts` - Returns AnyEvaluationResponse instead of EvaluationResponse
+- `frontend/src/hooks/use-filtered-evaluation.ts` - Passes through classification responses, casts detection for filtering
+- `frontend/src/components/detail/prediction-import-dialog.tsx` - Classification JSONL format option, dynamic path label
+- `frontend/src/components/grid/grid-cell.tsx` - Predicted class badge (green/red) at bottom-right of classification thumbnails
+- `frontend/src/components/stats/stats-dashboard.tsx` - Un-hidden Evaluation and Error Analysis tabs, pass datasetType prop
+- `frontend/src/components/stats/evaluation-panel.tsx` - Classification branch with metric cards, confusion matrix, per-class table
+- `frontend/src/components/stats/error-analysis-panel.tsx` - Classification branch with correct/misclassified/missing categories
+
+## Decisions Made
+- Rendered classification metric cards inline (ClassificationMetricsCards) rather than reusing the detection MetricsCards component -- different data shape (accuracy/F1 vs mAP)
+- Rendered classification per-class table inline (ClassificationPerClassTable) rather than extending PerClassTable -- no AP columns, sorted by F1
+- Mapped backend ErrorAnalysisResponse fields to classification-friendly labels: true_positives -> Correct, label_errors -> Misclassified, false_negatives -> Missing Prediction
+
+## Deviations from Plan
+
+### Auto-fixed Issues
+
+**1. [Rule 3 - Blocking] Updated useFilteredEvaluation for union type compatibility**
+- **Found during:** Task 1
+- **Issue:** useFilteredEvaluation typed as EvaluationResponse-only, would cause type errors with AnyEvaluationResponse from useEvaluation
+- **Fix:** Updated to accept AnyEvaluationResponse, pass through classification responses unfiltered, cast detection for existing filter logic
+- **Files modified:** frontend/src/hooks/use-filtered-evaluation.ts
+- **Commit:** 7dca67e
+
+## Issues Encountered
+
+None.
+
+## User Setup Required
+
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Complete classification evaluation experience end-to-end (backend + frontend)
+- All 7 verification items from plan are addressed
+- Ready for Phase 17 (Dataset Intelligence)
+
+---
+*Phase: 16-classification-evaluation*
+*Completed: 2026-02-18*

From 3bbeed28809103f5144bc39389b2b62858e9370f Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:38:50 -0500
Subject: [PATCH 26/38] fix(16-02): use misclassified key for classification
 error samples grid

The classification error analysis backend populates samples_by_type with
"misclassified" key, but the frontend was reading "label_error" (detection key).
Fall back to label_error for detection compatibility.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 frontend/src/components/stats/error-analysis-panel.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/src/components/stats/error-analysis-panel.tsx b/frontend/src/components/stats/error-analysis-panel.tsx
index 6c4315e..f852715 100644
--- a/frontend/src/components/stats/error-analysis-panel.tsx
+++ b/frontend/src/components/stats/error-analysis-panel.tsx
@@ -283,7 +283,7 @@ export function ErrorAnalysisPanel({ datasetId, split, datasetType }: ErrorAnaly
             <ErrorSamplesGrid
               title="Misclassified"
               errorType="label_error"
-              samples={data.samples_by_type.label_error ?? []}
+              samples={data.samples_by_type.misclassified ?? data.samples_by_type.label_error ?? []}
               datasetId={datasetId}
               color={CLASSIFICATION_COLORS.misclassified}
             />

From 79ff323a8c06eafd1d6cedce218c6eb7b9954ec3 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:39:38 -0500
Subject: [PATCH 27/38] docs(phase-16): complete phase execution

---
 .planning/REQUIREMENTS.md                     |  20 +--
 .planning/ROADMAP.md                          |   8 +-
 .../16-VERIFICATION.md                        | 123 ++++++++++++++++++
 3 files changed, 137 insertions(+), 14 deletions(-)
 create mode 100644 .planning/phases/16-classification-evaluation/16-VERIFICATION.md

diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md
index 524809f..b8b6a3d 100644
--- a/.planning/REQUIREMENTS.md
+++ b/.planning/REQUIREMENTS.md
@@ -23,11 +23,11 @@ Requirements for classification dataset support. Each maps to roadmap phases.
 
 ### Evaluation
 
-- [ ] **EVAL-01**: User can import classification predictions in JSONL format with confidence scores
-- [ ] **EVAL-02**: User sees accuracy, macro F1, weighted F1, and per-class precision/recall/F1 metrics
-- [ ] **EVAL-03**: User sees a confusion matrix for classification with click-to-filter support
-- [ ] **EVAL-04**: User sees error analysis categorizing each image as correct, misclassified, or missing prediction
-- [ ] **EVAL-05**: User sees GT vs predicted label comparison on grid thumbnails and in the modal
+- [x] **EVAL-01**: User can import classification predictions in JSONL format with confidence scores
+- [x] **EVAL-02**: User sees accuracy, macro F1, weighted F1, and per-class precision/recall/F1 metrics
+- [x] **EVAL-03**: User sees a confusion matrix for classification with click-to-filter support
+- [x] **EVAL-04**: User sees error analysis categorizing each image as correct, misclassified, or missing prediction
+- [x] **EVAL-05**: User sees GT vs predicted label comparison on grid thumbnails and in the modal
 
 ### Polish
 
@@ -74,11 +74,11 @@ Which phases cover which requirements. Updated during roadmap creation.
 | DISP-02 | Phase 15 | Done |
 | DISP-03 | Phase 15 | Done |
 | DISP-04 | Phase 15 | Done |
-| EVAL-01 | Phase 16 | Pending |
-| EVAL-02 | Phase 16 | Pending |
-| EVAL-03 | Phase 16 | Pending |
-| EVAL-04 | Phase 16 | Pending |
-| EVAL-05 | Phase 16 | Pending |
+| EVAL-01 | Phase 16 | Done |
+| EVAL-02 | Phase 16 | Done |
+| EVAL-03 | Phase 16 | Done |
+| EVAL-04 | Phase 16 | Done |
+| EVAL-05 | Phase 16 | Done |
 | POLISH-01 | Phase 17 | Pending |
 | POLISH-02 | Phase 17 | Pending |
 | POLISH-03 | Phase 17 | Pending |
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 5a2643f..3ffef4c 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -103,10 +103,10 @@ Plans:
   3. User sees a confusion matrix and can click any cell to filter the grid to images with that GT/predicted class pair
   4. User sees each image categorized as correct, misclassified, or missing prediction in the error analysis view
   5. User sees GT vs predicted label comparison on grid thumbnails and in the detail modal
-**Plans**: 2 plans
+**Plans**: 2 plans (complete)
 Plans:
-- [ ] 16-01-PLAN.md -- Backend: classification prediction parser, evaluation service, error analysis service, endpoint routing
-- [ ] 16-02-PLAN.md -- Frontend: types, hooks, prediction import dialog, evaluation panel, error analysis panel, grid badges
+- [x] 16-01-PLAN.md -- Backend: classification prediction parser, evaluation service, error analysis service, endpoint routing
+- [x] 16-02-PLAN.md -- Frontend: types, hooks, prediction import dialog, evaluation panel, error analysis panel, grid badges
 
 #### Phase 17: Classification Polish
 **Goal**: Classification workflows are production-ready for high-cardinality datasets (43+ classes) with visual aids that surface actionable insights
@@ -138,5 +138,5 @@ Plans:
 | 13. Keyboard Shortcuts | v1.1 | 2/2 | Complete | 2026-02-13 |
 | 14. Per-Annotation Triage | v1.1 | 3/3 | Complete | 2026-02-13 |
 | 15. Classification Ingestion & Display | v1.2 | 2/2 | Complete | 2026-02-18 |
-| 16. Classification Evaluation | v1.2 | 0/TBD | Not started | - |
+| 16. Classification Evaluation | v1.2 | 2/2 | Complete | 2026-02-18 |
 | 17. Classification Polish | v1.2 | 0/TBD | Not started | - |
diff --git a/.planning/phases/16-classification-evaluation/16-VERIFICATION.md b/.planning/phases/16-classification-evaluation/16-VERIFICATION.md
new file mode 100644
index 0000000..f514ba7
--- /dev/null
+++ b/.planning/phases/16-classification-evaluation/16-VERIFICATION.md
@@ -0,0 +1,123 @@
+---
+phase: 16-classification-evaluation
+verified: 2026-02-19T03:37:08Z
+status: gaps_found
+score: 4/5 must-haves verified
+re_verification: false
+gaps:
+  - truth: "User sees each image categorized as correct, misclassified, or missing prediction in the error analysis view"
+    status: partial
+    reason: "The classification error analysis panel references data.samples_by_type.label_error to populate the Misclassified sample grid, but the backend returns samples_by_type with key 'misclassified' (not 'label_error'). The detection model uses 'label_error'; classification uses 'misclassified'. Misclassified sample thumbnails will always be empty due to key mismatch."
+    artifacts:
+      - path: "frontend/src/components/stats/error-analysis-panel.tsx"
+        issue: "Line 286: data.samples_by_type.label_error should be data.samples_by_type.misclassified for the classification branch"
+    missing:
+      - "Change line 286 in error-analysis-panel.tsx from data.samples_by_type.label_error to data.samples_by_type.misclassified in the isClassification branch"
+human_verification:
+  - test: "Import a classification JSONL prediction file and verify accuracy/F1 metrics are computed correctly"
+    expected: "Evaluation panel shows non-zero accuracy, macro F1, and weighted F1 values that match the actual prediction file's correctness"
+    why_human: "Cannot verify metric computation correctness without actual data; requires end-to-end execution"
+  - test: "Click a confusion matrix cell for a classification dataset and verify grid filters to matching images"
+    expected: "Grid updates to show only images with the selected GT/predicted class pair"
+    why_human: "State flow through filter store requires running the UI"
+  - test: "Open detail modal for a misclassified image in a classification dataset"
+    expected: "Modal shows GT class label and a 'Predicted:' label with the wrong prediction alongside it"
+    why_human: "Requires UI interaction to verify layout of the classification section in sample-modal.tsx"
+---
+
+# Phase 16: Classification Evaluation Verification Report
+
+**Phase Goal:** Users can import predictions and analyze classification model performance with accuracy, F1, confusion matrix, and error categorization
+**Verified:** 2026-02-19T03:37:08Z
+**Status:** gaps_found
+**Re-verification:** No -- initial verification
+
+## Goal Achievement
+
+### Observable Truths
+
+| # | Truth | Status | Evidence |
+|---|-------|--------|---------|
+| 1 | User can import classification predictions in JSONL format with confidence scores and see them alongside ground truth | VERIFIED | `ClassificationPredictionParser` in `app/ingestion/classification_prediction_parser.py` is substantive with sentinel bbox values, flexible key lookup, and batch streaming. `app/routers/datasets.py` line 191 routes `format == "classification_jsonl"` to it. `app/models/prediction.py` includes `classification_jsonl` in the Literal type. |
+| 2 | User sees accuracy, macro F1, weighted F1, and per-class precision/recall/F1 metrics in the evaluation panel | VERIFIED | `app/services/classification_evaluation.py` computes all metrics from confusion matrix with div-by-zero guards. `app/routers/statistics.py` lines 179-182 route classification datasets to this service. Frontend `evaluation-panel.tsx` has a complete `isClassification` early-return branch with `ClassificationMetricsCards` and `ClassificationPerClassTable` components that render all required fields. |
+| 3 | User sees a confusion matrix and can click any cell to filter the grid to images with that GT/predicted class pair | VERIFIED | `get_classification_confusion_cell_samples` in `app/services/classification_evaluation.py` performs direct label JOIN without IoU. Router routes classification datasets to it. Frontend `evaluation-panel.tsx` calls `fetchConfusionCellSamples` in `handleCellClick`, sets `setSampleIdFilter`, and switches to grid tab. |
+| 4 | User sees each image categorized as correct, misclassified, or missing prediction in the error analysis view | PARTIAL | Backend `classify_errors` in `app/services/classification_error_analysis.py` correctly returns `samples_by_type` with keys `"correct"`, `"misclassified"`, `"missing_prediction"`. Summary cards (lines 115-118) correctly map from `true_positives`/`label_errors`/`false_negatives`. However, the `ErrorSamplesGrid` for "Misclassified" samples (line 286) reads `data.samples_by_type.label_error` instead of `data.samples_by_type.misclassified` -- this key mismatch means the misclassified sample grid always renders empty. |
+| 5 | User sees GT vs predicted label comparison on grid thumbnails and in the detail modal | VERIFIED | `grid-cell.tsx` lines 100-115 render a GT badge (bottom-left via `ClassBadge`) and a prediction badge (bottom-right, green/red) in the classification branch. `sample-modal.tsx` lines 424-463 show a "Class:" dropdown (GT) and a "Predicted:" label with confidence for classification datasets. |
+
+**Score:** 4/5 truths verified
+
+### Required Artifacts (Plan 16-01)
+
+| Artifact | Expected | Status | Details |
+|----------|----------|--------|---------|
+| `app/ingestion/classification_prediction_parser.py` | JSONL prediction parser with sentinel bbox, filename-to-sample_id lookup | VERIFIED | 153 lines, full implementation with `_get_field`, flexible key lookup, `parse_streaming` with batch yielding |
+| `app/services/classification_evaluation.py` | compute_classification_evaluation returning accuracy, F1, confusion matrix, per-class metrics | VERIFIED | 205 lines, full implementation including confusion cell lookup function |
+| `app/services/classification_error_analysis.py` | classify_errors returning correct/misclassified/missing per sample | VERIFIED | 162 lines, full implementation using `ErrorAnalysisResponse` model |
+| `app/models/classification_evaluation.py` | ClassificationEvaluationResponse, ClassificationPerClassMetrics Pydantic models | VERIFIED | Both models present with all required fields |
+
+### Required Artifacts (Plan 16-02)
+
+| Artifact | Expected | Status | Details |
+|----------|----------|--------|---------|
+| `frontend/src/types/evaluation.ts` | ClassificationEvaluationResponse type with discriminant field | VERIFIED | Has `ClassificationEvaluationResponse` with `evaluation_type: "classification"`, `ClassificationPerClassMetrics`, and `AnyEvaluationResponse` union |
+| `frontend/src/components/stats/evaluation-panel.tsx` | Classification evaluation rendering with metric cards, confusion matrix, per-class table | VERIFIED | Full `isClassification` early-return with `ClassificationMetricsCards`, `ConfusionMatrix`, `ClassificationPerClassTable` |
+| `frontend/src/components/stats/error-analysis-panel.tsx` | Classification error analysis with correct/misclassified/missing categories | PARTIAL | Summary cards correct. Bar chart correct. Misclassified sample grid uses wrong key `label_error` instead of `misclassified` |
+| `frontend/src/components/grid/grid-cell.tsx` | Predicted class badge alongside GT badge for classification | VERIFIED | Lines 100-115 render green/red predicted badge at bottom-right with GT badge at bottom-left |
+
+### Key Link Verification (Plan 16-01)
+
+| From | To | Via | Status | Details |
+|------|----|-----|--------|---------|
+| `app/routers/datasets.py` | `app/ingestion/classification_prediction_parser.py` | `format == 'classification_jsonl'` branch | WIRED | Line 191 branches, imports `ClassificationPredictionParser`, builds `sample_lookup`, streams and inserts |
+| `app/routers/statistics.py` | `app/services/classification_evaluation.py` | `dataset_type == 'classification'` check | WIRED | Lines 179-182 call `compute_classification_evaluation`; confusion cell endpoint lines 219-228 call `get_classification_confusion_cell_samples` |
+| `app/routers/statistics.py` | `app/services/classification_error_analysis.py` | `dataset_type == 'classification'` check | WIRED | Lines 285-288 call `classify_classification_errors` (aliased import) |
+
+### Key Link Verification (Plan 16-02)
+
+| From | To | Via | Status | Details |
+|------|----|-----|--------|---------|
+| `frontend/src/components/stats/evaluation-panel.tsx` | `/datasets/{id}/evaluation` | `useEvaluation` hook | WIRED | Hook returns `AnyEvaluationResponse`; panel derives a local `isClassification` flag from the `datasetType` prop (`datasetType === "classification"`) and casts the response to `ClassificationEvaluationResponse` in that branch |
+| `frontend/src/components/stats/stats-dashboard.tsx` | `evaluation-panel.tsx` | Evaluation tab visible for classification datasets | WIRED | Tab rendered unconditionally (lines 179-189), `isClassification` not used to gate it; `datasetType` prop passed to `EvaluationPanel` |
+| `frontend/src/components/grid/grid-cell.tsx` | annotations | Finding prediction annotation to display predicted label | WIRED | Line 105 `annotations.find(a => a.source !== "ground_truth")` retrieves prediction |
+
+### Requirements Coverage
+
+Not applicable -- requirements are tracked at milestone level, not phase level.
+
+### Anti-Patterns Found
+
+| File | Line | Pattern | Severity | Impact |
+|------|------|---------|----------|--------|
+| `frontend/src/components/stats/error-analysis-panel.tsx` | 286 | `data.samples_by_type.label_error` in classification branch -- wrong key | BLOCKER | Misclassified sample thumbnails always empty; backend sends key `"misclassified"`, not `"label_error"` |
+
+### Human Verification Required
+
+#### 1. End-to-end prediction import and metric verification
+
+**Test:** Import a classification JSONL file into a classification dataset; navigate to the Evaluation tab, check the accuracy, macro F1, and weighted F1 values
+**Expected:** Non-zero metric values matching the actual correctness rate of the predictions
+**Why human:** Cannot verify metric computation accuracy without real data and a running backend
+
+#### 2. Confusion matrix click-to-filter
+
+**Test:** In a classification dataset with predictions, click a cell in the confusion matrix
+**Expected:** Grid panel activates and shows only images with the clicked GT/predicted class pair
+**Why human:** State flow through filter store and tab navigation requires running the UI
+
+#### 3. Detail modal GT vs predicted label
+
+**Test:** Open a misclassified image's detail modal
+**Expected:** Modal shows the GT class label (editable dropdown) and "Predicted: X" label below it with confidence percentage
+**Why human:** Visual layout requires running UI; testing wiring of `gtAnnotations`/`predAnnotations` split
+
+## Gaps Summary
+
+One gap found. The classification error analysis panel has a key mismatch for the "Misclassified" sample grid. The backend `classify_errors` function in `app/services/classification_error_analysis.py` stores misclassified samples under `samples_by_type["misclassified"]`, but the frontend classification branch reads `data.samples_by_type.label_error` (line 286 of `error-analysis-panel.tsx`). The key `"label_error"` only exists in detection responses; for classification it is `"misclassified"`.
+
+This means: summary cards (counts) display correctly, the bar chart displays correctly, but the misclassified thumbnail grid always shows zero samples even when there are misclassified images. The fix is a one-line change: `data.samples_by_type.label_error` → `data.samples_by_type.misclassified` in the `isClassification` branch.
+
+All other success criteria are fully met with substantive, wired implementations.
+
+---
+_Verified: 2026-02-19T03:37:08Z_
+_Verifier: Claude (gsd-verifier)_

From a5f9460b3629a9656a5df657d7ca5b44ffab1e9e Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:45:33 -0500
Subject: [PATCH 28/38] docs(classification-polish): research phase domain

---
 .../17-classification-polish/17-RESEARCH.md   | 441 ++++++++++++++++++
 1 file changed, 441 insertions(+)
 create mode 100644 .planning/phases/17-classification-polish/17-RESEARCH.md

diff --git a/.planning/phases/17-classification-polish/17-RESEARCH.md b/.planning/phases/17-classification-polish/17-RESEARCH.md
new file mode 100644
index 0000000..3ad2e73
--- /dev/null
+++ b/.planning/phases/17-classification-polish/17-RESEARCH.md
@@ -0,0 +1,441 @@
+# Phase 17: Classification Polish - Research
+
+**Researched:** 2026-02-18
+**Domain:** High-cardinality confusion matrix rendering, embedding scatter coloring, most-confused pairs, per-class sparklines
+**Confidence:** HIGH (all four requirements are frontend-focused UI enhancements on existing infrastructure, no new backend services or libraries needed)
+
+## Summary
+
+Phase 17 polishes the classification evaluation experience for production use with high-cardinality datasets (43+ classes). It addresses four distinct UI gaps: (1) the current confusion matrix renders as an HTML table which becomes unreadable at 43+ classes -- it needs threshold filtering to hide low-value cells and overflow handling; (2) the embedding scatter plot currently colors all points uniformly blue but should support coloring by GT class, predicted class, or correct/incorrect status; (3) the confusion matrix data already contains all information needed to derive a ranked list of most-confused class pairs, but no summary is surfaced; (4) the per-class metrics table shows raw numbers but lacks visual sparklines with color-coded thresholds for quick scanning.
+
+All four requirements are frontend-focused with minimal backend changes. The existing `ClassificationEvaluationResponse` already returns `confusion_matrix`, `confusion_matrix_labels`, and `per_class_metrics` -- enough data for requirements POLISH-01, POLISH-03, and POLISH-04 without backend changes. POLISH-02 requires enriching the embedding coordinates endpoint to include GT and predicted labels per sample, or fetching annotation data separately to join client-side.
+
+**Primary recommendation:** Implement all four requirements as frontend enhancements, with one small backend change for POLISH-02. For the confusion matrix (POLISH-01), use the existing HTML table approach with a threshold filter slider (hide cells below N%) and `overflow-auto` with `max-h`/`max-w` constraints rather than migrating to canvas -- the HTML table already uses cell-level color intensity and is easier to maintain. For embedding coloring (POLISH-02), extend the backend `/coordinates` endpoint to include `gtLabel` and `predLabel` per point so the `getFillColor` accessor can use a categorical color palette. For most-confused pairs (POLISH-03), derive from the existing confusion matrix client-side. For sparklines (POLISH-04), render a color-coded CSS F1 bar per class (see Pattern 4); fall back to a fixed-size Recharts chart with hidden axes only if a multi-metric sparkline is wanted.
+
+## Standard Stack
+
+### Core (already in use -- no new dependencies)
+
+| Library | Version | Purpose | Status |
+|---------|---------|---------|--------|
+| Recharts | ^3.7.0 | Sparkline mini charts in per-class table | In use |
+| deck.gl | ^9.2.6 | ScatterplotLayer `getFillColor` accessor for categorical coloring | In use |
+| React/Next.js | - | Component rendering, memoization | In use |
+| DuckDB | - | JOIN annotations to embedding coordinates (backend) | In use |
+| Tailwind CSS | - | Styling, responsive overflow containers | In use |
+
+### Supporting (no new libraries needed)
+
+The sparkline requirement can be met with Recharts `LineChart` + `Line` with hidden axes in a small container (~60x20px). No dedicated sparkline library is needed. The Recharts `LineChart` component supports `width`/`height` props directly (no `ResponsiveContainer` needed for fixed-size inline use).
+
+For categorical color palettes in the embedding scatter, a static array of 20-50 distinct colors is sufficient. D3's categorical color scales (`d3-scale-chromatic`) are NOT in the dependency tree and would be overkill -- a hardcoded palette of ~20 colors that cycles via modulo when there are more classes than colors is simpler and has zero bundle impact.
+
+### Alternatives Considered
+
+| Instead of | Could Use | Tradeoff |
+|------------|-----------|----------|
+| HTML table with threshold filter (confusion matrix) | Canvas-based heatmap (e.g., custom Canvas2D) | Canvas handles extreme sizes better but loses click interactivity, accessibility, and requires more complex implementation. HTML table with threshold filtering handles 43 classes well enough. |
+| Recharts inline LineChart (sparklines) | SVG `<polyline>` or `<path>` by hand | Recharts is already imported; hand-rolling SVG paths saves ~1KB per sparkline but adds maintenance burden. |
+| Recharts inline LineChart (sparklines) | `react-sparklines` library | Adds a new dependency for something Recharts already supports. |
+| Backend label enrichment on /coordinates | Client-side JOIN via separate annotations fetch | Backend is cleaner (single fetch, no N+1). Frontend JOIN requires fetching all annotations for all samples which is already available in batch-annotations but couples embedding panel to annotation data. |
+
+## Architecture Patterns
+
+### Recommended Change Map
+
+```
+Backend:
+  app/services/reduction_service.py    # MODIFY: get_coordinates JOIN to include GT/pred labels
+  (or) app/routers/embeddings.py       # MODIFY: accept color_mode query param, enrich response
+
+Frontend:
+  types/embedding.ts                   # MODIFY: add gtLabel, predLabel to EmbeddingPoint
+  components/embedding/embedding-scatter.tsx   # MODIFY: accept colorMode prop, use categorical getFillColor
+  components/embedding/embedding-panel.tsx     # MODIFY: add color mode dropdown, pass datasetType, pass colorMode
+  components/stats/confusion-matrix.tsx        # MODIFY: add threshold slider, overflow constraints, max-h/max-w
+  components/stats/evaluation-panel.tsx        # MODIFY: add MostConfusedPairs component, add sparklines to per-class table
+  (or) components/stats/most-confused-pairs.tsx  # NEW: ranked list of most confused (gt, pred) pairs
+  (or) components/stats/per-class-sparkline.tsx  # NEW: inline Recharts sparkline
+  app/datasets/[datasetId]/page.tsx    # MODIFY: pass datasetType to EmbeddingPanel
+```
+
+### Pattern 1: Confusion Matrix Threshold Filtering (POLISH-01)
+
+**What:** Add a slider that filters out confusion matrix cells below a threshold percentage, making high-cardinality matrices readable.
+**When to use:** When label count >= ~15 classes.
+
+The current `ConfusionMatrix` component row-normalizes and renders all cells. At 43 classes, this is 1,849 cells -- most with 0.00 values. The fix:
+
+1. Add a `threshold` state (0.0 to 0.5, default 0.01) with a slider in the matrix header
+2. Off-diagonal cells below threshold render as empty (no text, transparent background); diagonal cells are always shown regardless of threshold
+3. Wrap the table in a container with `max-h-[500px] max-w-full overflow-auto` for scroll
+4. Make cell size smaller for high-cardinality: `min-w-[24px]` instead of `min-w-[32px]` when labels > 20
+5. Truncate long label text with `max-w-[80px] truncate` on row/column headers
+
+```tsx
+// In confusion-matrix.tsx
+const [threshold, setThreshold] = useState(0.01);
+const isHighCardinality = labels.length > 20;
+
+// In the cell render (ri/ci = row/column index; the diagonal is always shown):
+{norm >= threshold || ri === ci ? (
+  <span>{norm.toFixed(2)}</span>
+) : null}
+```
+
+No canvas rendering needed. The HTML table with threshold filtering, scroll overflow, and compact cell sizing handles 43 classes adequately. Rationale: 43x43 = 1,849 `<td>` elements is trivial for the browser DOM. Canvas would only be justified at 200+ classes.
+
+### Pattern 2: Embedding Scatter Color Modes (POLISH-02)
+
+**What:** A dropdown in the embedding toolbar that switches point coloring between: "Default" (uniform blue), "GT Class", "Predicted Class", "Correct/Incorrect".
+**When to use:** Classification datasets with predictions imported.
+
+**Backend change:** Enrich `get_coordinates` to JOIN annotation labels:
+
+```python
+# In reduction_service.py get_coordinates
+SELECT e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path,
+       gt.category_name as gt_label,
+       pred.category_name as pred_label
+FROM embeddings e
+JOIN samples s ON e.sample_id = s.id AND e.dataset_id = s.dataset_id
+LEFT JOIN annotations gt ON gt.sample_id = s.id AND gt.dataset_id = s.dataset_id
+    AND gt.source = 'ground_truth'
+LEFT JOIN annotations pred ON pred.sample_id = s.id AND pred.dataset_id = s.dataset_id
+    AND pred.source != 'ground_truth'
+WHERE e.dataset_id = ? AND e.x IS NOT NULL
+```
+
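+Note: This LEFT JOINs so points without annotations still appear. As written, the query returns one row per matching annotation pair, so a sample with several GT or prediction annotations appears more than once; collapse to one row per sample (e.g. `GROUP BY` with `MIN(...)`, or filter to a single prediction source) before returning. For multi-source predictions, pick the first non-GT source or accept NULL.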
+
+**Frontend change:** The `EmbeddingScatter` component's `getFillColor` accessor switches based on `colorMode`:
+
+```tsx
+type ColorMode = "default" | "gt_class" | "pred_class" | "correctness";
+
+// Categorical palette (20 distinct colors, cycle with modulo for overflow)
+const PALETTE: [number,number,number,number][] = [
+  [31,119,180,200], [255,127,14,200], [44,160,44,200], [214,39,40,200],
+  [148,103,189,200], [140,86,75,200], [227,119,194,200], [127,127,127,200],
+  // ... 12 more ...
+];
+
+getFillColor: (d) => {
+  if (colorMode === "gt_class" && d.gtLabel) {
+    return PALETTE[labelIndex.get(d.gtLabel)! % PALETTE.length];
+  }
+  if (colorMode === "pred_class" && d.predLabel) {
+    return PALETTE[labelIndex.get(d.predLabel)! % PALETTE.length];
+  }
+  if (colorMode === "correctness") {
+    if (!d.predLabel) return [180,180,180,100]; // no prediction: gray
+    return d.gtLabel === d.predLabel
+      ? [44,160,44,200]   // correct: green
+      : [214,39,40,200];  // incorrect: red
+  }
+  return [100,120,220,200]; // default blue
+}
+```
+
+The `labelIndex` is a `Map<string, number>` built from unique labels in the points array, sorted alphabetically for stable color assignment.
+
+**Key concern:** The `EmbeddingPanel` currently receives only `datasetId`. It needs `datasetType` to know whether to show the color mode dropdown. The page already has `dataset?.dataset_type` -- thread it through as a prop.
+
+### Pattern 3: Most-Confused Class Pairs (POLISH-03)
+
+**What:** A ranked list derived from the confusion matrix showing the top-N most confused (actual, predicted) pairs.
+**When to use:** Always shown below/beside the confusion matrix when classification evaluation data is available.
+
+This is a pure frontend derivation -- no backend change needed. The confusion matrix and labels are already in `ClassificationEvaluationResponse`.
+
+```tsx
+function getMostConfusedPairs(
+  matrix: number[][],
+  labels: string[],
+  topN: number = 10,
+): { actual: string; predicted: string; count: number; pct: number }[] {
+  const pairs: { actual: string; predicted: string; count: number; pct: number }[] = [];
+  for (let i = 0; i < matrix.length; i++) {
+    const rowSum = matrix[i].reduce((a, b) => a + b, 0);
+    for (let j = 0; j < matrix[i].length; j++) {
+      if (i === j) continue; // skip diagonal (correct predictions)
+      if (matrix[i][j] === 0) continue;
+      pairs.push({
+        actual: labels[i],
+        predicted: labels[j],
+        count: matrix[i][j],
+        pct: rowSum > 0 ? matrix[i][j] / rowSum : 0,
+      });
+    }
+  }
+  pairs.sort((a, b) => b.count - a.count);
+  return pairs.slice(0, topN);
+}
+```
+
+Render as a compact table: rank, actual class, arrow, predicted class, count, percentage. Clicking a row could trigger the existing confusion cell click-to-filter behavior.
+
+### Pattern 4: Per-Class Sparklines with Color-Coded Thresholds (POLISH-04)
+
+**What:** Add a small inline sparkline to each row of the per-class metrics table, with color coding: green (F1 >= 0.8), yellow (0.5 <= F1 < 0.8), red (F1 < 0.5).
+**When to use:** Always shown in the classification per-class table.
+
+The "sparkline" for per-class metrics is a bit ambiguous since each class has a single F1 value, not a time series. Two interpretations:
+
+**Interpretation A: Per-class metric bar (precision/recall/F1 as a small bar chart)**
+A tiny 3-bar chart (P, R, F1) for each class, giving a visual summary per row. This is more useful than a line sparkline for single-point-in-time data.
+
+**Interpretation B: Confidence-threshold sweep sparkline**
+Show how F1 varies as confidence threshold changes. This requires computing F1 at multiple thresholds (backend change needed -- return F1 at e.g. 5 threshold values).
+
+**Recommendation: Interpretation A** is simpler and requires no backend changes. Three small bars (P, R, F1) using Recharts `BarChart` with hidden axes, color-coded by the F1 threshold:
+
+```tsx
+function PerClassSparkline({ precision, recall, f1 }: { precision: number; recall: number; f1: number }) {
+  const color = f1 >= 0.8 ? "#22c55e" : f1 >= 0.5 ? "#eab308" : "#ef4444";
+  const data = [
+    { name: "P", value: precision },
+    { name: "R", value: recall },
+    { name: "F1", value: f1 },
+  ];
+  return (
+    <BarChart width={48} height={20} data={data}>
+      <Bar dataKey="value" fill={color} radius={1} />
+    </BarChart>
+  );
+}
+```
+
+Alternatively, a simpler approach: just a colored horizontal bar representing F1 (0-1 scale) with background showing "full" (1.0). No Recharts needed -- pure CSS:
+
+```tsx
+<div className="w-16 h-3 bg-zinc-200 dark:bg-zinc-700 rounded-full overflow-hidden">
+  <div
+    className="h-full rounded-full"
+    style={{
+      width: `${f1 * 100}%`,
+      backgroundColor: f1 >= 0.8 ? "#22c55e" : f1 >= 0.5 ? "#eab308" : "#ef4444",
+    }}
+  />
+</div>
+```
+
+The CSS bar is simpler, zero-dependency, and arguably clearer for a single metric. **Recommend the CSS bar approach** unless the user specifically wants a multi-metric sparkline.
+
+### Anti-Patterns to Avoid
+
+- **Canvas confusion matrix for 43 classes:** Canvas loses click interactivity, text rendering quality, and accessibility. HTML table with threshold filtering is adequate for this scale.
+- **Fetching all annotations separately for embedding coloring:** This creates an N+1 or large-batch problem. Better to enrich the `/coordinates` endpoint with a JOIN.
+- **Computing most-confused pairs on the backend:** The confusion matrix is already transmitted. Deriving pairs client-side avoids a new endpoint and keeps the backend simple.
+- **Using ResponsiveContainer for sparklines in table cells:** ResponsiveContainer requires a parent with explicit dimensions. In table cells, use fixed `width`/`height` props on the chart directly.
+
+## Don't Hand-Roll
+
+| Problem | Don't Build | Use Instead | Why |
+|---------|-------------|-------------|-----|
+| Categorical color palette | Dynamic color generation HSL math | Static 20-color palette array | Reproducible, visually distinct, no computation |
+| Sparkline chart | Custom SVG `<path>` calculation | Recharts BarChart or CSS bar | Already in dependency tree, consistent styling |
+| Most-confused pairs | New backend endpoint | Client-side derivation from confusion matrix | Data already on client, O(N^2) is trivial for N<=50 |
+| Overflow scroll on confusion matrix | Custom virtual scrolling | CSS `overflow-auto` with `max-h`/`max-w` | 43x43 DOM elements is trivial, no virtualization needed |
+
+**Key insight:** All four requirements are UI refinements on data that is already available in the frontend. The only backend change needed is enriching embedding coordinates with annotation labels for POLISH-02.
+
+## Common Pitfalls
+
+### Pitfall 1: Threshold Slider Hides All Cells
+**What goes wrong:** If the user sets the confusion matrix threshold too high, all off-diagonal cells disappear, making the matrix look like nothing is wrong.
+**Why it happens:** Most off-diagonal values in a well-performing model are very small fractions (0.01-0.05).
+**How to avoid:** Set a sensible default (0.01 = 1%), show a "N cells hidden" count, and never hide diagonal cells regardless of threshold.
+**Warning signs:** Confusion matrix appears nearly empty with all off-diagonal cells blank.
+
+### Pitfall 2: Embedding Color Mode Without Predictions
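+**What goes wrong:** User selects "Predicted Class" or "Correct/Incorrect" color mode but no predictions are imported. All points collapse to a single fallback color (gray in "Correct/Incorrect" mode, the default blue in "Predicted Class" mode).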
+**Why it happens:** The `predLabel` field is null for all points when no predictions exist.
+**How to avoid:** Disable "Predicted Class" and "Correct/Incorrect" options in the dropdown when no predictions exist. Check for the presence of prediction sources (same `hasPredictions` logic used in stats dashboard).
+**Warning signs:** All points are gray/identical color in a non-"Default" mode.
+
+### Pitfall 3: Too Many Classes for Color Palette
+**What goes wrong:** With 43+ classes, the 20-color palette cycles and multiple classes share the same color, reducing the scatter plot's usefulness.
+**Why it happens:** Human color discrimination is limited to ~20 distinct hues.
+**How to avoid:** Accept this limitation and mitigate: (1) show a legend that maps colors to classes (scrollable), (2) use hover tooltip to show the exact class name, (3) for 20+ classes, recommend "Correct/Incorrect" mode which only needs 3 colors (correct/incorrect/no-prediction).
+**Warning signs:** Multiple visually distinct clusters in the scatter plot share the same color.
+
+### Pitfall 4: Stale Embedding Coordinates After Prediction Import
+**What goes wrong:** User imports predictions, switches to Embeddings tab, but coordinates don't include the new `predLabel` because the TanStack Query cache is stale (staleTime: Infinity).
+**Why it happens:** Embedding coordinates query uses `staleTime: Infinity` -- it never refetches automatically.
+**How to avoid:** After prediction import completes, invalidate the `embedding-coordinates` query key. The prediction import dialog already invalidates several query keys on success -- add `embedding-coordinates` to that list.
+**Warning signs:** Color mode shows all points as "no prediction" (gray) even after importing predictions.
+
+### Pitfall 5: Multiple Prediction Sources Per Sample
+**What goes wrong:** If a sample has predictions from multiple sources (e.g., "model_v1" and "model_v2"), the JOIN in get_coordinates returns duplicate rows per sample.
+**Why it happens:** LEFT JOIN on annotations with source != 'ground_truth' matches multiple rows.
+**How to avoid:** Either: (1) use a subquery with LIMIT 1 per sample, or (2) accept a `source` query parameter on the coordinates endpoint to filter to one prediction source, or (3) pick the first non-GT source with ROW_NUMBER(). Option (2) is cleanest -- matches how the evaluation panel already handles source selection.
+**Warning signs:** Duplicate points in the scatter plot (same x,y but different pred labels).
+
+### Pitfall 6: Classification-Only Multi-Label GT in Embeddings
+**What goes wrong:** If a sample has multiple GT annotations (multi-label), the coordinates JOIN returns duplicate rows.
+**Why it happens:** Same as Pitfall 5 but for GT side.
+**How to avoid:** Use MIN(gt.category_name) or GROUP BY to collapse to one GT label per sample, matching the pattern in `compute_classification_evaluation`.
+**Warning signs:** Point count in scatter differs from embedding count shown in toolbar.
+
+## Code Examples
+
+### Confusion Matrix with Threshold Filter
+
+```tsx
+// confusion-matrix.tsx additions
+const [threshold, setThreshold] = useState(0.01);
+
+// In the header area:
+<div className="flex items-center gap-2">
+  <label className="text-xs text-zinc-500">Min:</label>
+  <input
+    type="range" min={0} max={0.5} step={0.01}
+    value={threshold}
+    onChange={(e) => setThreshold(parseFloat(e.target.value))}
+    className="w-20 accent-blue-500"
+  />
+  <span className="text-xs font-mono text-zinc-400">{(threshold*100).toFixed(0)}%</span>
+</div>
+
+// Wrap table in scrollable container:
+<div className="overflow-auto max-h-[500px]">
+  <table>...</table>
+</div>
+
+// Cell rendering:
+const showValue = norm >= threshold || ri === ci; // always show diagonal
+```
+
+### Enriched Coordinates Query (Backend)
+
+```python
+# reduction_service.py get_coordinates -- enriched for classification
+def get_coordinates(self, dataset_id: str, cursor, source: str | None = None) -> list[dict]:
+    source_clause = "AND pred.source = ?" if source else ""
+    # Placeholders bind in SQL text order: pred.source (in the JOIN) comes
+    # before e.dataset_id (in the WHERE), so source must be listed first.
+    params = [source, dataset_id] if source else [dataset_id]
+
+    result = cursor.execute(f"""
+        SELECT e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path,
+               MIN(gt.category_name) as gt_label,
+               MIN(pred.category_name) as pred_label
+        FROM embeddings e
+        JOIN samples s ON e.sample_id = s.id AND e.dataset_id = s.dataset_id
+        LEFT JOIN annotations gt ON gt.sample_id = s.id AND gt.dataset_id = s.dataset_id
+            AND gt.source = 'ground_truth'
+        LEFT JOIN annotations pred ON pred.sample_id = s.id AND pred.dataset_id = s.dataset_id
+            AND pred.source != 'ground_truth' {source_clause}
+        WHERE e.dataset_id = ? AND e.x IS NOT NULL
+        GROUP BY e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path
+        ORDER BY e.sample_id
+    """, params).fetchall()
+
+    return [
+        {
+            "sampleId": r[0], "x": r[1], "y": r[2],
+            "fileName": r[3], "thumbnailPath": r[4],
+            "gtLabel": r[5], "predLabel": r[6],
+        }
+        for r in result
+    ]
+```
+
+### Color Mode Dropdown and Palette
+
+```tsx
+// embedding-panel.tsx toolbar addition
+const COLOR_MODES = [
+  { value: "default", label: "Default" },
+  { value: "gt_class", label: "GT Class" },
+  { value: "pred_class", label: "Predicted Class" },
+  { value: "correctness", label: "Correct / Incorrect" },
+] as const;
+
+<select
+  value={colorMode}
+  onChange={(e) => setColorMode(e.target.value as ColorMode)}
+  className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-xs px-2 py-1"
+>
+  {COLOR_MODES.map((m) => (
+    <option key={m.value} value={m.value} disabled={
+      !hasPredictions && (m.value === "pred_class" || m.value === "correctness")
+    }>
+      {m.label}
+    </option>
+  ))}
+</select>
+```
+
+### CSS F1 Bar (Sparkline Alternative)
+
+```tsx
+function F1Bar({ f1 }: { f1: number }) {
+  const color = f1 >= 0.8 ? "bg-green-500" : f1 >= 0.5 ? "bg-yellow-500" : "bg-red-500";
+  return (
+    <div className="w-16 h-2.5 bg-zinc-200 dark:bg-zinc-700 rounded-full overflow-hidden">
+      <div className={`h-full rounded-full ${color}`} style={{ width: `${f1 * 100}%` }} />
+    </div>
+  );
+}
+```
+
+## State of the Art
+
+| Old Approach | Current Approach | When Changed | Impact |
+|--------------|------------------|--------------|--------|
+| Full HTML table for all cells | Threshold-filtered table with overflow scroll | Phase 17 | Readable at 43+ classes |
+| Uniform blue scatter points | Categorical coloring by class/correctness | Phase 17 | Instant visual insight on embedding clusters |
+| Raw confusion matrix only | Most-confused pairs summary | Phase 17 | Actionable: top error modes at a glance |
+| Numbers-only per-class table | Color-coded F1 bars | Phase 17 | Scan 43 classes in seconds |
+
+**Deprecated/outdated:**
+- Nothing deprecated. All enhancements build on Phase 16 output.
+
+## Open Questions
+
+1. **Sparkline interpretation: single-metric bar vs multi-threshold sweep**
+   - What we know: Each class has one P, R, F1 value at the current confidence threshold. A "sparkline" traditionally implies a time-series line.
+   - What's unclear: Does the user want a single F1 bar per class, or a mini-chart showing how F1 varies across confidence thresholds?
+   - Recommendation: Implement a color-coded F1 bar (green/yellow/red) first. If confidence-sweep sparklines are desired, they require a backend change to return F1 at multiple thresholds per class (more complex). Defer to a follow-up.
+
+2. **Embedding color legend visibility at 43+ classes**
+   - What we know: A legend for 43 classes takes significant vertical space and many colors are visually similar.
+   - What's unclear: Should the legend be always-visible, collapsed/expandable, or omitted in favor of hover tooltips?
+   - Recommendation: Show a scrollable legend panel (max-h with overflow) for GT/Pred class modes. For "Correct/Incorrect" mode, show a simple 3-item legend (correct/incorrect/no prediction).
+
+3. **Prediction source selection for embedding coloring**
+   - What we know: Evaluation panel has a source dropdown. Embedding panel does not.
+   - What's unclear: Should embedding coloring respect a selected prediction source, or always use the first available source?
+   - Recommendation: Add an optional source query param to the coordinates endpoint. Default to first non-GT source. If the user has multiple prediction sources, they can switch via a dropdown in the embedding toolbar (add only if multiple sources exist).
+
+## Sources
+
+### Primary (HIGH confidence)
+- Codebase inspection: `frontend/src/components/stats/confusion-matrix.tsx` (current HTML table implementation, 138 lines)
+- Codebase inspection: `frontend/src/components/embedding/embedding-scatter.tsx` (deck.gl ScatterplotLayer, getFillColor accessor, updateTriggers pattern)
+- Codebase inspection: `frontend/src/components/embedding/embedding-panel.tsx` (toolbar, lasso toggle, hover state)
+- Codebase inspection: `frontend/src/types/embedding.ts` (EmbeddingPoint interface -- no gtLabel/predLabel yet)
+- Codebase inspection: `frontend/src/components/stats/evaluation-panel.tsx` (ClassificationMetricsCards, ClassificationPerClassTable)
+- Codebase inspection: `app/services/reduction_service.py` (get_coordinates SQL, JOIN samples only)
+- Codebase inspection: `app/services/classification_evaluation.py` (confusion matrix computation, per-class metrics)
+- Codebase inspection: `frontend/src/hooks/use-embeddings.ts` (staleTime: Infinity for coordinates)
+- Codebase inspection: `package.json` (Recharts ^3.7.0, deck.gl ^9.2.6)
+
+### Secondary (MEDIUM confidence)
+- deck.gl ScatterplotLayer documentation: `getFillColor` accessor supports per-point RGBA arrays with `updateTriggers` for reactive updates
+- Recharts BarChart/LineChart support fixed-size rendering via `width`/`height` props without ResponsiveContainer
+
+### Tertiary (LOW confidence)
+- Canvas vs HTML table performance for large matrices: Based on general web performance knowledge. HTML table with 1,849 cells (43x43) is well within browser capabilities. Canvas would be warranted at ~200+ classes (40,000+ cells).
+
+## Metadata
+
+**Confidence breakdown:**
+- Standard stack: HIGH - No new dependencies, all existing libraries sufficient
+- Architecture: HIGH - Clear extension patterns on existing components, minimal backend change (one SQL JOIN enrichment)
+- Pitfalls: HIGH - Identified from direct codebase inspection (threshold UX, stale cache, multi-source JOIN, palette limits)
+
+**Research date:** 2026-02-18
+**Valid until:** 2026-03-18 (internal codebase patterns, stable)

From 2893b39cde5addade339423c9aed8fff7aa87e43 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:49:54 -0500
Subject: [PATCH 29/38] docs(17): create phase plan

---
 .planning/ROADMAP.md                          |   7 +-
 .../17-classification-polish/17-01-PLAN.md    | 168 ++++++++++
 .../17-classification-polish/17-02-PLAN.md    | 307 ++++++++++++++++++
 3 files changed, 480 insertions(+), 2 deletions(-)
 create mode 100644 .planning/phases/17-classification-polish/17-01-PLAN.md
 create mode 100644 .planning/phases/17-classification-polish/17-02-PLAN.md

diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 3ffef4c..581ea9b 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -117,7 +117,10 @@ Plans:
   2. User can color the embedding scatter plot by GT class, predicted class, or correct/incorrect status
   3. User sees a ranked list of most-confused class pairs derived from the confusion matrix
   4. User sees per-class performance sparklines with color-coded thresholds (green/yellow/red) in the metrics table
-**Plans**: TBD
+**Plans**: 2 plans
+Plans:
+- [ ] 17-01-PLAN.md -- Confusion matrix threshold/overflow, most-confused pairs, F1 bars in per-class table
+- [ ] 17-02-PLAN.md -- Embedding scatter color modes (GT class, predicted class, correct/incorrect)
 
 ## Progress
 
@@ -139,4 +142,4 @@ Plans:
 | 14. Per-Annotation Triage | v1.1 | 3/3 | Complete | 2026-02-13 |
 | 15. Classification Ingestion & Display | v1.2 | 2/2 | Complete | 2026-02-18 |
 | 16. Classification Evaluation | v1.2 | 2/2 | Complete | 2026-02-18 |
-| 17. Classification Polish | v1.2 | 0/TBD | Not started | - |
+| 17. Classification Polish | v1.2 | 0/2 | Not started | - |
diff --git a/.planning/phases/17-classification-polish/17-01-PLAN.md b/.planning/phases/17-classification-polish/17-01-PLAN.md
new file mode 100644
index 0000000..bdda008
--- /dev/null
+++ b/.planning/phases/17-classification-polish/17-01-PLAN.md
@@ -0,0 +1,168 @@
+---
+phase: 17-classification-polish
+plan: 01
+type: execute
+wave: 1
+depends_on: []
+files_modified:
+  - frontend/src/components/stats/confusion-matrix.tsx
+  - frontend/src/components/stats/evaluation-panel.tsx
+autonomous: true
+
+must_haves:
+  truths:
+    - "Confusion matrix is readable at 43+ classes with threshold filtering hiding low-value cells"
+    - "Confusion matrix scrolls within constrained container at high cardinality"
+    - "User sees a ranked list of most-confused class pairs below the confusion matrix"
+    - "User sees color-coded F1 bars (green/yellow/red) in the per-class metrics table"
+  artifacts:
+    - path: "frontend/src/components/stats/confusion-matrix.tsx"
+      provides: "Threshold slider, compact cells, overflow scroll container"
+      contains: "threshold"
+    - path: "frontend/src/components/stats/evaluation-panel.tsx"
+      provides: "MostConfusedPairs component, F1Bar component in ClassificationPerClassTable"
+      contains: "MostConfusedPairs"
+  key_links:
+    - from: "frontend/src/components/stats/evaluation-panel.tsx"
+      to: "frontend/src/components/stats/confusion-matrix.tsx"
+      via: "ConfusionMatrix component usage"
+      pattern: "<ConfusionMatrix"
+---
+
+<objective>
+Add confusion matrix threshold filtering with overflow scroll for 43+ class readability, a most-confused class pairs summary derived client-side from the confusion matrix, and color-coded F1 bars in the classification per-class metrics table.
+
+Purpose: Make classification evaluation production-ready for high-cardinality datasets where raw numbers are hard to scan and full confusion matrices are unreadable.
+Output: Enhanced confusion-matrix.tsx with threshold slider and compact mode, evaluation-panel.tsx with MostConfusedPairs and F1Bar inline components.
+</objective>
+
+<execution_context>
+@/Users/ortizeg/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/ortizeg/.claude/get-shit-done/templates/summary.md
+</execution_context>
+
+<context>
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/17-classification-polish/17-RESEARCH.md
+@frontend/src/components/stats/confusion-matrix.tsx
+@frontend/src/components/stats/evaluation-panel.tsx
+</context>
+
+<tasks>
+
+<task type="auto">
+  <name>Task 1: Confusion matrix threshold filtering and overflow scroll</name>
+  <files>frontend/src/components/stats/confusion-matrix.tsx</files>
+  <action>
+Enhance the ConfusionMatrix component for high-cardinality (43+ classes) readability:
+
+1. **Threshold slider state:** Add `const [threshold, setThreshold] = useState(0.01)` inside the component. Render a slider in the header area next to the title:
+   - Range: 0 to 0.5, step 0.01
+   - Show current value as percentage: `{(threshold*100).toFixed(0)}%`
+   - Label: "Min:"
+   - Also show a count of hidden cells: `{hiddenCount} cells hidden`
+
+2. **Cell rendering with threshold:** In the cell render loop, only show the numeric value if `norm >= threshold || ri === ci` (always show diagonal cells regardless of threshold). When a cell is below threshold AND not diagonal, render the `<td>` with transparent background and no text content.
+
+3. **Overflow scroll container:** Wrap the table in a container with `overflow-auto max-h-[500px]` so large matrices scroll vertically and horizontally.
+
+4. **Compact mode for high cardinality:** When `labels.length > 20`:
+   - Use `min-w-[24px]` instead of `min-w-[32px]` on cells
+   - Add `max-w-[80px] truncate` to row and column label text
+   - Use `text-[10px]` for cell values instead of default text-xs
+
+5. **"N cells hidden" counter:** Compute `hiddenCount` by counting off-diagonal cells where `norm > 0 && norm < threshold`. Display next to the slider.
+
+Keep the existing cellColor function, onCellClick behavior, row-normalization, axis titles, and rotated column headers exactly as they are.
+  </action>
+  <verify>
+Run `cd frontend && npx tsc --noEmit` to confirm no type errors. Visually inspect that the component renders and that the threshold slider appears in the header area next to the title.
+  </verify>
+  <done>
+Confusion matrix has a threshold slider (0-50%, default 1%) that hides low-value off-diagonal cells. Diagonal cells always visible. Table scrolls within a max-h-[500px] container. High-cardinality (>20 classes) uses compact cell sizing and truncated labels. Hidden cell count displayed.
+  </done>
+</task>
+
+<task type="auto">
+  <name>Task 2: Most-confused pairs summary and F1 bars in per-class table</name>
+  <files>frontend/src/components/stats/evaluation-panel.tsx</files>
+  <action>
+Add two inline components to evaluation-panel.tsx for the classification evaluation layout:
+
+**1. MostConfusedPairs component** (inline, above the per-class table in the classification layout):
+
+```tsx
+function MostConfusedPairs({
+  matrix,
+  labels,
+  onPairClick,
+}: {
+  matrix: number[][];
+  labels: string[];
+  onPairClick?: (actual: string, predicted: string) => void;
+}) {
+```
+
+- Derive the top 10 most-confused (actual, predicted) pairs from the confusion matrix.
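+- For each pair: extract raw count `matrix[i][j]` where `i !== j` (row `i` is the actual class, column `j` the predicted class), and compute its share as `matrix[i][j] / rowSum`, where `rowSum` is the sum of row `i`; display that share as a percentage.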
+- Sort by raw count descending, take top 10.
+- Render as a compact card with title "Most Confused Pairs" and a table:
+  - Columns: Rank (#), Actual, arrow (unicode right arrow), Predicted, Count, Pct
+  - Each row is clickable if `onPairClick` is provided (reuse the same handleCellClick logic for confusion cell filtering).
+  - Style: rounded-lg border, same card styling as other evaluation cards.
+- Skip pairs with count === 0. If no off-diagonal errors exist, show "No misclassifications found".
+
+**2. F1Bar component** (inline, used in ClassificationPerClassTable):
+
+```tsx
+function F1Bar({ f1 }: { f1: number }) {
+  const color = f1 >= 0.8 ? "bg-green-500" : f1 >= 0.5 ? "bg-yellow-500" : "bg-red-500";
+  return (
+    <div className="w-16 h-2.5 bg-zinc-200 dark:bg-zinc-700 rounded-full overflow-hidden">
+      <div className={`h-full rounded-full ${color}`} style={{ width: `${f1 * 100}%` }} />
+    </div>
+  );
+}
+```
+
+- Pure CSS bar, no Recharts dependency. Width proportional to F1 (0-1 scale). Green >= 0.8, yellow >= 0.5, red < 0.5.
+
+**3. Integrate into ClassificationPerClassTable:**
+- Add a "Performance" column header after Support.
+- In each row, render `<F1Bar f1={m.f1} />` in the new column.
+
+**4. Integrate MostConfusedPairs into classification layout:**
+- Place `<MostConfusedPairs>` between the ConfusionMatrix and the ClassificationPerClassTable in the classification evaluation JSX.
+- Pass `matrix={classData.confusion_matrix}`, `labels={classData.confusion_matrix_labels}`, `onPairClick={handleCellClick}`.
+- Wrap in the same loading skeleton pattern as the other sections.
+  </action>
+  <verify>
+Run `cd frontend && npx tsc --noEmit` to confirm no type errors. Check that the classification evaluation panel now has: metric cards, confusion matrix, most-confused pairs, and per-class table with F1 bars.
+  </verify>
+  <done>
+Classification evaluation panel shows a "Most Confused Pairs" ranked list (top 10) derived from the confusion matrix with clickable rows that filter the grid. Per-class table has a new "Performance" column with color-coded F1 bars (green/yellow/red).
+  </done>
+</task>
+
+</tasks>
+
+<verification>
+1. `cd frontend && npx tsc --noEmit` passes with no errors
+2. Confusion matrix has threshold slider visible in header
+3. MostConfusedPairs component renders between confusion matrix and per-class table
+4. F1 bars appear in the per-class metrics table Performance column
+5. Detection evaluation layout is completely unchanged
+</verification>
+
+<success_criteria>
+- Confusion matrix readable at 43+ classes via threshold filtering and scroll overflow
+- Most-confused pairs summary visible and clickable (filters grid)
+- F1 bars with green/yellow/red thresholds visible in per-class table
+- No regressions to detection evaluation
+</success_criteria>
+
+<output>
+After completion, create `.planning/phases/17-classification-polish/17-01-SUMMARY.md`
+</output>
diff --git a/.planning/phases/17-classification-polish/17-02-PLAN.md b/.planning/phases/17-classification-polish/17-02-PLAN.md
new file mode 100644
index 0000000..db5ba85
--- /dev/null
+++ b/.planning/phases/17-classification-polish/17-02-PLAN.md
@@ -0,0 +1,307 @@
+---
+phase: 17-classification-polish
+plan: 02
+type: execute
+wave: 1
+depends_on: []
+files_modified:
+  - app/services/reduction_service.py
+  - app/routers/embeddings.py
+  - frontend/src/types/embedding.ts
+  - frontend/src/components/embedding/embedding-scatter.tsx
+  - frontend/src/components/embedding/embedding-panel.tsx
+  - frontend/src/app/datasets/[datasetId]/page.tsx
+autonomous: true
+
+must_haves:
+  truths:
+    - "User can select color mode (Default, GT Class, Predicted Class, Correct/Incorrect) in embedding toolbar"
+    - "Embedding scatter points change color based on selected color mode"
+    - "Color mode dropdown options for Predicted Class and Correct/Incorrect are disabled when no predictions exist"
+    - "Backend coordinates endpoint returns gtLabel and predLabel per point"
+  artifacts:
+    - path: "app/services/reduction_service.py"
+      provides: "Enriched get_coordinates with GT/pred label JOINs"
+      contains: "gt_label"
+    - path: "frontend/src/types/embedding.ts"
+      provides: "gtLabel and predLabel fields on EmbeddingPoint"
+      contains: "gtLabel"
+    - path: "frontend/src/components/embedding/embedding-scatter.tsx"
+      provides: "colorMode-driven getFillColor with categorical palette"
+      contains: "colorMode"
+    - path: "frontend/src/components/embedding/embedding-panel.tsx"
+      provides: "Color mode dropdown in toolbar"
+      contains: "ColorMode"
+  key_links:
+    - from: "frontend/src/components/embedding/embedding-panel.tsx"
+      to: "frontend/src/components/embedding/embedding-scatter.tsx"
+      via: "colorMode prop"
+      pattern: "colorMode"
+    - from: "app/services/reduction_service.py"
+      to: "frontend/src/types/embedding.ts"
+      via: "API response shape includes gtLabel, predLabel"
+      pattern: "gtLabel"
+    - from: "frontend/src/app/datasets/[datasetId]/page.tsx"
+      to: "frontend/src/components/embedding/embedding-panel.tsx"
+      via: "datasetType prop threading"
+      pattern: "datasetType"
+---
+
+<objective>
+Add embedding scatter color modes (GT Class, Predicted Class, Correct/Incorrect) by enriching the backend coordinates endpoint with annotation labels and building a color mode dropdown with categorical palette in the frontend.
+
+Purpose: Let users instantly see how embedding clusters correlate with class labels and prediction correctness, surfacing misclassification patterns spatially.
+Output: Enriched coordinates API, color mode dropdown in embedding toolbar, categorical coloring in scatter plot.
+</objective>
+
+<execution_context>
+@/Users/ortizeg/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/ortizeg/.claude/get-shit-done/templates/summary.md
+</execution_context>
+
+<context>
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/17-classification-polish/17-RESEARCH.md
+@app/services/reduction_service.py
+@app/routers/embeddings.py
+@frontend/src/types/embedding.ts
+@frontend/src/components/embedding/embedding-scatter.tsx
+@frontend/src/components/embedding/embedding-panel.tsx
+@frontend/src/app/datasets/[datasetId]/page.tsx
+@frontend/src/hooks/use-embeddings.ts
+</context>
+
+<tasks>
+
+<task type="auto">
+  <name>Task 1: Backend coordinates enrichment with GT/pred labels</name>
+  <files>
+    app/services/reduction_service.py
+    app/routers/embeddings.py
+    frontend/src/types/embedding.ts
+  </files>
+  <action>
+**1. Enrich `get_coordinates` in reduction_service.py:**
+
+Modify the SQL query to LEFT JOIN annotations for GT and predicted labels. Use MIN() + GROUP BY to handle multi-annotation edge cases (collapse to one label per sample):
+
+```python
+def get_coordinates(self, dataset_id: str, cursor) -> list[dict]:
+    result = cursor.execute(
+        """
+        SELECT e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path,
+               MIN(gt.category_name) as gt_label,
+               MIN(pred.category_name) as pred_label
+        FROM embeddings e
+        JOIN samples s ON e.sample_id = s.id AND e.dataset_id = s.dataset_id
+        LEFT JOIN annotations gt ON gt.sample_id = s.id AND gt.dataset_id = s.dataset_id
+            AND gt.source = 'ground_truth'
+        LEFT JOIN annotations pred ON pred.sample_id = s.id AND pred.dataset_id = s.dataset_id
+            AND pred.source != 'ground_truth'
+        WHERE e.dataset_id = ? AND e.x IS NOT NULL
+        GROUP BY e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path
+        ORDER BY e.sample_id
+        """,
+        [dataset_id],
+    ).fetchall()
+    return [
+        {
+            "sampleId": r[0],
+            "x": r[1],
+            "y": r[2],
+            "fileName": r[3],
+            "thumbnailPath": r[4],
+            "gtLabel": r[5],
+            "predLabel": r[6],
+        }
+        for r in result
+    ]
+```
+
+Keep the method signature the same (dataset_id, cursor). The LEFT JOINs ensure points without annotations still appear (gtLabel/predLabel will be null). The MIN() + GROUP BY collapses the rows fanned out by multi-source predictions or multi-label GT into a single row per sample, keeping the lexicographically smallest label on each side.
+
+The router endpoint `get_coordinates` in embeddings.py needs NO changes -- it already returns `list[dict]` and just calls `reduction_service.get_coordinates(dataset_id, cursor)`.
+
+**2. Update EmbeddingPoint type in frontend/src/types/embedding.ts:**
+
+Add two optional fields to the `EmbeddingPoint` interface:
+
+```typescript
+export interface EmbeddingPoint {
+  sampleId: string;
+  x: number;
+  y: number;
+  fileName: string;
+  thumbnailPath: string | null;
+  gtLabel?: string | null;
+  predLabel?: string | null;
+}
+```
+
+Make them optional (`?`) so existing code that doesn't use them continues to work without changes.
+  </action>
+  <verify>
+Run `cd frontend && npx tsc --noEmit` to confirm the type change doesn't break existing usages.
+Run `python -c "import app.services.reduction_service; print('OK')"` to confirm the module loads.
+  </verify>
+  <done>
+Backend coordinates endpoint returns gtLabel and predLabel per point (null when no annotations exist). EmbeddingPoint type updated with optional gtLabel/predLabel fields. No breakage to existing consumers.
+  </done>
+</task>
+
+<task type="auto">
+  <name>Task 2: Embedding scatter color mode dropdown and categorical coloring</name>
+  <files>
+    frontend/src/components/embedding/embedding-scatter.tsx
+    frontend/src/components/embedding/embedding-panel.tsx
+    frontend/src/app/datasets/[datasetId]/page.tsx
+  </files>
+  <action>
+**1. Add colorMode prop to EmbeddingScatter (embedding-scatter.tsx):**
+
+Add a `colorMode` prop to EmbeddingScatterProps:
+
+```typescript
+type ColorMode = "default" | "gt_class" | "pred_class" | "correctness";
+
+interface EmbeddingScatterProps {
+  points: EmbeddingPoint[];
+  onHover?: (...) => void;
+  selectedIds?: string[] | null;
+  deckRef?: React.RefObject<DeckGLRef | null>;
+  colorMode?: ColorMode;
+}
+```
+
+Export the `ColorMode` type for use in embedding-panel.tsx.
+
+Define a categorical color palette (20 distinct RGBA colors, based on Tableau 20; with more than 20 labels the colors repeat via the modulo lookup below):
+
+```typescript
+const CATEGORICAL_PALETTE: [number,number,number,number][] = [
+  [31,119,180,200], [255,127,14,200], [44,160,44,200], [214,39,40,200],
+  [148,103,189,200], [140,86,75,200], [227,119,194,200], [127,127,127,200],
+  [188,189,34,200], [23,190,207,200], [174,199,232,200], [255,187,120,200],
+  [152,223,138,200], [255,152,150,200], [197,176,213,200], [196,156,148,200],
+  [247,182,210,200], [199,199,199,200], [219,219,141,200], [158,218,229,200],
+];
+```
+
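+Build a `labelIndex` Map inside the useMemo for layers: collect the unique `gtLabel` and `predLabel` values from points into one set (sorted alphabetically) and map each to an index. Sharing one map keeps a class the same color in both GT Class and Predicted Class modes and ensures stable color assignment.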
+
+Update the `getFillColor` accessor in the ScatterplotLayer:
+
+```typescript
+getFillColor: (d) => {
+  // Lasso selection overrides color mode
+  if (selectedSet !== null) {
+    return selectedSet.has(d.sampleId)
+      ? [99, 102, 241, 230]
+      : [180, 180, 180, 80];
+  }
+  if (colorMode === "gt_class" && d.gtLabel) {
+    return CATEGORICAL_PALETTE[labelIndex.get(d.gtLabel)! % CATEGORICAL_PALETTE.length];
+  }
+  if (colorMode === "pred_class" && d.predLabel) {
+    return CATEGORICAL_PALETTE[labelIndex.get(d.predLabel)! % CATEGORICAL_PALETTE.length];
+  }
+  if (colorMode === "correctness") {
+    if (!d.predLabel) return [180, 180, 180, 100] as [number,number,number,number]; // no prediction: gray
+    return d.gtLabel === d.predLabel
+      ? [44, 160, 44, 200] as [number,number,number,number]   // correct: green
+      : [214, 39, 40, 200] as [number,number,number,number];  // incorrect: red
+  }
+  return [100, 120, 220, 200]; // default blue
+}
+```
+
+Add `colorMode` to the `updateTriggers.getFillColor` array so deck.gl re-renders when color mode changes.
+
+Add `colorMode` to the useMemo dependency array for layers.
+
+**2. Add color mode dropdown to EmbeddingPanel (embedding-panel.tsx):**
+
+Add `datasetType` prop to EmbeddingPanelProps:
+
+```typescript
+interface EmbeddingPanelProps {
+  datasetId: string;
+  datasetType?: string;
+}
+```
+
+Add color mode state: `const [colorMode, setColorMode] = useState<ColorMode>("default");`
+
+Import `ColorMode` from embedding-scatter.tsx.
+
+Determine `hasPredictions` from the coordinates data: `const hasPredictions = useMemo(() => coordinates?.some(p => p.predLabel != null) ?? false, [coordinates]);`
+
+In the toolbar (between the "N points" label and the lasso button), add the color mode dropdown (only visible when coordinates are loaded):
+
+```tsx
+<select
+  value={colorMode}
+  onChange={(e) => setColorMode(e.target.value as ColorMode)}
+  className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-xs px-2 py-1 text-zinc-900 dark:text-zinc-100"
+>
+  <option value="default">Default</option>
+  <option value="gt_class">GT Class</option>
+  <option value="pred_class" disabled={!hasPredictions}>Predicted Class</option>
+  <option value="correctness" disabled={!hasPredictions}>Correct / Incorrect</option>
+</select>
+```
+
+Pass `colorMode` to the `<EmbeddingScatter>` component.
+
+**3. Thread datasetType to EmbeddingPanel (page.tsx):**
+
+In `frontend/src/app/datasets/[datasetId]/page.tsx`, find where `<EmbeddingPanel>` is rendered and add `datasetType={dataset?.dataset_type}` prop. This makes the dataset type available to the panel; it does not gate the color mode dropdown (see the note below).
+
+Note: The color mode dropdown should always be shown (not just for classification) since detection datasets also have GT/pred annotations. The `hasPredictions` check handles disabling prediction-dependent modes.
+
+**4. Invalidate embedding coordinates after prediction import:**
+
+In the prediction import dialog or wherever predictions are imported, add `embedding-coordinates` to the list of query keys invalidated on import success. Check `frontend/src/components/detail/prediction-import-dialog.tsx` for the `onSuccess` callback of the prediction import mutation -- add:
+```typescript
+queryClient.invalidateQueries({ queryKey: ["embedding-coordinates", datasetId] });
+```
+
+This prevents stale coordinates (missing predLabel) after importing predictions (Pitfall 4 from research).
+  </action>
+  <verify>
+Run `cd frontend && npx tsc --noEmit` to confirm no type errors.
+Verify that the EmbeddingPanel toolbar has the color mode dropdown.
+Verify that EmbeddingScatter accepts and uses the colorMode prop.
+Verify that the page passes datasetType to EmbeddingPanel.
+  </verify>
+  <done>
+Embedding scatter plot supports 4 color modes: Default (uniform blue), GT Class (categorical palette), Predicted Class (categorical palette), and Correct/Incorrect (green/red/gray). Color mode dropdown in toolbar with prediction-dependent options disabled when no predictions. Embedding coordinates cache invalidated after prediction import.
+  </done>
+</task>
+
+</tasks>
+
+<verification>
+1. `cd frontend && npx tsc --noEmit` passes with no errors
+2. Backend `get_coordinates` SQL includes LEFT JOIN annotations and returns gtLabel/predLabel
+3. EmbeddingPoint type has optional gtLabel/predLabel fields
+4. EmbeddingScatter getFillColor branches on colorMode with categorical palette
+5. EmbeddingPanel toolbar has color mode dropdown
+6. Prediction-dependent modes disabled when hasPredictions is false
+7. Page passes datasetType to EmbeddingPanel
+8. Prediction import invalidates embedding-coordinates query key
+</verification>
+
+<success_criteria>
+- Color mode dropdown visible in embedding toolbar
+- Points change color when switching modes (GT Class, Predicted Class, Correct/Incorrect)
+- Predicted Class and Correct/Incorrect disabled when no predictions imported
+- Backend returns gtLabel and predLabel without breaking existing consumers
+- No regressions to lasso selection, hover tooltip, or detection embedding functionality
+</success_criteria>
+
+<output>
+After completion, create `.planning/phases/17-classification-polish/17-02-SUMMARY.md`
+</output>

From 10a3230787caf2eee679c8bcf8e7a76b0578a97d Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:57:23 -0500
Subject: [PATCH 30/38] feat(17-01): add threshold filtering and overflow
 scroll to confusion matrix

- Add threshold slider (0-50%, default 1%) to hide low-value off-diagonal cells
- Wrap table in overflow-auto max-h-[500px] scroll container
- Compact mode for >20 classes: smaller cells, truncated labels, 10px text
- Show hidden cell count next to slider
- Diagonal cells always visible regardless of threshold

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/components/stats/confusion-matrix.tsx | 89 +++++++++++++++----
 1 file changed, 73 insertions(+), 16 deletions(-)

diff --git a/frontend/src/components/stats/confusion-matrix.tsx b/frontend/src/components/stats/confusion-matrix.tsx
index 3304de6..649ba44 100644
--- a/frontend/src/components/stats/confusion-matrix.tsx
+++ b/frontend/src/components/stats/confusion-matrix.tsx
@@ -6,8 +6,15 @@
  * Diagonal cells use blue intensity (correct predictions).
  * Off-diagonal cells use red intensity (misclassifications).
  * Column headers are rotated for compactness.
+ *
+ * Features:
+ * - Threshold slider to hide low-value off-diagonal cells
+ * - Overflow scroll container for large matrices
+ * - Compact mode for high-cardinality (>20 classes)
  */
 
+import { useState, useMemo } from "react";
+
 interface ConfusionMatrixProps {
   matrix: number[][];
   labels: string[];
@@ -24,6 +31,8 @@ function cellColor(value: number, maxVal: number, isDiagonal: boolean): string {
 }
 
 export function ConfusionMatrix({ matrix, labels, onCellClick }: ConfusionMatrixProps) {
+  const [threshold, setThreshold] = useState(0.01);
+
   if (matrix.length === 0) {
     return (
       <div className="rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
@@ -40,15 +49,57 @@ export function ConfusionMatrix({ matrix, labels, onCellClick }: ConfusionMatrix
     return sum > 0 ? row.map((v) => v / sum) : row.map(() => 0);
   });
 
+  // Count off-diagonal cells hidden by threshold (only those with actual values)
+  const hiddenCount = useMemo(() => {
+    let count = 0;
+    for (let ri = 0; ri < normalized.length; ri++) {
+      for (let ci = 0; ci < normalized[ri].length; ci++) {
+        if (ri !== ci && normalized[ri][ci] > 0 && normalized[ri][ci] < threshold) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }, [normalized, threshold]);
+
+  const isCompact = labels.length > 20;
+
   return (
     <div className="rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
-      <h3 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
-        Confusion Matrix
-        <span className="font-normal text-zinc-400 dark:text-zinc-500 ml-1">
-          (row-normalized)
-        </span>
-      </h3>
-      <div className="overflow-x-auto">
+      {/* Header with title and threshold slider */}
+      <div className="flex flex-wrap items-center gap-4 mb-3">
+        <h3 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300">
+          Confusion Matrix
+          <span className="font-normal text-zinc-400 dark:text-zinc-500 ml-1">
+            (row-normalized)
+          </span>
+        </h3>
+        <div className="flex items-center gap-2 ml-auto">
+          <label className="text-xs font-medium text-zinc-500 dark:text-zinc-400">
+            Min:
+          </label>
+          <input
+            type="range"
+            min={0}
+            max={0.5}
+            step={0.01}
+            value={threshold}
+            onChange={(e) => setThreshold(parseFloat(e.target.value))}
+            className="w-24 accent-blue-500"
+          />
+          <span className="text-xs font-mono text-zinc-600 dark:text-zinc-400 w-8">
+            {(threshold * 100).toFixed(0)}%
+          </span>
+          {hiddenCount > 0 && (
+            <span className="text-xs text-zinc-400 dark:text-zinc-500">
+              {hiddenCount} cells hidden
+            </span>
+          )}
+        </div>
+      </div>
+
+      {/* Overflow scroll container */}
+      <div className="overflow-auto max-h-[500px]">
         {/* "Predicted" axis title above the grid columns */}
         <div className="flex">
           <div style={{ flexShrink: 0, width: 24 }} />
@@ -68,7 +119,7 @@ export function ConfusionMatrix({ matrix, labels, onCellClick }: ConfusionMatrix
             </span>
           </div>
 
-          {/* The matrix table — no axis labels inside */}
+          {/* The matrix table */}
           <table className="text-xs border-separate border-spacing-0">
             <thead>
               <tr>
@@ -80,7 +131,7 @@ export function ConfusionMatrix({ matrix, labels, onCellClick }: ConfusionMatrix
                     className="p-1 font-medium text-zinc-600 dark:text-zinc-400"
                   >
                     <div
-                      className="whitespace-nowrap"
+                      className={`whitespace-nowrap${isCompact ? " max-w-[80px] truncate" : ""}`}
                       style={{
                         writingMode: "vertical-rl",
                         transform: "rotate(180deg)",
@@ -97,23 +148,27 @@ export function ConfusionMatrix({ matrix, labels, onCellClick }: ConfusionMatrix
               {normalized.map((row, ri) => (
                 <tr key={ri}>
                   {/* Row class label */}
-                  <td className="p-1 font-medium text-zinc-600 dark:text-zinc-400 whitespace-nowrap pr-2 text-right">
+                  <td className={`p-1 font-medium text-zinc-600 dark:text-zinc-400 whitespace-nowrap pr-2 text-right${isCompact ? " max-w-[80px] truncate" : ""}`}>
                     {labels[ri] ?? ""}
                   </td>
                   {/* Data cells */}
                   {row.map((norm, ci) => {
                     const rawValue = matrix[ri][ci];
-                    const isClickable = rawValue > 0 && !!onCellClick;
+                    const isDiagonal = ri === ci;
+                    const isBelowThreshold = !isDiagonal && norm > 0 && norm < threshold;
+                    const isClickable = rawValue > 0 && !!onCellClick && !isBelowThreshold;
                     return (
                       <td
                         key={ci}
-                        className={`p-1 text-center min-w-[32px] border border-zinc-200 dark:border-zinc-700${
+                        className={`p-1 text-center ${isCompact ? "min-w-[24px]" : "min-w-[32px]"} border border-zinc-200 dark:border-zinc-700${
                           isClickable
                             ? " cursor-pointer hover:ring-2 hover:ring-blue-500 hover:ring-inset"
                             : ""
                         }`}
                         style={{
-                          backgroundColor: cellColor(norm, 1, ri === ci),
+                          backgroundColor: isBelowThreshold
+                            ? "transparent"
+                            : cellColor(norm, 1, isDiagonal),
                         }}
                         onClick={
                           isClickable
@@ -121,9 +176,11 @@ export function ConfusionMatrix({ matrix, labels, onCellClick }: ConfusionMatrix
                             : undefined
                         }
                       >
-                        <span className="text-zinc-800 dark:text-zinc-200">
-                          {norm > 0 ? norm.toFixed(2) : ""}
-                        </span>
+                        {!isBelowThreshold && (
+                          <span className={`text-zinc-800 dark:text-zinc-200${isCompact ? " text-[10px]" : ""}`}>
+                            {norm > 0 ? norm.toFixed(2) : ""}
+                          </span>
+                        )}
                       </td>
                     );
                   })}

From 4ff366a38f8e397c8b4dd98000a0647452245c90 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:57:23 -0500
Subject: [PATCH 31/38] feat(17-02): enrich coordinates endpoint with GT/pred
 labels

- LEFT JOIN annotations for ground_truth and prediction labels
- MIN() + GROUP BY to handle multi-annotation edge cases
- Add optional gtLabel/predLabel fields to EmbeddingPoint type

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 app/services/reduction_service.py | 11 ++++++++++-
 frontend/src/types/embedding.ts   |  2 ++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/app/services/reduction_service.py b/app/services/reduction_service.py
index 5c125fb..9379a5f 100644
--- a/app/services/reduction_service.py
+++ b/app/services/reduction_service.py
@@ -143,10 +143,17 @@ def get_coordinates(
         """
         result = cursor.execute(
             """
-            SELECT e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path
+            SELECT e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path,
+                   MIN(gt.category_name) as gt_label,
+                   MIN(pred.category_name) as pred_label
             FROM embeddings e
             JOIN samples s ON e.sample_id = s.id AND e.dataset_id = s.dataset_id
+            LEFT JOIN annotations gt ON gt.sample_id = s.id AND gt.dataset_id = s.dataset_id
+                AND gt.source = 'ground_truth'
+            LEFT JOIN annotations pred ON pred.sample_id = s.id AND pred.dataset_id = s.dataset_id
+                AND pred.source != 'ground_truth'
             WHERE e.dataset_id = ? AND e.x IS NOT NULL
+            GROUP BY e.sample_id, e.x, e.y, s.file_name, s.thumbnail_path
             ORDER BY e.sample_id
             """,
             [dataset_id],
@@ -158,6 +165,8 @@ def get_coordinates(
                 "y": r[2],
                 "fileName": r[3],
                 "thumbnailPath": r[4],
+                "gtLabel": r[5],
+                "predLabel": r[6],
             }
             for r in result
         ]
diff --git a/frontend/src/types/embedding.ts b/frontend/src/types/embedding.ts
index 1a6a117..632e8a5 100644
--- a/frontend/src/types/embedding.ts
+++ b/frontend/src/types/embedding.ts
@@ -13,6 +13,8 @@ export interface EmbeddingPoint {
   y: number;
   fileName: string;
   thumbnailPath: string | null;
+  gtLabel?: string | null;
+  predLabel?: string | null;
 }
 
 /** Current embedding status for a dataset (GET /status). */

From 660d287ebb7a0d6f99eff5bc56c09bcc0495fb66 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:58:16 -0500
Subject: [PATCH 32/38] feat(17-01): add most-confused pairs and F1 bars to
 classification eval

- Add MostConfusedPairs component: top 10 off-diagonal pairs ranked by count
- Clickable rows filter grid to show misclassified samples
- Add F1Bar component: color-coded bar (green/yellow/red) for per-class F1
- Add Performance column to ClassificationPerClassTable with F1Bar
- Integrate MostConfusedPairs between confusion matrix and per-class table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/components/stats/evaluation-panel.tsx | 129 +++++++++++++++++-
 1 file changed, 128 insertions(+), 1 deletion(-)

diff --git a/frontend/src/components/stats/evaluation-panel.tsx b/frontend/src/components/stats/evaluation-panel.tsx
index d8128fb..08754ae 100644
--- a/frontend/src/components/stats/evaluation-panel.tsx
+++ b/frontend/src/components/stats/evaluation-panel.tsx
@@ -82,7 +82,115 @@ function ClassificationMetricsCards({ data }: { data: ClassificationEvaluationRe
   );
 }
 
-/** Per-class table for classification: Class, Precision, Recall, F1, Support */
+/** Color-coded F1 bar: green >= 0.8, yellow >= 0.5, red < 0.5 */
+function F1Bar({ f1 }: { f1: number }) {
+  const color = f1 >= 0.8 ? "bg-green-500" : f1 >= 0.5 ? "bg-yellow-500" : "bg-red-500";
+  return (
+    <div className="w-16 h-2.5 bg-zinc-200 dark:bg-zinc-700 rounded-full overflow-hidden">
+      <div className={`h-full rounded-full ${color}`} style={{ width: `${f1 * 100}%` }} />
+    </div>
+  );
+}
+
+/** Top 10 off-diagonal (actual, predicted) pairs from the confusion matrix, ranked by count; pct is the pair's share of the actual class's row total. */
+function MostConfusedPairs({
+  matrix,
+  labels,
+  onPairClick,
+}: {
+  matrix: number[][];
+  labels: string[];
+  onPairClick?: (actual: string, predicted: string) => void;
+}) {
+  const pairs = useMemo(() => {
+    const result: { actual: string; predicted: string; count: number; pct: number }[] = [];
+    for (let i = 0; i < matrix.length; i++) {
+      const rowSum = matrix[i].reduce((a, b) => a + b, 0);
+      for (let j = 0; j < matrix[i].length; j++) {
+        if (i !== j && matrix[i][j] > 0) {
+          result.push({
+            actual: labels[i],
+            predicted: labels[j],
+            count: matrix[i][j],
+            pct: rowSum > 0 ? matrix[i][j] / rowSum : 0,
+          });
+        }
+      }
+    }
+    result.sort((a, b) => b.count - a.count);
+    return result.slice(0, 10);
+  }, [matrix, labels]);
+
+  return (
+    <div className="rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
+      <h3 className="text-sm font-semibold text-zinc-700 dark:text-zinc-300 mb-3">
+        Most Confused Pairs
+      </h3>
+      {pairs.length === 0 ? (
+        <p className="text-sm text-zinc-500 dark:text-zinc-400 text-center py-4">
+          No misclassifications found
+        </p>
+      ) : (
+        <div className="overflow-x-auto">
+          <table className="w-full text-sm">
+            <thead>
+              <tr className="border-b border-zinc-200 dark:border-zinc-700">
+                <th className="text-left py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400 w-8">
+                  #
+                </th>
+                <th className="text-left py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                  Actual
+                </th>
+                <th className="text-center py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400 w-8" />
+                <th className="text-left py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                  Predicted
+                </th>
+                <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                  Count
+                </th>
+                <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                  Pct
+                </th>
+              </tr>
+            </thead>
+            <tbody>
+              {pairs.map((p, idx) => (
+                <tr
+                  key={`${p.actual}-${p.predicted}`}
+                  className={`border-b border-zinc-100 dark:border-zinc-800${
+                    onPairClick ? " cursor-pointer hover:bg-zinc-50 dark:hover:bg-zinc-800" : ""
+                  }`}
+                  onClick={onPairClick ? () => onPairClick(p.actual, p.predicted) : undefined}
+                >
+                  <td className="py-2 px-3 text-zinc-400 dark:text-zinc-500">
+                    {idx + 1}
+                  </td>
+                  <td className="py-2 px-3 text-zinc-900 dark:text-zinc-100 font-medium">
+                    {p.actual}
+                  </td>
+                  <td className="py-2 px-3 text-center text-zinc-400 dark:text-zinc-500">
+                    {"\u2192"}
+                  </td>
+                  <td className="py-2 px-3 text-zinc-900 dark:text-zinc-100 font-medium">
+                    {p.predicted}
+                  </td>
+                  <td className="py-2 px-3 text-right font-mono text-zinc-700 dark:text-zinc-300">
+                    {p.count.toLocaleString()}
+                  </td>
+                  <td className="py-2 px-3 text-right font-mono text-zinc-700 dark:text-zinc-300">
+                    {(p.pct * 100).toFixed(1)}%
+                  </td>
+                </tr>
+              ))}
+            </tbody>
+          </table>
+        </div>
+      )}
+    </div>
+  );
+}
+
+/** Per-class table for classification: Class, Precision, Recall, F1, Support, Performance */
 function ClassificationPerClassTable({ metrics }: { metrics: ClassificationEvaluationResponse["per_class_metrics"] }) {
   const sorted = useMemo(
     () => [...metrics].sort((a, b) => b.f1 - a.f1),
@@ -123,6 +231,9 @@ function ClassificationPerClassTable({ metrics }: { metrics: ClassificationEvalu
               <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
                 Support
               </th>
+              <th className="text-right py-2 px-3 font-medium text-zinc-600 dark:text-zinc-400">
+                Performance
+              </th>
             </tr>
           </thead>
           <tbody>
@@ -146,6 +257,11 @@ function ClassificationPerClassTable({ metrics }: { metrics: ClassificationEvalu
                 <td className="py-2 px-3 text-right font-mono text-zinc-700 dark:text-zinc-300">
                   {m.support.toLocaleString()}
                 </td>
+                <td className="py-2 px-3 text-right">
+                  <div className="flex justify-end">
+                    <F1Bar f1={m.f1} />
+                  </div>
+                </td>
               </tr>
             ))}
           </tbody>
@@ -279,6 +395,17 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
           />
         )}
 
+        {/* Most Confused Pairs */}
+        {isLoading || !classData ? (
+          <SkeletonChart height="h-[200px]" />
+        ) : (
+          <MostConfusedPairs
+            matrix={classData.confusion_matrix}
+            labels={classData.confusion_matrix_labels}
+            onPairClick={handleCellClick}
+          />
+        )}
+
         {/* Per-Class Table */}
         {isLoading || !classData ? (
           <SkeletonChart height="h-[200px]" />

From 1f4c85887a212eaea28a64afa4d9f2cb0d6f8ada Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:58:42 -0500
Subject: [PATCH 33/38] feat(17-02): add color mode dropdown and categorical
 coloring to embedding scatter

- Add ColorMode type and Tableau 20 categorical palette to scatter
- getFillColor branches on colorMode (default/gt_class/pred_class/correctness)
- Color mode dropdown in toolbar with prediction-dependent options disabled
- Thread datasetType prop to EmbeddingPanel from page
- Invalidate embedding-coordinates cache after prediction import

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/app/datasets/[datasetId]/page.tsx     |  2 +-
 .../components/embedding/embedding-panel.tsx  | 27 ++++++++-
 .../embedding/embedding-scatter.tsx           | 57 ++++++++++++++++---
 frontend/src/hooks/use-import-predictions.ts  |  1 +
 4 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/frontend/src/app/datasets/[datasetId]/page.tsx b/frontend/src/app/datasets/[datasetId]/page.tsx
index d764cc4..9a386fb 100644
--- a/frontend/src/app/datasets/[datasetId]/page.tsx
+++ b/frontend/src/app/datasets/[datasetId]/page.tsx
@@ -114,7 +114,7 @@ export default function DatasetPage({
         <StatsDashboard datasetId={datasetId} datasetType={dataset?.dataset_type} />
       )}
       {activeTab === "embeddings" && (
-        <EmbeddingPanel datasetId={datasetId} />
+        <EmbeddingPanel datasetId={datasetId} datasetType={dataset?.dataset_type} />
       )}
       <SampleModal datasetId={datasetId} samples={allSamples} datasetType={dataset?.dataset_type} />
       <PredictionImportDialog
diff --git a/frontend/src/components/embedding/embedding-panel.tsx b/frontend/src/components/embedding/embedding-panel.tsx
index 9b21fd9..54b6fd7 100644
--- a/frontend/src/components/embedding/embedding-panel.tsx
+++ b/frontend/src/components/embedding/embedding-panel.tsx
@@ -11,7 +11,7 @@
  * via SSE progress streams.
  */
 
-import { useEffect, useRef, useState } from "react";
+import { useEffect, useMemo, useRef, useState } from "react";
 
 import type { DeckGLRef } from "@deck.gl/react";
 import { useQueryClient } from "@tanstack/react-query";
@@ -24,7 +24,7 @@ import {
 } from "@/hooks/use-embeddings";
 import { useEmbeddingProgress } from "@/hooks/use-embedding-progress";
 import { thumbnailUrl } from "@/lib/api";
-import { EmbeddingScatter } from "@/components/embedding/embedding-scatter";
+import { EmbeddingScatter, type ColorMode } from "@/components/embedding/embedding-scatter";
 import { HoverThumbnail } from "@/components/embedding/hover-thumbnail";
 import { LassoOverlay } from "@/components/embedding/lasso-overlay";
 import { useEmbeddingStore, useLassoSelectedIds } from "@/stores/embedding-store";
@@ -32,9 +32,10 @@ import type { EmbeddingPoint } from "@/types/embedding";
 
 interface EmbeddingPanelProps {
   datasetId: string;
+  datasetType?: string;
 }
 
-export function EmbeddingPanel({ datasetId }: EmbeddingPanelProps) {
+export function EmbeddingPanel({ datasetId, datasetType }: EmbeddingPanelProps) {
   const queryClient = useQueryClient();
   const { data: status, isLoading: statusLoading } =
     useEmbeddingStatus(datasetId);
@@ -68,6 +69,13 @@ export function EmbeddingPanel({ datasetId }: EmbeddingPanelProps) {
   const clearLasso = useEmbeddingStore((s) => s.clearLasso);
   const deckRef = useRef<DeckGLRef | null>(null);
 
+  // Color mode state
+  const [colorMode, setColorMode] = useState<ColorMode>("default");
+  const hasPredictions = useMemo(
+    () => coordinates?.some((p) => p.predLabel != null) ?? false,
+    [coordinates],
+  );
+
   // Hover state for thumbnail tooltip
   const [hoveredPoint, setHoveredPoint] = useState<{
     point: EmbeddingPoint;
@@ -262,6 +270,18 @@ export function EmbeddingPanel({ datasetId }: EmbeddingPanelProps) {
           {coordinates.length.toLocaleString()} points
         </span>
 
+        {/* Color mode selector */}
+        <select
+          value={colorMode}
+          onChange={(e) => setColorMode(e.target.value as ColorMode)}
+          className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-xs px-2 py-1 text-zinc-900 dark:text-zinc-100"
+        >
+          <option value="default">Default</option>
+          <option value="gt_class">GT Class</option>
+          <option value="pred_class" disabled={!hasPredictions}>Predicted Class</option>
+          <option value="correctness" disabled={!hasPredictions}>Correct / Incorrect</option>
+        </select>
+
         {/* Lasso toggle */}
         <button
           onClick={() => setLassoActive((v) => !v)}
@@ -316,6 +336,7 @@ export function EmbeddingPanel({ datasetId }: EmbeddingPanelProps) {
           onHover={handleHover}
           selectedIds={lassoSelectedIds}
           deckRef={deckRef}
+          colorMode={colorMode}
         />
         <LassoOverlay
           points={coordinates}
diff --git a/frontend/src/components/embedding/embedding-scatter.tsx b/frontend/src/components/embedding/embedding-scatter.tsx
index ee0bc04..ea2541e 100644
--- a/frontend/src/components/embedding/embedding-scatter.tsx
+++ b/frontend/src/components/embedding/embedding-scatter.tsx
@@ -20,6 +20,8 @@ import { ScatterplotLayer } from "@deck.gl/layers";
 
 import type { EmbeddingPoint } from "@/types/embedding";
 
+export type ColorMode = "default" | "gt_class" | "pred_class" | "correctness";
+
 interface EmbeddingScatterProps {
   /** 2D coordinate points to render. */
   points: EmbeddingPoint[];
@@ -33,8 +35,18 @@ interface EmbeddingScatterProps {
   selectedIds?: string[] | null;
   /** Ref forwarded to the DeckGL component for lasso coordinate projection. */
   deckRef?: React.RefObject<DeckGLRef | null>;
+  /** Color mode for scatter point fill colors. */
+  colorMode?: ColorMode;
 }
 
+const CATEGORICAL_PALETTE: [number, number, number, number][] = [
+  [31, 119, 180, 200], [255, 127, 14, 200], [44, 160, 44, 200], [214, 39, 40, 200],
+  [148, 103, 189, 200], [140, 86, 75, 200], [227, 119, 194, 200], [127, 127, 127, 200],
+  [188, 189, 34, 200], [23, 190, 207, 200], [174, 199, 232, 200], [255, 187, 120, 200],
+  [152, 223, 138, 200], [255, 152, 150, 200], [197, 176, 213, 200], [196, 156, 148, 200],
+  [247, 182, 210, 200], [199, 199, 199, 200], [219, 219, 141, 200], [158, 218, 229, 200],
+];
+
 const INITIAL_VIEW_STATE = {
   target: [0, 0, 0] as [number, number, number],
   zoom: 1,
@@ -52,6 +64,7 @@ export function EmbeddingScatter({
   onHover,
   selectedIds = null,
   deckRef,
+  colorMode = "default",
 }: EmbeddingScatterProps) {
   const [deckKey, setDeckKey] = useState(0);
   const containerRef = useRef<HTMLDivElement>(null);
@@ -111,6 +124,19 @@ export function EmbeddingScatter({
     [onHover],
   );
 
+  // Build stable label-to-index map for categorical coloring
+  const labelIndex = useMemo(() => {
+    const labels = new Set<string>();
+    for (const p of points) {
+      if (p.gtLabel) labels.add(p.gtLabel);
+      if (p.predLabel) labels.add(p.predLabel);
+    }
+    const sorted = [...labels].sort();
+    const map = new Map<string, number>();
+    sorted.forEach((l, i) => map.set(l, i));
+    return map;
+  }, [points]);
+
   // Memoize layer to avoid recreating on every render (anti-pattern from research)
   const layers = useMemo(
     () => [
@@ -121,23 +147,38 @@ export function EmbeddingScatter({
         getRadius: 3,
         radiusMinPixels: 2,
         radiusMaxPixels: 8,
-        getFillColor: (d) =>
-          selectedSet === null
-            ? [100, 120, 220, 200]
-            : selectedSet.has(d.sampleId)
+        getFillColor: (d) => {
+          // Lasso selection overrides color mode
+          if (selectedSet !== null) {
+            return selectedSet.has(d.sampleId)
               ? [99, 102, 241, 230]
-              : [180, 180, 180, 80],
+              : [180, 180, 180, 80];
+          }
+          if (colorMode === "gt_class" && d.gtLabel) {
+            return CATEGORICAL_PALETTE[labelIndex.get(d.gtLabel)! % CATEGORICAL_PALETTE.length];
+          }
+          if (colorMode === "pred_class" && d.predLabel) {
+            return CATEGORICAL_PALETTE[labelIndex.get(d.predLabel)! % CATEGORICAL_PALETTE.length];
+          }
+          if (colorMode === "correctness") {
+            if (!d.predLabel) return [180, 180, 180, 100] as [number, number, number, number];
+            return d.gtLabel === d.predLabel
+              ? [44, 160, 44, 200] as [number, number, number, number]
+              : [214, 39, 40, 200] as [number, number, number, number];
+          }
+          return [100, 120, 220, 200];
+        },
         pickable: true,
         onHover: handleHover,
         autoHighlight: true,
         highlightColor: [255, 200, 0, 200],
-        // Force update when selection changes
+        // Force update when selection or color mode changes
         updateTriggers: {
-          getFillColor: [selectedSet],
+          getFillColor: [selectedSet, colorMode],
         },
       }),
     ],
-    [points, handleHover, selectedSet],
+    [points, handleHover, selectedSet, colorMode, labelIndex],
   );
 
   return (
diff --git a/frontend/src/hooks/use-import-predictions.ts b/frontend/src/hooks/use-import-predictions.ts
index 788c7fd..55a78a6 100644
--- a/frontend/src/hooks/use-import-predictions.ts
+++ b/frontend/src/hooks/use-import-predictions.ts
@@ -19,6 +19,7 @@ export function useImportPredictions(datasetId: string) {
       qc.invalidateQueries({ queryKey: ["dataset", datasetId] });
       qc.invalidateQueries({ queryKey: ["filter-facets", datasetId] });
       qc.invalidateQueries({ queryKey: ["annotations-batch"] });
+      qc.invalidateQueries({ queryKey: ["embedding-coordinates", datasetId] });
     },
   });
 }

From af2607e5d923a41df1f5f3e48906f098b6465f9c Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:59:33 -0500
Subject: [PATCH 34/38] docs(17-01): complete confusion matrix polish plan

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/STATE.md                            |  21 ++--
 .../17-classification-polish/17-01-SUMMARY.md | 102 ++++++++++++++++++
 2 files changed, 115 insertions(+), 8 deletions(-)
 create mode 100644 .planning/phases/17-classification-polish/17-01-SUMMARY.md

diff --git a/.planning/STATE.md b/.planning/STATE.md
index e6b9dc8..ef8fec1 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -5,16 +5,16 @@
 See: .planning/PROJECT.md (updated 2026-02-18)
 
 **Core value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
-**Current focus:** Phase 16 - Classification Evaluation
+**Current focus:** Phase 17 - Classification Polish
 
 ## Current Position
 
-Phase: 16 of 17 (Classification Evaluation)
-Plan: 2 of 2 in current phase (COMPLETE)
-Status: Phase 16 Complete
-Last activity: 2026-02-18 -- Completed 16-02 (Classification Evaluation Frontend)
+Phase: 17 of 17 (Classification Polish)
+Plan: 1 of 2 in current phase (COMPLETE)
+Status: Executing Phase 17
+Last activity: 2026-02-18 -- Completed 17-01 (Confusion Matrix Polish & Most-Confused Pairs)
 
-Progress: [##############################] 97% (v1.0 + v1.1 complete, v1.2 phase 16 complete)
+Progress: [##############################] 98% (v1.0 + v1.1 complete, v1.2 phase 17 in progress)
 
 ## Performance Metrics
 
@@ -48,6 +48,11 @@ Recent decisions affecting current work:
 - [Phase 16]: Remove response_model on evaluation endpoint for union return type support
 - [Phase 16]: Classification metric cards inline rather than reusing MetricsCards (different data shape)
 - [Phase 16]: Map backend error fields to classification labels: true_positives=correct, label_errors=misclassified
+- [Phase 17]: Threshold slider (0-50%, default 1%) hides noisy off-diagonal confusion cells
+- [Phase 17]: MostConfusedPairs derived client-side from confusion matrix (no new API endpoint)
+- [Phase 17]: F1Bar pure CSS bars with green/yellow/red thresholds at 0.8/0.5
+- [Phase 17]: LEFT JOIN annotations with MIN()+GROUP BY for one label per sample in coordinates
+- [Phase 17]: Color mode dropdown always visible; hasPredictions disables prediction-dependent modes
 
 ### Pending Todos
 
@@ -56,7 +61,7 @@ None.
 ### Blockers/Concerns
 
 - Confirm Roboflow JSONL format against actual export before finalizing parser
-- Confusion matrix at 43+ classes may need canvas rendering -- prototype early in Phase 17
+- Confusion matrix at 43+ classes: solved with threshold filtering + compact mode (no canvas needed)
 
 ### Roadmap Evolution
 
@@ -67,5 +72,5 @@ None.
 ## Session Continuity
 
 Last session: 2026-02-18
-Stopped at: Completed 16-02-PLAN.md (Classification Evaluation Frontend)
+Stopped at: Completed 17-01-PLAN.md (Confusion Matrix Polish & Most-Confused Pairs)
 Resume file: None
diff --git a/.planning/phases/17-classification-polish/17-01-SUMMARY.md b/.planning/phases/17-classification-polish/17-01-SUMMARY.md
new file mode 100644
index 0000000..a82a88a
--- /dev/null
+++ b/.planning/phases/17-classification-polish/17-01-SUMMARY.md
@@ -0,0 +1,102 @@
+---
+phase: 17-classification-polish
+plan: 01
+subsystem: ui
+tags: [confusion-matrix, classification, f1-bar, threshold-filter, react]
+
+# Dependency graph
+requires:
+  - phase: 16-classification-evaluation
+    provides: ConfusionMatrix component, ClassificationPerClassTable, classification eval layout
+provides:
+  - Threshold-filtered confusion matrix with overflow scroll for high-cardinality
+  - MostConfusedPairs ranked summary with clickable grid filtering
+  - F1Bar color-coded performance bars in per-class table
+affects: [17-02-PLAN]
+
+# Tech tracking
+tech-stack:
+  added: []
+  patterns: [inline sub-components for classification-specific UI, useMemo-derived analytics from raw matrix data]
+
+key-files:
+  created: []
+  modified:
+    - frontend/src/components/stats/confusion-matrix.tsx
+    - frontend/src/components/stats/evaluation-panel.tsx
+
+key-decisions:
+  - "Threshold slider with 0-50% range and 1% default hides noisy off-diagonal cells"
+  - "MostConfusedPairs derived client-side from confusion matrix (no new API endpoint)"
+  - "F1Bar is pure CSS with green/yellow/red thresholds at 0.8/0.5"
+
+patterns-established:
+  - "Inline sub-components for classification-specific widgets (F1Bar, MostConfusedPairs)"
+  - "Client-side matrix analytics derived via useMemo from existing backend data"
+
+# Metrics
+duration: 2min
+completed: 2026-02-18
+---
+
+# Phase 17 Plan 01: Classification Polish Summary
+
+**Confusion matrix threshold filtering with overflow scroll, most-confused pairs ranked summary, and color-coded F1 bars for high-cardinality classification readability**
+
+## Performance
+
+- **Duration:** 2 min
+- **Started:** 2026-02-19T03:56:37Z
+- **Completed:** 2026-02-19T03:58:23Z
+- **Tasks:** 2
+- **Files modified:** 2
+
+## Accomplishments
+- Confusion matrix threshold slider (0-50%) hides low-value off-diagonal cells for 43+ class readability
+- Overflow scroll container (max-h-500px) and compact mode for high-cardinality matrices (>20 classes)
+- Most-confused pairs ranked list (top 10) with clickable rows that filter the image grid
+- Color-coded F1 bars (green/yellow/red) in per-class metrics table Performance column
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Confusion matrix threshold filtering and overflow scroll** - `10a3230` (feat)
+2. **Task 2: Most-confused pairs summary and F1 bars in per-class table** - `660d287` (feat)
+
+## Files Created/Modified
+- `frontend/src/components/stats/confusion-matrix.tsx` - Added threshold slider, overflow scroll, compact mode for >20 classes
+- `frontend/src/components/stats/evaluation-panel.tsx` - Added MostConfusedPairs component, F1Bar component, Performance column
+
+## Decisions Made
+- Threshold slider 0-50% range with 1% default: balances noise reduction with visibility
+- MostConfusedPairs derived client-side from existing confusion matrix data (no new API endpoint needed)
+- F1Bar uses pure CSS bars with green (>=0.8), yellow (>=0.5), red (<0.5) thresholds
+- Diagonal cells always shown regardless of threshold (correct predictions are always relevant)
+
+## Deviations from Plan
+
+None - plan executed exactly as written.
+
+## Issues Encountered
+None
+
+## User Setup Required
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Classification evaluation UI is now production-ready for high-cardinality datasets
+- Ready for 17-02 plan (embedding scatter color modes)
+- Detection evaluation layout completely unchanged
+
+## Self-Check: PASSED
+
+- confusion-matrix.tsx: FOUND
+- evaluation-panel.tsx: FOUND
+- 17-01-SUMMARY.md: FOUND
+- Commit 10a3230: FOUND
+- Commit 660d287: FOUND
+
+---
+*Phase: 17-classification-polish*
+*Completed: 2026-02-18*

From fc0e2d1e1dd672dd6d4a6925f65f1045cf800d34 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 22:59:50 -0500
Subject: [PATCH 35/38] docs(17-02): complete embedding color modes plan

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/STATE.md                            | 14 +--
 .../17-classification-polish/17-02-SUMMARY.md | 97 +++++++++++++++++++
 2 files changed, 104 insertions(+), 7 deletions(-)
 create mode 100644 .planning/phases/17-classification-polish/17-02-SUMMARY.md

diff --git a/.planning/STATE.md b/.planning/STATE.md
index ef8fec1..06e59f0 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -5,16 +5,16 @@
 See: .planning/PROJECT.md (updated 2026-02-18)
 
 **Core value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
-**Current focus:** Phase 17 - Classification Polish
+**Current focus:** Milestone v1.2 Complete
 
 ## Current Position
 
 Phase: 17 of 17 (Classification Polish)
-Plan: 1 of 2 in current phase (COMPLETE)
-Status: Executing Phase 17
-Last activity: 2026-02-18 -- Completed 17-01 (Confusion Matrix Polish & Most-Confused Pairs)
+Plan: 2 of 2 in current phase (COMPLETE)
+Status: Phase 17 Complete -- Milestone v1.2 Complete
+Last activity: 2026-02-18 -- Completed 17-02 (Embedding Color Modes)
 
-Progress: [##############################] 98% (v1.0 + v1.1 complete, v1.2 phase 17 in progress)
+Progress: [################################] 100% (v1.0 + v1.1 + v1.2 complete)
 
 ## Performance Metrics
 
@@ -67,10 +67,10 @@ None.
 
 - v1.0: 7 phases (1-7), 21 plans -- shipped 2026-02-12
 - v1.1: 7 phases (8-14), 20 plans -- shipped 2026-02-13
-- v1.2: 3 phases (15-17), TBD plans -- in progress
+- v1.2: 3 phases (15-17), 6 plans -- shipped 2026-02-18
 
 ## Session Continuity
 
 Last session: 2026-02-18
-Stopped at: Completed 17-01-PLAN.md (Confusion Matrix Polish & Most-Confused Pairs)
+Stopped at: Completed 17-02-PLAN.md (Embedding Color Modes)
 Resume file: None
diff --git a/.planning/phases/17-classification-polish/17-02-SUMMARY.md b/.planning/phases/17-classification-polish/17-02-SUMMARY.md
new file mode 100644
index 0000000..f195c4f
--- /dev/null
+++ b/.planning/phases/17-classification-polish/17-02-SUMMARY.md
@@ -0,0 +1,97 @@
+---
+phase: 17-classification-polish
+plan: 02
+subsystem: ui
+tags: [deck.gl, scatter-plot, categorical-coloring, embedding, color-mode]
+
+requires:
+  - phase: 17-classification-polish
+    provides: "Embedding scatter plot with lasso selection"
+provides:
+  - "Color mode dropdown (Default, GT Class, Predicted Class, Correct/Incorrect)"
+  - "Categorical Tableau 20 palette for class-based coloring"
+  - "Enriched coordinates API returning gtLabel and predLabel per point"
+affects: []
+
+tech-stack:
+  added: []
+  patterns:
+    - "Categorical palette with stable label-to-index mapping for consistent colors"
+    - "Color mode as prop threaded from panel to scatter component"
+
+key-files:
+  created: []
+  modified:
+    - app/services/reduction_service.py
+    - frontend/src/types/embedding.ts
+    - frontend/src/components/embedding/embedding-scatter.tsx
+    - frontend/src/components/embedding/embedding-panel.tsx
+    - frontend/src/app/datasets/[datasetId]/page.tsx
+    - frontend/src/hooks/use-import-predictions.ts
+
+key-decisions:
+  - "LEFT JOIN annotations with MIN() + GROUP BY to collapse multi-annotation to one label per sample"
+  - "Color mode dropdown always visible (not gated on dataset type) since detection datasets also have annotations"
+
+patterns-established:
+  - "Tableau 20 categorical palette for class-based visualizations"
+
+duration: 2min
+completed: 2026-02-18
+---
+
+# Phase 17 Plan 02: Embedding Color Modes Summary
+
+**Categorical color modes for embedding scatter (GT Class, Predicted Class, Correct/Incorrect) with Tableau 20 palette and enriched coordinates API**
+
+## Performance
+
+- **Duration:** 2 min
+- **Started:** 2026-02-19T03:56:39Z
+- **Completed:** 2026-02-19T03:58:47Z
+- **Tasks:** 2
+- **Files modified:** 6
+
+## Accomplishments
+- Backend coordinates endpoint enriched with gtLabel and predLabel via LEFT JOIN annotations
+- 4 color modes in embedding scatter: Default (uniform blue), GT Class, Predicted Class, Correct/Incorrect
+- Color mode dropdown in toolbar with prediction-dependent options disabled when no predictions exist
+- Embedding coordinates cache invalidated after prediction import to prevent stale data
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Backend coordinates enrichment with GT/pred labels** - `4ff366a` (feat)
+2. **Task 2: Embedding scatter color mode dropdown and categorical coloring** - `1f4c858` (feat)
+
+## Files Created/Modified
+- `app/services/reduction_service.py` - LEFT JOIN annotations for GT/pred labels in get_coordinates SQL
+- `frontend/src/types/embedding.ts` - Added optional gtLabel/predLabel to EmbeddingPoint interface
+- `frontend/src/components/embedding/embedding-scatter.tsx` - ColorMode type, Tableau 20 palette, getFillColor branching
+- `frontend/src/components/embedding/embedding-panel.tsx` - Color mode dropdown, hasPredictions memo, datasetType prop
+- `frontend/src/app/datasets/[datasetId]/page.tsx` - Thread datasetType to EmbeddingPanel
+- `frontend/src/hooks/use-import-predictions.ts` - Invalidate embedding-coordinates on prediction import
+
+## Decisions Made
+- LEFT JOIN annotations with MIN() + GROUP BY to collapse multi-annotation edge cases to one label per sample (MIN() keeps the lexicographically smallest category name)
+- Color mode dropdown always visible (not gated on dataset type) since detection datasets also have GT/pred annotations
+- Lasso selection overrides color mode coloring (selection highlight takes priority)
+
+## Deviations from Plan
+
+None - plan executed exactly as written.
+
+## Issues Encountered
+None
+
+## User Setup Required
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Milestone v1.2 Classification Dataset Support is complete
+- All 3 phases (15-17) delivered: classification ingestion/UI, evaluation, and polish
+
+---
+*Phase: 17-classification-polish*
+*Completed: 2026-02-18*

From 67a7a9cd699e1f9dd93d37c1440644d859f37462 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 18 Feb 2026 23:04:01 -0500
Subject: [PATCH 36/38] =?UTF-8?q?docs(phase-17):=20complete=20phase=20exec?=
 =?UTF-8?q?ution=20=E2=80=94=20milestone=20v1.2=20complete?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .planning/REQUIREMENTS.md                     |  16 +--
 .planning/ROADMAP.md                          |   8 +-
 .../17-VERIFICATION.md                        | 131 ++++++++++++++++++
 3 files changed, 143 insertions(+), 12 deletions(-)
 create mode 100644 .planning/phases/17-classification-polish/17-VERIFICATION.md

diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md
index b8b6a3d..3d5dfc1 100644
--- a/.planning/REQUIREMENTS.md
+++ b/.planning/REQUIREMENTS.md
@@ -31,10 +31,10 @@ Requirements for classification dataset support. Each maps to roadmap phases.
 
 ### Polish
 
-- [ ] **POLISH-01**: Confusion matrix scales to 43+ classes with readable rendering
-- [ ] **POLISH-02**: User can color embedding scatter by GT class, predicted class, or correct/incorrect status
-- [ ] **POLISH-03**: User sees most-confused class pairs summary from the confusion matrix
-- [ ] **POLISH-04**: User sees per-class performance sparklines with color-coded thresholds
+- [x] **POLISH-01**: Confusion matrix scales to 43+ classes with readable rendering
+- [x] **POLISH-02**: User can color embedding scatter by GT class, predicted class, or correct/incorrect status
+- [x] **POLISH-03**: User sees most-confused class pairs summary from the confusion matrix
+- [x] **POLISH-04**: User sees per-class performance sparklines with color-coded thresholds
 
 ## Future Requirements
 
@@ -79,10 +79,10 @@ Which phases cover which requirements. Updated during roadmap creation.
 | EVAL-03 | Phase 16 | Done |
 | EVAL-04 | Phase 16 | Done |
 | EVAL-05 | Phase 16 | Done |
-| POLISH-01 | Phase 17 | Pending |
-| POLISH-02 | Phase 17 | Pending |
-| POLISH-03 | Phase 17 | Pending |
-| POLISH-04 | Phase 17 | Pending |
+| POLISH-01 | Phase 17 | Done |
+| POLISH-02 | Phase 17 | Done |
+| POLISH-03 | Phase 17 | Done |
+| POLISH-04 | Phase 17 | Done |
 
 **Coverage:**
 - v1.2 requirements: 17 total
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 581ea9b..cb34b45 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -117,10 +117,10 @@ Plans:
   2. User can color the embedding scatter plot by GT class, predicted class, or correct/incorrect status
   3. User sees a ranked list of most-confused class pairs derived from the confusion matrix
   4. User sees per-class performance sparklines with color-coded thresholds (green/yellow/red) in the metrics table
-**Plans**: 2 plans
+**Plans**: 2 plans (complete)
 Plans:
-- [ ] 17-01-PLAN.md -- Confusion matrix threshold/overflow, most-confused pairs, F1 bars in per-class table
-- [ ] 17-02-PLAN.md -- Embedding scatter color modes (GT class, predicted class, correct/incorrect)
+- [x] 17-01-PLAN.md -- Confusion matrix threshold/overflow, most-confused pairs, F1 bars in per-class table
+- [x] 17-02-PLAN.md -- Embedding scatter color modes (GT class, predicted class, correct/incorrect)
 
 ## Progress
 
@@ -142,4 +142,4 @@ Plans:
 | 14. Per-Annotation Triage | v1.1 | 3/3 | Complete | 2026-02-13 |
 | 15. Classification Ingestion & Display | v1.2 | 2/2 | Complete | 2026-02-18 |
 | 16. Classification Evaluation | v1.2 | 2/2 | Complete | 2026-02-18 |
-| 17. Classification Polish | v1.2 | 0/2 | Not started | - |
+| 17. Classification Polish | v1.2 | 2/2 | Complete | 2026-02-18 |
diff --git a/.planning/phases/17-classification-polish/17-VERIFICATION.md b/.planning/phases/17-classification-polish/17-VERIFICATION.md
new file mode 100644
index 0000000..c21c35e
--- /dev/null
+++ b/.planning/phases/17-classification-polish/17-VERIFICATION.md
@@ -0,0 +1,131 @@
+---
+phase: 17-classification-polish
+verified: 2026-02-19T04:01:46Z
+status: passed
+score: 8/8 must-haves verified
+re_verification: false
+---
+
+# Phase 17: Classification Polish Verification Report
+
+**Phase Goal:** Classification workflows are production-ready for high-cardinality datasets (43+ classes) with visual aids that surface actionable insights
+**Verified:** 2026-02-19T04:01:46Z
+**Status:** passed
+**Re-verification:** No — initial verification
+
+---
+
+## Goal Achievement
+
+### Observable Truths (from Success Criteria)
+
+| #  | Truth                                                                                              | Status     | Evidence                                                                                     |
+|----|----------------------------------------------------------------------------------------------------|------------|----------------------------------------------------------------------------------------------|
+| 1  | Confusion matrix renders readably at 43+ classes with threshold filtering and overflow handling     | VERIFIED   | Threshold slider (0–50%, default 1%), `overflow-auto max-h-[500px]`, compact mode for >20 classes (text-[10px], min-w-[24px], max-w-[80px] truncate) — confusion-matrix.tsx lines 34, 65, 102, 163, 180 |
+| 2  | User can color the embedding scatter plot by GT class, predicted class, or correct/incorrect status | VERIFIED   | `ColorMode` type exported from embedding-scatter.tsx; dropdown in embedding-panel.tsx with all 4 options; `getFillColor` branches on colorMode with CATEGORICAL_PALETTE — embedding-scatter.tsx lines 23, 150–169 |
+| 3  | User sees a ranked list of most-confused class pairs derived from the confusion matrix              | VERIFIED   | `MostConfusedPairs` component in evaluation-panel.tsx (lines 96–191) derives top 10 off-diagonal pairs by raw count; rendered between ConfusionMatrix and per-class table (lines 399–407) |
+| 4  | User sees per-class performance sparklines with color-coded thresholds in the metrics table         | VERIFIED   | `F1Bar` component (lines 86–93): green >= 0.8, yellow >= 0.5, red < 0.5; used in `ClassificationPerClassTable` Performance column (line 262); table has explicit "Performance" header (line 234) |
+
+**Score: 4/4 success-criteria truths verified**
+
+---
+
+## Must-Have Artifacts (17-01-PLAN.md)
+
+| Artifact                                                        | Provides                                                       | Status     | Details                                                                                           |
+|-----------------------------------------------------------------|----------------------------------------------------------------|------------|---------------------------------------------------------------------------------------------------|
+| `frontend/src/components/stats/confusion-matrix.tsx`            | Threshold slider, compact cells, overflow scroll container     | VERIFIED   | Exists, 195 lines, substantive. Contains `threshold`, `hiddenCount`, `isCompact`, `overflow-auto max-h-[500px]`. Imported + used in evaluation-panel.tsx line 22 and rendered at lines 391, 507. |
+| `frontend/src/components/stats/evaluation-panel.tsx`            | MostConfusedPairs component, F1Bar component in per-class table | VERIFIED  | Exists, 524 lines, substantive. Contains `MostConfusedPairs` (line 96), `F1Bar` (line 86), `ClassificationPerClassTable` with Performance column (line 234), and `<ConfusionMatrix` usage (lines 391, 507). |
+
+## Must-Have Artifacts (17-02-PLAN.md)
+
+| Artifact                                                             | Provides                                              | Status   | Details                                                                                                  |
+|----------------------------------------------------------------------|-------------------------------------------------------|----------|----------------------------------------------------------------------------------------------------------|
+| `app/services/reduction_service.py`                                  | Enriched get_coordinates with GT/pred label JOINs     | VERIFIED | Contains `gt_label`, `pred_label` via LEFT JOIN annotations with MIN() + GROUP BY (lines 144–172).       |
+| `frontend/src/types/embedding.ts`                                    | gtLabel and predLabel fields on EmbeddingPoint        | VERIFIED | Contains `gtLabel?: string \| null` and `predLabel?: string \| null` (lines 16–17).                     |
+| `frontend/src/components/embedding/embedding-scatter.tsx`            | colorMode-driven getFillColor with categorical palette | VERIFIED | Contains `colorMode` prop, `CATEGORICAL_PALETTE` (Tableau 20), `labelIndex` Map, `getFillColor` branching, `updateTriggers: { getFillColor: [selectedSet, colorMode] }` — lines 23, 39, 42–48, 128–181. |
+| `frontend/src/components/embedding/embedding-panel.tsx`              | Color mode dropdown in toolbar                        | VERIFIED | Contains `ColorMode` import, `colorMode` state, `hasPredictions` memo, select dropdown with 4 options and disabled logic, `colorMode` passed to `<EmbeddingScatter>` — lines 27, 73–77, 274–283, 339. |
+
+---
+
+## Key Link Verification
+
+### 17-01-PLAN.md Key Links
+
+| From                             | To                              | Via                      | Status  | Details                                                                            |
+|----------------------------------|---------------------------------|--------------------------|---------|------------------------------------------------------------------------------------|
+| `evaluation-panel.tsx`           | `confusion-matrix.tsx`          | `<ConfusionMatrix` usage | WIRED   | Import at line 22; rendered at lines 391 (classification) and 507 (detection). Both pass real matrix data. |
+
+### 17-02-PLAN.md Key Links
+
+| From                             | To                              | Via                                           | Status  | Details                                                                                                   |
+|----------------------------------|---------------------------------|-----------------------------------------------|---------|-----------------------------------------------------------------------------------------------------------|
+| `embedding-panel.tsx`            | `embedding-scatter.tsx`         | `colorMode` prop                              | WIRED   | `colorMode` state defined at line 73, passed as prop to `<EmbeddingScatter>` at line 339.                |
+| `reduction_service.py`           | `frontend/src/types/embedding.ts` | API response shape includes gtLabel, predLabel | WIRED   | Backend returns `"gtLabel": r[5], "predLabel": r[6]` (lines 167–168). Frontend type has matching `gtLabel?`, `predLabel?` (lines 16–17). |
+| `datasets/[datasetId]/page.tsx`  | `embedding-panel.tsx`           | `datasetType` prop threading                  | WIRED   | `<EmbeddingPanel datasetId={datasetId} datasetType={dataset?.dataset_type} />` at line 117.              |
+
+### Bonus Key Link (not in plan frontmatter)
+
+| From                                   | To                              | Via                                                    | Status  | Details                                                                        |
+|----------------------------------------|---------------------------------|--------------------------------------------------------|---------|--------------------------------------------------------------------------------|
+| `use-import-predictions.ts`            | `embedding-panel.tsx`           | `embedding-coordinates` query key invalidation on import | WIRED   | `qc.invalidateQueries({ queryKey: ["embedding-coordinates", datasetId] })` at line 23 of use-import-predictions.ts. |
+
+---
+
+## Requirements Coverage
+
+All 4 phase success criteria map directly to verified truths above. No unmet requirements found.
+
+---
+
+## Anti-Patterns Found
+
+None. Zero TODO/FIXME/placeholder comments in any modified files. No stub implementations (empty handlers, static returns, or unreachable branches). TypeScript compiler (`npx tsc --noEmit`) exits with zero errors.
+
+---
+
+## Commit Verification
+
+All four commits documented in the SUMMARY files were confirmed to exist:
+
+| Commit   | Description                                              |
+|----------|----------------------------------------------------------|
+| `10a3230`| feat(17-01): add threshold filtering and overflow scroll to confusion matrix |
+| `660d287`| feat(17-01): add most-confused pairs and F1 bars to classification eval      |
+| `4ff366a`| feat(17-02): enrich coordinates endpoint with GT/pred labels                 |
+| `1f4c858`| feat(17-02): add color mode dropdown and categorical coloring to embedding scatter |
+
+---
+
+## Human Verification Recommended
+
+The following items pass automated checks but benefit from visual confirmation:
+
+### 1. Confusion Matrix Readability at 43+ Classes
+
+**Test:** Load a classification dataset with 43+ classes and open the Evaluation tab. Adjust the threshold slider.
+**Expected:** Matrix cells below threshold disappear, "N cells hidden" counter updates, labels are truncated with ellipsis, cell values use 10px font. Matrix scrolls vertically/horizontally without breaking layout.
+**Why human:** Cell density, truncation appearance, and scroll UX cannot be verified programmatically.
+
+### 2. Color Mode Visual Correctness
+
+**Test:** With a classification dataset that has predictions imported, open the Embeddings tab, select "GT Class" then "Predicted Class" then "Correct / Incorrect" from the dropdown.
+**Expected:** Points change color per the Tableau 20 palette (GT Class / Predicted Class), or green/red/gray (Correct / Incorrect). Lasso selection still overrides coloring.
+**Why human:** Color rendering accuracy and visual distinction between modes requires visual inspection.
+
+### 3. Most Confused Pairs Click-Through
+
+**Test:** In the Evaluation tab for a classification dataset, click a row in the "Most Confused Pairs" table.
+**Expected:** The UI switches to the Grid tab and filters images to only those misclassified in that direction.
+**Why human:** State transitions and filter application require runtime verification.
+
+---
+
+## Gaps Summary
+
+No gaps. All must-haves from both 17-01-PLAN.md and 17-02-PLAN.md are verified at all three levels (exists, substantive, wired). The phase goal is achieved.
+
+---
+
+_Verified: 2026-02-19T04:01:46Z_
+_Verifier: Claude (gsd-verifier)_

From 09da91724817214810d7f85fce33148beed29609 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Thu, 19 Feb 2026 00:04:57 -0500
Subject: [PATCH 37/38] chore: archive v1.2 Classification Dataset Support
 milestone

Archive roadmap and requirements to milestones/, update PROJECT.md with
validated requirements and new decisions, collapse v1.2 in ROADMAP.md.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .planning/MILESTONES.md                       |  29 ++++
 .planning/PROJECT.md                          |  45 +++---
 .planning/ROADMAP.md                          |  47 +-----
 .planning/STATE.md                            |  42 ++---
 .../v1.2-REQUIREMENTS.md}                     |   9 ++
 .planning/milestones/v1.2-ROADMAP.md          | 145 ++++++++++++++++++
 6 files changed, 223 insertions(+), 94 deletions(-)
 rename .planning/{REQUIREMENTS.md => milestones/v1.2-REQUIREMENTS.md} (96%)
 create mode 100644 .planning/milestones/v1.2-ROADMAP.md

diff --git a/.planning/MILESTONES.md b/.planning/MILESTONES.md
index f4545bd..e3db4ca 100644
--- a/.planning/MILESTONES.md
+++ b/.planning/MILESTONES.md
@@ -56,3 +56,32 @@
 **What's next:** Interactive model evaluation dashboard (PR curves, confusion matrix, per-class AP metrics)
 
 ---
+
+## v1.2 Classification Dataset Support (Shipped: 2026-02-19)
+
+**Delivered:** First-class single-label classification dataset support with full feature parity to detection workflows — from JSONL ingestion through evaluation metrics to production-ready polish for high-cardinality datasets.
+
+**Phases completed:** 15-17 (6 plans total)
+
+**Key accomplishments:**
+
+- Classification JSONL parser with auto-detection of dataset type, multi-split ingestion, and sentinel bbox pattern for unified schema
+- Grid browsing with class label badges and detail modal with dropdown class editor (PATCH mutation)
+- Classification evaluation: accuracy, macro/weighted F1, per-class precision/recall/F1, and clickable confusion matrix
+- Error analysis categorizing each image as correct, misclassified, or missing prediction
+- Confusion matrix polish with threshold filtering and overflow scroll for 43+ classes, most-confused pairs summary
+- Embedding scatter color modes: GT class, predicted class, and correct/incorrect with Tableau 20 categorical palette
+
+**Stats:**
+
+- 61 files created/modified
+- ~6,052 lines of code added
+- 3 phases, 6 plans, 27 commits
+- 1 day (Feb 18, 2026)
+
+**Git range:** `5264e51` → `67a7a9c`
+
+**What's next:** TBD — next milestone planning
+
+---
+
diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md
index ac3425b..e62013d 100644
--- a/.planning/PROJECT.md
+++ b/.planning/PROJECT.md
@@ -2,29 +2,16 @@
 
 ## What This Is
 
-DataVisor is an open-source dataset introspection tool for computer vision — an alternative to Voxel51. It combines a high-performance visual browser with VLM-powered agentic workflows to automatically discover dataset blind spots (poor lighting, rare occlusions, label errors). Built as a personal tool for exploring 100K+ image datasets with COCO format annotations.
+DataVisor is an open-source dataset introspection tool for computer vision — an alternative to Voxel51. It combines a high-performance visual browser with VLM-powered agentic workflows to automatically discover dataset blind spots (poor lighting, rare occlusions, label errors). Built as a personal tool for exploring 100K+ image datasets with COCO detection or JSONL classification annotations.
 
 ## Core Value
 
 A single tool that replaces scattered one-off scripts: load any CV dataset, visually browse with annotation overlays, compare ground truth against predictions, cluster via embeddings, and surface mistakes — all in one workflow.
 
-## Current Milestone: v1.2 Classification Dataset Support
-
-**Goal:** Add first-class support for single-label classification datasets with full feature parity to detection workflows.
-
-**Target features:**
-- Auto-detect dataset type (detection vs classification) from annotation format
-- JSONL classification ingestion (Roboflow format: image/prefix/suffix)
-- Grid browsing with class label overlays
-- Classification prediction import and GT vs predicted comparison
-- Classification-specific stats: accuracy, F1, per-class precision/recall, confusion matrix
-- Embedding visualization and clustering for classification datasets
-- Filter/search by class label
-
 ## Current State
 
-**Shipped:** v1.1 (2026-02-13)
-**Codebase:** ~32K LOC (16,256 Python + 15,924 TypeScript) across 14 phases
+**Shipped:** v1.2 (2026-02-19)
+**Codebase:** ~38K LOC (more than 16,256 Python and more than 15,924 TypeScript) across 17 phases
 **Architecture:** FastAPI + DuckDB + Qdrant (backend), Next.js + Tailwind + deck.gl + Recharts (frontend), Pydantic AI (agents), Moondream2 (VLM)
 
 ## Requirements
@@ -45,16 +32,17 @@ A single tool that replaces scattered one-off scripts: load any CV dataset, visu
 - Error triage: sample tagging, per-annotation TP/FP/FN via IoU, worst-images ranking, highlight mode — v1.1
 - Interactive discovery: confusion matrix, near-duplicates, histogram filtering, find-similar — v1.1
 - Keyboard shortcuts: 16 shortcuts across grid, modal, triage, editing — v1.1
+- Auto-detect dataset type (detection vs classification) from annotation format — v1.2
+- JSONL classification ingestion with multi-split support — v1.2
+- Grid browsing with class label badges for classification datasets — v1.2
+- Classification prediction import and GT vs predicted comparison — v1.2
+- Classification stats: accuracy, F1, per-class precision/recall, confusion matrix — v1.2
+- Embedding color modes (GT class, predicted class, correct/incorrect) — v1.2
+- Confusion matrix scaling to 43+ classes with threshold filtering — v1.2
 
 ### Active
 
-- [ ] Auto-detect dataset type from annotation format (COCO JSON → detection, JSONL → classification)
-- [ ] JSONL classification ingestion with multi-split support
-- [ ] Grid browsing with class label overlays for classification datasets
-- [ ] Classification prediction import and GT vs predicted comparison
-- [ ] Classification stats: accuracy, F1, per-class precision/recall, confusion matrix
-- [ ] Embedding visualization for classification datasets
-- [ ] Filter/search by class label
+(None — planning next milestone)
 
 ### Out of Scope
 
@@ -63,7 +51,7 @@ A single tool that replaces scattered one-off scripts: load any CV dataset, visu
 - Training pipeline integration — DataVisor inspects data, doesn't train
 - Mobile/tablet interface — desktop browser only
 - Full annotation editor (polygons, segmentation) — bounding box only
-- Multi-label classification — single-label per image only for v1.2
+- Multi-label classification — single-label per image only for now
 
 ## Constraints
 
@@ -90,6 +78,13 @@ A single tool that replaces scattered one-off scripts: load any CV dataset, visu
 | Pre-computed agent prompt | All data in prompt, no tool calls; avoids Pydantic AI request_limit issues | Good |
 | t-SNE over UMAP | umap-learn blocked by Python 3.14 numba incompatibility | Revisit when numba supports 3.14 |
 | Moondream2 via transformers | trust_remote_code with all_tied_weights_keys patch for transformers 5.x | Fragile — monitor updates |
+| Sentinel bbox values (0.0) for classification | Avoids 30+ null guards; unified schema for detection and classification | Good |
+| Separate classification evaluation service | ~50-line function vs modifying 560-line detection eval; clean separation | Good |
+| Dataset-type routing at endpoint level | Keep classification/detection services separate; route in router layer | Good |
+| Parser registry in IngestionService | Format-based dispatch to COCOParser or ClassificationJSONLParser | Good |
+| Threshold slider for confusion matrix | Hide noisy off-diagonal cells at high cardinality (0-50%, default 1%) | Good |
+| Client-side most-confused pairs | Derived from confusion matrix data; no new API endpoint needed | Good |
+| Tableau 20 palette for embeddings | Stable categorical coloring for class-based scatter modes | Good |
 
 ---
-*Last updated: 2026-02-18 after v1.2 milestone started*
+*Last updated: 2026-02-19 after v1.2 milestone shipped*
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index cb34b45..36db1ce 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -4,7 +4,7 @@
 
 - v1.0 MVP - Phases 1-7 (shipped 2026-02-12) — [archive](.planning/milestones/v1.0-ROADMAP.md)
 - v1.1 Deployment, Workflow & Competitive Parity - Phases 8-14 (shipped 2026-02-13) — [archive](.planning/milestones/v1.1-ROADMAP.md)
-- v1.2 Classification Dataset Support - Phases 15-17 (in progress)
+- v1.2 Classification Dataset Support - Phases 15-17 (shipped 2026-02-19) — [archive](.planning/milestones/v1.2-ROADMAP.md)
 
 ## Phases
 
@@ -74,53 +74,22 @@
 
 </details>
 
-### v1.2 Classification Dataset Support (In Progress)
-
-**Milestone Goal:** First-class single-label classification dataset support with full feature parity to detection workflows -- from ingestion through evaluation to polish.
+<details>
+<summary>v1.2 Classification Dataset Support (Phases 15-17) - SHIPPED 2026-02-19</summary>
 
-#### Phase 15: Classification Ingestion & Display
+### Phase 15: Classification Ingestion & Display
 **Goal**: Users can import, browse, and inspect classification datasets with the same ease as detection datasets
-**Depends on**: Phase 14 (existing codebase)
-**Requirements**: INGEST-01, INGEST-02, INGEST-03, INGEST-04, DISP-01, DISP-02, DISP-03, DISP-04
-**Success Criteria** (what must be TRUE):
-  1. User can point the ingestion wizard at a folder with JSONL annotations and images, and the system auto-detects it as a classification dataset
-  2. User can import multi-split classification datasets (train/valid/test) in a single operation, just like detection datasets
-  3. User sees class label badges on grid thumbnails instead of bounding box overlays when browsing a classification dataset
-  4. User sees GT class label prominently in the sample detail modal and can change it via a dropdown
-  5. Statistics dashboard shows classification-appropriate metrics (labeled images count, class distribution) with no detection-only elements visible (no bbox area histogram, no IoU slider)
 **Plans**: 2 plans (complete)
-Plans:
-- [x] 15-01-PLAN.md -- Backend: schema migration, ClassificationJSONLParser, FolderScanner detection, IngestionService dispatch, API endpoints
-- [x] 15-02-PLAN.md -- Frontend: type updates, grid class badges, detail modal class label/dropdown, classification-aware statistics
 
-#### Phase 16: Classification Evaluation
+### Phase 16: Classification Evaluation
 **Goal**: Users can import predictions and analyze classification model performance with accuracy, F1, confusion matrix, and error categorization
-**Depends on**: Phase 15
-**Requirements**: EVAL-01, EVAL-02, EVAL-03, EVAL-04, EVAL-05
-**Success Criteria** (what must be TRUE):
-  1. User can import classification predictions in JSONL format with confidence scores and see them alongside ground truth
-  2. User sees accuracy, macro F1, weighted F1, and per-class precision/recall/F1 metrics in the evaluation panel
-  3. User sees a confusion matrix and can click any cell to filter the grid to images with that GT/predicted class pair
-  4. User sees each image categorized as correct, misclassified, or missing prediction in the error analysis view
-  5. User sees GT vs predicted label comparison on grid thumbnails and in the detail modal
 **Plans**: 2 plans (complete)
-Plans:
-- [x] 16-01-PLAN.md -- Backend: classification prediction parser, evaluation service, error analysis service, endpoint routing
-- [x] 16-02-PLAN.md -- Frontend: types, hooks, prediction import dialog, evaluation panel, error analysis panel, grid badges
 
-#### Phase 17: Classification Polish
+### Phase 17: Classification Polish
 **Goal**: Classification workflows are production-ready for high-cardinality datasets (43+ classes) with visual aids that surface actionable insights
-**Depends on**: Phase 16
-**Requirements**: POLISH-01, POLISH-02, POLISH-03, POLISH-04
-**Success Criteria** (what must be TRUE):
-  1. Confusion matrix renders readably at 43+ classes with threshold filtering and overflow handling
-  2. User can color the embedding scatter plot by GT class, predicted class, or correct/incorrect status
-  3. User sees a ranked list of most-confused class pairs derived from the confusion matrix
-  4. User sees per-class performance sparklines with color-coded thresholds (green/yellow/red) in the metrics table
 **Plans**: 2 plans (complete)
-Plans:
-- [x] 17-01-PLAN.md -- Confusion matrix threshold/overflow, most-confused pairs, F1 bars in per-class table
-- [x] 17-02-PLAN.md -- Embedding scatter color modes (GT class, predicted class, correct/incorrect)
+
+</details>
 
 ## Progress
 
diff --git a/.planning/STATE.md b/.planning/STATE.md
index 06e59f0..ab9e078 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -2,17 +2,16 @@
 
 ## Project Reference
 
-See: .planning/PROJECT.md (updated 2026-02-18)
+See: .planning/PROJECT.md (updated 2026-02-19)
 
 **Core value:** A single tool that replaces scattered scripts: load any CV dataset, visually browse with annotation overlays, compare GT vs predictions, cluster via embeddings, and surface mistakes -- all in one workflow.
-**Current focus:** Milestone v1.2 Complete
+**Current focus:** Planning next milestone
 
 ## Current Position
 
-Phase: 17 of 17 (Classification Polish)
-Plan: 2 of 2 in current phase (COMPLETE)
-Status: Phase 17 Complete -- Milestone v1.2 Complete
-Last activity: 2026-02-18 -- Completed 17-02 (Embedding Color Modes)
+Phase: 17 of 17 (all milestones complete)
+Status: v1.2 archived, ready for next milestone
+Last activity: 2026-02-19 -- Completed v1.2 milestone archival
 
 Progress: [################################] 100% (v1.0 + v1.1 + v1.2 complete)
 
@@ -28,31 +27,15 @@ Progress: [################################] 100% (v1.0 + v1.1 + v1.2 complete)
 - Average duration: 3.7 min
 - Total execution time: 73 min
 
+**Velocity (v1.2):**
+- Total plans completed: 6
+- Timeline: 1 day (2026-02-18)
+
 ## Accumulated Context
 
 ### Decisions
 
 Decisions are logged in PROJECT.md Key Decisions table.
-Recent decisions affecting current work:
-
-- Schema approach: sentinel bbox values (0.0) over nullable columns -- avoids 30+ null guards
-- Separate classification evaluation function (~50 lines) vs modifying 560-line detection eval
-- Thread `datasetType` prop from page level, branch at component boundaries
-- Parser registry in IngestionService for format dispatch
-- Classification JSONL layouts checked before COCO (more specific first)
-- Classification gt_annotations = COUNT(DISTINCT sample_id) for labeled images
-- [Phase 15]: Thread datasetType from page level, branch at component boundaries with isClassification flag
-- [Phase 15]: Hide detection-only stats tabs for classification (Evaluation, Error Analysis, Worst Images, Intelligence)
-- [Phase 16]: Reuse ErrorAnalysisResponse model from detection for classification error analysis
-- [Phase 16]: Route by dataset_type at endpoint level, keeping classification/detection services separate
-- [Phase 16]: Remove response_model on evaluation endpoint for union return type support
-- [Phase 16]: Classification metric cards inline rather than reusing MetricsCards (different data shape)
-- [Phase 16]: Map backend error fields to classification labels: true_positives=correct, label_errors=misclassified
-- [Phase 17]: Threshold slider (0-50%, default 1%) hides noisy off-diagonal confusion cells
-- [Phase 17]: MostConfusedPairs derived client-side from confusion matrix (no new API endpoint)
-- [Phase 17]: F1Bar pure CSS bars with green/yellow/red thresholds at 0.8/0.5
-- [Phase 17]: LEFT JOIN annotations with MIN()+GROUP BY for one label per sample in coordinates
-- [Phase 17]: Color mode dropdown always visible; hasPredictions disables prediction-dependent modes
 
 ### Pending Todos
 
@@ -61,16 +44,15 @@ None.
 ### Blockers/Concerns
 
 - Confirm Roboflow JSONL format against actual export before finalizing parser
-- Confusion matrix at 43+ classes: solved with threshold filtering + compact mode (no canvas needed)
 
 ### Roadmap Evolution
 
 - v1.0: 7 phases (1-7), 21 plans -- shipped 2026-02-12
 - v1.1: 7 phases (8-14), 20 plans -- shipped 2026-02-13
-- v1.2: 3 phases (15-17), 6 plans -- shipped 2026-02-18
+- v1.2: 3 phases (15-17), 6 plans -- shipped 2026-02-19
 
 ## Session Continuity
 
-Last session: 2026-02-18
-Stopped at: Completed 17-02-PLAN.md (Embedding Color Modes)
+Last session: 2026-02-19
+Stopped at: Completed v1.2 milestone archival
 Resume file: None
diff --git a/.planning/REQUIREMENTS.md b/.planning/milestones/v1.2-REQUIREMENTS.md
similarity index 96%
rename from .planning/REQUIREMENTS.md
rename to .planning/milestones/v1.2-REQUIREMENTS.md
index 3d5dfc1..28bec2b 100644
--- a/.planning/REQUIREMENTS.md
+++ b/.planning/milestones/v1.2-REQUIREMENTS.md
@@ -1,3 +1,12 @@
+# Requirements Archive: v1.2 Classification Dataset Support
+
+**Archived:** 2026-02-19
+**Status:** SHIPPED
+
+For current requirements, see `.planning/REQUIREMENTS.md`.
+
+---
+
 # Requirements: DataVisor
 
 **Defined:** 2026-02-18
diff --git a/.planning/milestones/v1.2-ROADMAP.md b/.planning/milestones/v1.2-ROADMAP.md
new file mode 100644
index 0000000..cb34b45
--- /dev/null
+++ b/.planning/milestones/v1.2-ROADMAP.md
@@ -0,0 +1,145 @@
+# Roadmap: DataVisor
+
+## Milestones
+
+- v1.0 MVP - Phases 1-7 (shipped 2026-02-12) — [archive](v1.0-ROADMAP.md)
+- v1.1 Deployment, Workflow & Competitive Parity - Phases 8-14 (shipped 2026-02-13) — [archive](v1.1-ROADMAP.md)
+- v1.2 Classification Dataset Support - Phases 15-17 (shipped 2026-02-19)
+
+## Phases
+
+<details>
+<summary>v1.0 MVP (Phases 1-7) - SHIPPED 2026-02-12</summary>
+
+### Phase 1: Data Foundation
+**Goal**: DuckDB-backed streaming ingestion pipeline for COCO datasets at 100K+ scale
+**Plans**: 4 plans (complete)
+
+### Phase 2: Visual Grid
+**Goal**: Virtualized infinite-scroll grid with SVG annotation overlays
+**Plans**: 3 plans (complete)
+
+### Phase 3: Filtering & Search
+**Goal**: Full metadata filtering, search, saved views, and bulk tagging
+**Plans**: 2 plans (complete)
+
+### Phase 4: Predictions & Comparison
+**Goal**: Model prediction import with GT vs Predictions comparison
+**Plans**: 3 plans (complete)
+
+### Phase 5: Embeddings & Visualization
+**Goal**: DINOv2 embeddings with t-SNE reduction and deck.gl scatter plot
+**Plans**: 4 plans (complete)
+
+### Phase 6: Error Analysis & Similarity
+**Goal**: Error categorization pipeline and Qdrant-powered similarity search
+**Plans**: 2 plans (complete)
+
+### Phase 7: Intelligence & Agents
+**Goal**: Pydantic AI agent for error patterns and Moondream2 VLM auto-tagging
+**Plans**: 3 plans (complete)
+
+</details>
+
+<details>
+<summary>v1.1 Deployment, Workflow & Competitive Parity (Phases 8-14) - SHIPPED 2026-02-13</summary>
+
+### Phase 8: Docker Deployment & Auth
+**Goal**: Deployable Docker stack with single-user auth, accessible on cloud VM or locally
+**Plans**: 5 plans (complete)
+
+### Phase 9: Smart Ingestion
+**Goal**: No-code dataset import from folder path with auto-detection and confirmation
+**Plans**: 2 plans (complete)
+
+### Phase 10: Annotation Editing
+**Goal**: Move, resize, delete, and draw bounding boxes via react-konva in sample detail modal
+**Plans**: 3 plans (complete)
+
+### Phase 11: Error Triage
+**Goal**: Tag errors, highlight mode, and worst-images ranking with DuckDB persistence
+**Plans**: 2 plans (complete)
+
+### Phase 12: Interactive Viz & Discovery
+**Goal**: Confusion matrix, near-duplicates, interactive histograms, and find-similar
+**Plans**: 3 plans (complete)
+
+### Phase 13: Keyboard Shortcuts
+**Goal**: Keyboard navigation, triage hotkeys, edit shortcuts, and help overlay
+**Plans**: 2 plans (complete)
+
+### Phase 14: Per-Annotation Triage
+**Goal**: Auto-discover TP/FP/FN per bounding box via IoU overlap, color-coded boxes in detail modal, click to override classifications
+**Plans**: 3 plans (complete)
+
+</details>
+
+### v1.2 Classification Dataset Support (Shipped 2026-02-19)
+
+**Milestone Goal:** First-class single-label classification dataset support with full feature parity to detection workflows -- from ingestion through evaluation to polish.
+
+#### Phase 15: Classification Ingestion & Display
+**Goal**: Users can import, browse, and inspect classification datasets with the same ease as detection datasets
+**Depends on**: Phase 14 (existing codebase)
+**Requirements**: INGEST-01, INGEST-02, INGEST-03, INGEST-04, DISP-01, DISP-02, DISP-03, DISP-04
+**Success Criteria** (what must be TRUE):
+  1. User can point the ingestion wizard at a folder with JSONL annotations and images, and the system auto-detects it as a classification dataset
+  2. User can import multi-split classification datasets (train/valid/test) in a single operation, just like detection datasets
+  3. User sees class label badges on grid thumbnails instead of bounding box overlays when browsing a classification dataset
+  4. User sees GT class label prominently in the sample detail modal and can change it via a dropdown
+  5. Statistics dashboard shows classification-appropriate metrics (labeled images count, class distribution) with no detection-only elements visible (no bbox area histogram, no IoU slider)
+**Plans**: 2 plans (complete)
+Plans:
+- [x] 15-01-PLAN.md -- Backend: schema migration, ClassificationJSONLParser, FolderScanner detection, IngestionService dispatch, API endpoints
+- [x] 15-02-PLAN.md -- Frontend: type updates, grid class badges, detail modal class label/dropdown, classification-aware statistics
+
+#### Phase 16: Classification Evaluation
+**Goal**: Users can import predictions and analyze classification model performance with accuracy, F1, confusion matrix, and error categorization
+**Depends on**: Phase 15
+**Requirements**: EVAL-01, EVAL-02, EVAL-03, EVAL-04, EVAL-05
+**Success Criteria** (what must be TRUE):
+  1. User can import classification predictions in JSONL format with confidence scores and see them alongside ground truth
+  2. User sees accuracy, macro F1, weighted F1, and per-class precision/recall/F1 metrics in the evaluation panel
+  3. User sees a confusion matrix and can click any cell to filter the grid to images with that GT/predicted class pair
+  4. User sees each image categorized as correct, misclassified, or missing prediction in the error analysis view
+  5. User sees GT vs predicted label comparison on grid thumbnails and in the detail modal
+**Plans**: 2 plans (complete)
+Plans:
+- [x] 16-01-PLAN.md -- Backend: classification prediction parser, evaluation service, error analysis service, endpoint routing
+- [x] 16-02-PLAN.md -- Frontend: types, hooks, prediction import dialog, evaluation panel, error analysis panel, grid badges
+
+#### Phase 17: Classification Polish
+**Goal**: Classification workflows are production-ready for high-cardinality datasets (43+ classes) with visual aids that surface actionable insights
+**Depends on**: Phase 16
+**Requirements**: POLISH-01, POLISH-02, POLISH-03, POLISH-04
+**Success Criteria** (what must be TRUE):
+  1. Confusion matrix renders readably at 43+ classes with threshold filtering and overflow handling
+  2. User can color the embedding scatter plot by GT class, predicted class, or correct/incorrect status
+  3. User sees a ranked list of most-confused class pairs derived from the confusion matrix
+  4. User sees per-class performance sparklines with color-coded thresholds (green/yellow/red) in the metrics table
+**Plans**: 2 plans (complete)
+Plans:
+- [x] 17-01-PLAN.md -- Confusion matrix threshold/overflow, most-confused pairs, F1 bars in per-class table
+- [x] 17-02-PLAN.md -- Embedding scatter color modes (GT class, predicted class, correct/incorrect)
+
+## Progress
+
+| Phase | Milestone | Plans Complete | Status | Completed |
+|-------|-----------|----------------|--------|-----------|
+| 1. Data Foundation | v1.0 | 4/4 | Complete | 2026-02-10 |
+| 2. Visual Grid | v1.0 | 3/3 | Complete | 2026-02-10 |
+| 3. Filtering & Search | v1.0 | 2/2 | Complete | 2026-02-11 |
+| 4. Predictions & Comparison | v1.0 | 3/3 | Complete | 2026-02-11 |
+| 5. Embeddings & Visualization | v1.0 | 4/4 | Complete | 2026-02-11 |
+| 6. Error Analysis & Similarity | v1.0 | 2/2 | Complete | 2026-02-12 |
+| 7. Intelligence & Agents | v1.0 | 3/3 | Complete | 2026-02-12 |
+| 8. Docker Deployment & Auth | v1.1 | 5/5 | Complete | 2026-02-12 |
+| 9. Smart Ingestion | v1.1 | 2/2 | Complete | 2026-02-12 |
+| 10. Annotation Editing | v1.1 | 3/3 | Complete | 2026-02-12 |
+| 11. Error Triage | v1.1 | 2/2 | Complete | 2026-02-12 |
+| 12. Interactive Viz & Discovery | v1.1 | 3/3 | Complete | 2026-02-13 |
+| 13. Keyboard Shortcuts | v1.1 | 2/2 | Complete | 2026-02-13 |
+| 14. Per-Annotation Triage | v1.1 | 3/3 | Complete | 2026-02-13 |
+| 15. Classification Ingestion & Display | v1.2 | 2/2 | Complete | 2026-02-18 |
+| 16. Classification Evaluation | v1.2 | 2/2 | Complete | 2026-02-18 |
+| 17. Classification Polish | v1.2 | 2/2 | Complete | 2026-02-18 |

From bc50e0a90a89dc8d8395bd40cb32f31eae04e778 Mon Sep 17 00:00:00 2001
From: "Enrique G. Ortiz" <ortizeg@gmail.com>
Date: Wed, 25 Feb 2026 15:10:16 -0500
Subject: [PATCH 38/38] feat: add delete predictions endpoint and persist stats
 selection state

Add DELETE /datasets/{id}/predictions/{run_name} API endpoint with
ground_truth guard and prediction_count recalculation. Add trash button
to source dropdowns in evaluation, error analysis, and intelligence
panels. Lift statsSubTab and statsSource into ui-store so selection
persists across sub-tab switches.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 app/routers/datasets.py                       | 48 ++++++++++++++++
 .../components/stats/error-analysis-panel.tsx | 55 +++++++++++++++---
 .../src/components/stats/evaluation-panel.tsx | 56 +++++++++++++++----
 .../components/stats/intelligence-panel.tsx   | 41 +++++++++++---
 .../src/components/stats/stats-dashboard.tsx  |  6 +-
 .../components/triage/worst-images-panel.tsx  | 13 +++--
 frontend/src/hooks/use-delete-predictions.ts  | 20 +++++++
 frontend/src/stores/ui-store.ts               | 15 +++++
 8 files changed, 220 insertions(+), 34 deletions(-)
 create mode 100644 frontend/src/hooks/use-delete-predictions.ts

diff --git a/app/routers/datasets.py b/app/routers/datasets.py
index 99ae72f..760e947 100644
--- a/app/routers/datasets.py
+++ b/app/routers/datasets.py
@@ -323,6 +323,54 @@ def import_predictions(
     )
 
 
+@router.delete("/{dataset_id}/predictions/{run_name}", status_code=204)
+def delete_predictions(
+    dataset_id: str,
+    run_name: str,
+    db: DuckDBRepo = Depends(get_db),
+) -> None:
+    """Delete all prediction annotations for a specific run."""
+    if run_name == "ground_truth":  # guard: this source is excluded from the prediction count below, so it is not a prediction run
+        raise HTTPException(status_code=400, detail="Cannot delete ground_truth annotations")
+
+    cursor = db.connection.cursor()
+    try:
+        # Verify dataset exists; an unknown dataset_id yields a 404
+        row = cursor.execute(
+            "SELECT id FROM datasets WHERE id = ?", [dataset_id]
+        ).fetchone()
+        if row is None:
+            raise HTTPException(status_code=404, detail="Dataset not found")
+
+        # Check annotations exist for this run; a count of 0 yields a 404, and the count is reused in the log message below
+        count = cursor.execute(
+            "SELECT COUNT(*) FROM annotations WHERE dataset_id = ? AND source = ?",
+            [dataset_id, run_name],
+        ).fetchone()[0]
+        if count == 0:
+            raise HTTPException(status_code=404, detail=f"No predictions found for run '{run_name}'")
+
+        # Delete annotations for this run. NOTE(review): no explicit transaction spans this DELETE and the UPDATE below -- confirm the connection's commit behavior
+        cursor.execute(
+            "DELETE FROM annotations WHERE dataset_id = ? AND source = ?",
+            [dataset_id, run_name],
+        )
+
+        # Recalculate prediction_count as the number of remaining annotations from any non-ground_truth source
+        pred_count = cursor.execute(
+            "SELECT COUNT(*) FROM annotations WHERE dataset_id = ? AND source != 'ground_truth'",
+            [dataset_id],
+        ).fetchone()[0]
+        cursor.execute(
+            "UPDATE datasets SET prediction_count = ? WHERE id = ?",
+            [pred_count, dataset_id],
+        )
+    finally:  # always release the cursor, including when an HTTPException is raised above
+        cursor.close()
+
+    logger.info("Dataset %s: deleted %d predictions for run '%s'", dataset_id, count, run_name)
+
+
 @router.delete("/{dataset_id}", status_code=204)
 def delete_dataset(
     dataset_id: str,
diff --git a/frontend/src/components/stats/error-analysis-panel.tsx b/frontend/src/components/stats/error-analysis-panel.tsx
index f852715..022ef67 100644
--- a/frontend/src/components/stats/error-analysis-panel.tsx
+++ b/frontend/src/components/stats/error-analysis-panel.tsx
@@ -10,7 +10,7 @@
  * For classification: shows Correct, Misclassified, and Missing Prediction categories.
  */
 
-import { useState, useEffect, useMemo } from "react";
+import { useState, useEffect, useMemo, useCallback } from "react";
 import {
   BarChart,
   Bar,
@@ -24,6 +24,8 @@ import {
 
 import { useFilterFacets } from "@/hooks/use-filter-facets";
 import { useErrorAnalysis } from "@/hooks/use-error-analysis";
+import { useUIStore } from "@/stores/ui-store";
+import { useDeletePredictions } from "@/hooks/use-delete-predictions";
 import { ErrorSamplesGrid } from "@/components/stats/error-samples-grid";
 
 interface ErrorAnalysisPanelProps {
@@ -86,23 +88,38 @@ export function ErrorAnalysisPanel({ datasetId, split, datasetType }: ErrorAnaly
     [facets],
   );
 
-  const [source, setSource] = useState("prediction");
+  const source = useUIStore((s) => s.statsSource);
+  const setSource = useUIStore((s) => s.setStatsSource);
+  const deleteMutation = useDeletePredictions(datasetId);
   const [iouThreshold, setIouThreshold] = useState(0.5);
   const [confThreshold, setConfThreshold] = useState(0.25);
 
   // Auto-select first available source
   useEffect(() => {
-    if (predSources.length > 0 && !predSources.includes(source)) {
+    if (predSources.length > 0 && (!source || !predSources.includes(source))) {
       setSource(predSources[0]);
     }
-  }, [predSources, source]);
+  }, [predSources, source, setSource]);
+
+  const effectiveSource = source ?? "prediction";
+
+  const handleDeleteSource = useCallback(() => {
+    if (!source || source === "ground_truth") return;
+    if (!window.confirm(`Delete all predictions for "${source}"?`)) return;
+    deleteMutation.mutate(source, {
+      onSuccess: () => {
+        const remaining = predSources.filter((s) => s !== source);
+        setSource(remaining.length > 0 ? remaining[0] : null);
+      },
+    });
+  }, [source, predSources, deleteMutation, setSource]);
 
   const debouncedIou = useDebouncedValue(iouThreshold, 300);
   const debouncedConf = useDebouncedValue(confThreshold, 300);
 
   const { data, isLoading } = useErrorAnalysis(
     datasetId,
-    source,
+    effectiveSource,
     debouncedIou,
     debouncedConf,
     split,
@@ -135,13 +152,13 @@ export function ErrorAnalysisPanel({ datasetId, split, datasetType }: ErrorAnaly
       <div className="space-y-6">
         {/* Controls Bar -- no IoU slider for classification */}
         <div className="flex flex-wrap items-center gap-6 rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
-          {/* Source dropdown */}
+          {/* Source dropdown + delete */}
           <div className="flex items-center gap-2">
             <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
               Source:
             </label>
             <select
-              value={source}
+              value={effectiveSource}
               onChange={(e) => setSource(e.target.value)}
               className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-sm px-2 py-1 text-zinc-900 dark:text-zinc-100"
             >
@@ -151,6 +168,16 @@ export function ErrorAnalysisPanel({ datasetId, split, datasetType }: ErrorAnaly
                 </option>
               ))}
             </select>
+            <button
+              onClick={handleDeleteSource}
+              disabled={deleteMutation.isPending || !source}
+              title="Delete this prediction run"
+              className="p-1 rounded text-zinc-400 hover:text-red-500 hover:bg-red-50 dark:hover:bg-red-950/30 disabled:opacity-40 disabled:cursor-not-allowed transition-colors"
+            >
+              <svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={1.5}>
+                <path strokeLinecap="round" strokeLinejoin="round" d="m14.74 9-.346 9m-4.788 0L9.26 9m9.968-3.21c.342.052.682.107 1.022.166m-1.022-.165L18.16 19.673a2.25 2.25 0 0 1-2.244 2.077H8.084a2.25 2.25 0 0 1-2.244-2.077L4.772 5.79m14.456 0a48.108 48.108 0 0 0-3.478-.397m-12 .562c.34-.059.68-.114 1.022-.165m0 0a48.11 48.11 0 0 1 3.478-.397m7.5 0v-.916c0-1.18-.91-2.164-2.09-2.201a51.964 51.964 0 0 0-3.32 0c-1.18.037-2.09 1.022-2.09 2.201v.916m7.5 0a48.667 48.667 0 0 0-7.5 0" />
+              </svg>
+            </button>
           </div>
 
           {/* Confidence slider */}
@@ -320,13 +347,13 @@ export function ErrorAnalysisPanel({ datasetId, split, datasetType }: ErrorAnaly
     <div className="space-y-6">
       {/* Controls Bar */}
       <div className="flex flex-wrap items-center gap-6 rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
-        {/* Source dropdown */}
+        {/* Source dropdown + delete */}
         <div className="flex items-center gap-2">
           <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
             Source:
           </label>
           <select
-            value={source}
+            value={effectiveSource}
             onChange={(e) => setSource(e.target.value)}
             className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-sm px-2 py-1 text-zinc-900 dark:text-zinc-100"
           >
@@ -336,6 +363,16 @@ export function ErrorAnalysisPanel({ datasetId, split, datasetType }: ErrorAnaly
               </option>
             ))}
           </select>
+          <button
+            onClick={handleDeleteSource}
+            disabled={deleteMutation.isPending || !source}
+            title="Delete this prediction run"
+            className="p-1 rounded text-zinc-400 hover:text-red-500 hover:bg-red-50 dark:hover:bg-red-950/30 disabled:opacity-40 disabled:cursor-not-allowed transition-colors"
+          >
+            <svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={1.5}>
+              <path strokeLinecap="round" strokeLinejoin="round" d="m14.74 9-.346 9m-4.788 0L9.26 9m9.968-3.21c.342.052.682.107 1.022.166m-1.022-.165L18.16 19.673a2.25 2.25 0 0 1-2.244 2.077H8.084a2.25 2.25 0 0 1-2.244-2.077L4.772 5.79m14.456 0a48.108 48.108 0 0 0-3.478-.397m-12 .562c.34-.059.68-.114 1.022-.165m0 0a48.11 48.11 0 0 1 3.478-.397m7.5 0v-.916c0-1.18-.91-2.164-2.09-2.201a51.964 51.964 0 0 0-3.32 0c-1.18.037-2.09 1.022-2.09 2.201v.916m7.5 0a48.667 48.667 0 0 0-7.5 0" />
+            </svg>
+          </button>
         </div>
 
         {/* IoU slider */}
diff --git a/frontend/src/components/stats/evaluation-panel.tsx b/frontend/src/components/stats/evaluation-panel.tsx
index 08754ae..839397e 100644
--- a/frontend/src/components/stats/evaluation-panel.tsx
+++ b/frontend/src/components/stats/evaluation-panel.tsx
@@ -17,6 +17,7 @@ import { useFilteredEvaluation } from "@/hooks/use-filtered-evaluation";
 import { fetchConfusionCellSamples } from "@/hooks/use-confusion-cell";
 import { useFilterStore } from "@/stores/filter-store";
 import { useUIStore } from "@/stores/ui-store";
+import { useDeletePredictions } from "@/hooks/use-delete-predictions";
 import { MetricsCards } from "@/components/stats/metrics-cards";
 import { PRCurveChart } from "@/components/stats/pr-curve-chart";
 import { ConfusionMatrix } from "@/components/stats/confusion-matrix";
@@ -284,23 +285,38 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
     [facets],
   );
 
-  const [source, setSource] = useState("prediction");
+  const source = useUIStore((s) => s.statsSource);
+  const setSource = useUIStore((s) => s.setStatsSource);
+  const deleteMutation = useDeletePredictions(datasetId);
   const [iouThreshold, setIouThreshold] = useState(0.5);
   const [confThreshold, setConfThreshold] = useState(0.25);
 
   // Auto-select first available source
   useEffect(() => {
-    if (predSources.length > 0 && !predSources.includes(source)) {
+    if (predSources.length > 0 && (!source || !predSources.includes(source))) {
       setSource(predSources[0]);
     }
-  }, [predSources, source]);
+  }, [predSources, source, setSource]);
+
+  const handleDeleteSource = useCallback(() => {
+    if (!source || source === "ground_truth") return;
+    if (!window.confirm(`Delete all predictions for "${source}"?`)) return;
+    deleteMutation.mutate(source, {
+      onSuccess: () => {
+        const remaining = predSources.filter((s) => s !== source);
+        setSource(remaining.length > 0 ? remaining[0] : null);
+      },
+    });
+  }, [source, predSources, deleteMutation, setSource]);
 
   const debouncedIou = useDebouncedValue(iouThreshold, 300);
   const debouncedConf = useDebouncedValue(confThreshold, 300);
 
+  const effectiveSource = source ?? "prediction";
+
   const { data: rawData, isLoading } = useEvaluation(
     datasetId,
-    source,
+    effectiveSource,
     debouncedIou,
     debouncedConf,
     split,
@@ -314,7 +330,7 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
           datasetId,
           actualClass,
           predictedClass,
-          source,
+          effectiveSource,
           debouncedIou,
           debouncedConf,
           split,
@@ -325,7 +341,7 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
         console.error("Failed to fetch confusion cell samples:", err);
       }
     },
-    [datasetId, source, debouncedIou, debouncedConf, split],
+    [datasetId, effectiveSource, debouncedIou, debouncedConf, split],
   );
 
   // Classification evaluation layout
@@ -335,13 +351,13 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
       <div className="space-y-6">
         {/* Controls Bar -- no IoU slider for classification */}
         <div className="flex flex-wrap items-center gap-6 rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
-          {/* Source dropdown */}
+          {/* Source dropdown + delete */}
           <div className="flex items-center gap-2">
             <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
               Source:
             </label>
             <select
-              value={source}
+              value={effectiveSource}
               onChange={(e) => setSource(e.target.value)}
               className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-sm px-2 py-1 text-zinc-900 dark:text-zinc-100"
             >
@@ -351,6 +367,16 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
                 </option>
               ))}
             </select>
+            <button
+              onClick={handleDeleteSource}
+              disabled={deleteMutation.isPending || !source}
+              title="Delete this prediction run"
+              className="p-1 rounded text-zinc-400 hover:text-red-500 hover:bg-red-50 dark:hover:bg-red-950/30 disabled:opacity-40 disabled:cursor-not-allowed transition-colors"
+            >
+              <svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={1.5}>
+                <path strokeLinecap="round" strokeLinejoin="round" d="m14.74 9-.346 9m-4.788 0L9.26 9m9.968-3.21c.342.052.682.107 1.022.166m-1.022-.165L18.16 19.673a2.25 2.25 0 0 1-2.244 2.077H8.084a2.25 2.25 0 0 1-2.244-2.077L4.772 5.79m14.456 0a48.108 48.108 0 0 0-3.478-.397m-12 .562c.34-.059.68-.114 1.022-.165m0 0a48.11 48.11 0 0 1 3.478-.397m7.5 0v-.916c0-1.18-.91-2.164-2.09-2.201a51.964 51.964 0 0 0-3.32 0c-1.18.037-2.09 1.022-2.09 2.201v.916m7.5 0a48.667 48.667 0 0 0-7.5 0" />
+              </svg>
+            </button>
           </div>
 
           {/* Confidence slider */}
@@ -423,13 +449,13 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
     <div className="space-y-6">
       {/* Controls Bar */}
       <div className="flex flex-wrap items-center gap-6 rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
-        {/* Source dropdown */}
+        {/* Source dropdown + delete */}
         <div className="flex items-center gap-2">
           <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
             Source:
           </label>
           <select
-            value={source}
+            value={effectiveSource}
             onChange={(e) => setSource(e.target.value)}
             className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-sm px-2 py-1 text-zinc-900 dark:text-zinc-100"
           >
@@ -439,6 +465,16 @@ export function EvaluationPanel({ datasetId, split, excludedClasses, datasetType
               </option>
             ))}
           </select>
+          <button
+            onClick={handleDeleteSource}
+            disabled={deleteMutation.isPending || !source}
+            title="Delete this prediction run"
+            className="p-1 rounded text-zinc-400 hover:text-red-500 hover:bg-red-50 dark:hover:bg-red-950/30 disabled:opacity-40 disabled:cursor-not-allowed transition-colors"
+          >
+            <svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={1.5}>
+              <path strokeLinecap="round" strokeLinejoin="round" d="m14.74 9-.346 9m-4.788 0L9.26 9m9.968-3.21c.342.052.682.107 1.022.166m-1.022-.165L18.16 19.673a2.25 2.25 0 0 1-2.244 2.077H8.084a2.25 2.25 0 0 1-2.244-2.077L4.772 5.79m14.456 0a48.108 48.108 0 0 0-3.478-.397m-12 .562c.34-.059.68-.114 1.022-.165m0 0a48.11 48.11 0 0 1 3.478-.397m7.5 0v-.916c0-1.18-.91-2.164-2.09-2.201a51.964 51.964 0 0 0-3.32 0c-1.18.037-2.09 1.022-2.09 2.201v.916m7.5 0a48.667 48.667 0 0 0-7.5 0" />
+            </svg>
+          </button>
         </div>
 
         {/* IoU slider */}
diff --git a/frontend/src/components/stats/intelligence-panel.tsx b/frontend/src/components/stats/intelligence-panel.tsx
index 04186ef..08784d6 100644
--- a/frontend/src/components/stats/intelligence-panel.tsx
+++ b/frontend/src/components/stats/intelligence-panel.tsx
@@ -8,10 +8,12 @@
  * severity badges, and prioritized recommendations with category badges.
  */
 
-import { useState, useEffect, useMemo } from "react";
+import { useState, useEffect, useMemo, useCallback } from "react";
 
 import { useFilterFacets } from "@/hooks/use-filter-facets";
 import { useAgentAnalysis } from "@/hooks/use-agent-analysis";
+import { useUIStore } from "@/stores/ui-store";
+import { useDeletePredictions } from "@/hooks/use-delete-predictions";
 import type { AnalysisReport } from "@/types/agent";
 import type { PatternInsight, Recommendation } from "@/types/agent";
 
@@ -198,16 +200,31 @@ export function IntelligencePanel({ datasetId }: IntelligencePanelProps) {
     [facets],
   );
 
-  const [source, setSource] = useState("prediction");
+  const source = useUIStore((s) => s.statsSource);
+  const setSource = useUIStore((s) => s.setStatsSource);
+  const deleteMutation = useDeletePredictions(datasetId);
   const [iouThreshold, setIouThreshold] = useState(0.5);
   const [confThreshold, setConfThreshold] = useState(0.25);
 
   // Auto-select first available source
   useEffect(() => {
-    if (predSources.length > 0 && !predSources.includes(source)) {
+    if (predSources.length > 0 && (!source || !predSources.includes(source))) {
       setSource(predSources[0]);
     }
-  }, [predSources, source]);
+  }, [predSources, source, setSource]);
+
+  const effectiveSource = source ?? "prediction";
+
+  const handleDeleteSource = useCallback(() => {
+    if (!source || source === "ground_truth") return;
+    if (!window.confirm(`Delete all predictions for "${source}"?`)) return;
+    deleteMutation.mutate(source, {
+      onSuccess: () => {
+        const remaining = predSources.filter((s) => s !== source);
+        setSource(remaining.length > 0 ? remaining[0] : null);
+      },
+    });
+  }, [source, predSources, deleteMutation, setSource]);
 
   const debouncedIou = useDebouncedValue(iouThreshold, 300);
   const debouncedConf = useDebouncedValue(confThreshold, 300);
@@ -215,7 +232,7 @@ export function IntelligencePanel({ datasetId }: IntelligencePanelProps) {
   const handleAnalyze = () => {
     mutation.mutate({
       datasetId,
-      source,
+      source: effectiveSource,
       iouThreshold: debouncedIou,
       confThreshold: debouncedConf,
     });
@@ -228,13 +245,13 @@ export function IntelligencePanel({ datasetId }: IntelligencePanelProps) {
     <div className="space-y-6">
       {/* Header + Controls */}
       <div className="flex flex-wrap items-center gap-6 rounded-lg border border-zinc-200 dark:border-zinc-700 bg-white dark:bg-zinc-900 p-4">
-        {/* Source dropdown */}
+        {/* Source dropdown + delete */}
         <div className="flex items-center gap-2">
           <label className="text-sm font-medium text-zinc-700 dark:text-zinc-300">
             Source:
           </label>
           <select
-            value={source}
+            value={effectiveSource}
             onChange={(e) => setSource(e.target.value)}
             className="rounded border border-zinc-300 dark:border-zinc-600 bg-white dark:bg-zinc-800 text-sm px-2 py-1 text-zinc-900 dark:text-zinc-100"
           >
@@ -244,6 +261,16 @@ export function IntelligencePanel({ datasetId }: IntelligencePanelProps) {
               </option>
             ))}
           </select>
+          <button
+            onClick={handleDeleteSource}
+            disabled={deleteMutation.isPending || !source}
+            title="Delete this prediction run"
+            className="p-1 rounded text-zinc-400 hover:text-red-500 hover:bg-red-50 dark:hover:bg-red-950/30 disabled:opacity-40 disabled:cursor-not-allowed transition-colors"
+          >
+            <svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={1.5}>
+              <path strokeLinecap="round" strokeLinejoin="round" d="m14.74 9-.346 9m-4.788 0L9.26 9m9.968-3.21c.342.052.682.107 1.022.166m-1.022-.165L18.16 19.673a2.25 2.25 0 0 1-2.244 2.077H8.084a2.25 2.25 0 0 1-2.244-2.077L4.772 5.79m14.456 0a48.108 48.108 0 0 0-3.478-.397m-12 .562c.34-.059.68-.114 1.022-.165m0 0a48.11 48.11 0 0 1 3.478-.397m7.5 0v-.916c0-1.18-.91-2.164-2.09-2.201a51.964 51.964 0 0 0-3.32 0c-1.18.037-2.09 1.022-2.09 2.201v.916m7.5 0a48.667 48.667 0 0 0-7.5 0" />
+            </svg>
+          </button>
         </div>
 
         {/* IoU slider */}
diff --git a/frontend/src/components/stats/stats-dashboard.tsx b/frontend/src/components/stats/stats-dashboard.tsx
index 39e2deb..b26af41 100644
--- a/frontend/src/components/stats/stats-dashboard.tsx
+++ b/frontend/src/components/stats/stats-dashboard.tsx
@@ -16,6 +16,7 @@ import { useState, useMemo, useCallback } from "react";
 import { useStatistics } from "@/hooks/use-statistics";
 import { useFilterFacets } from "@/hooks/use-filter-facets";
 import { useSplit, useFilterStore } from "@/stores/filter-store";
+import { useUIStore } from "@/stores/ui-store";
 import { AnnotationSummary } from "@/components/stats/annotation-summary";
 import { ClassDistribution } from "@/components/stats/class-distribution";
 import { ClassFilter } from "@/components/stats/class-filter";
@@ -31,8 +32,6 @@ interface StatsDashboardProps {
   datasetType?: string;
 }
 
-type SubTab = "overview" | "evaluation" | "error_analysis" | "worst_images" | "near_duplicates" | "intelligence";
-
 function SkeletonCard() {
   return (
     <div className="rounded-lg border border-zinc-200 dark:border-zinc-700 p-4 bg-white dark:bg-zinc-900 animate-pulse">
@@ -56,7 +55,8 @@ export function StatsDashboard({ datasetId, datasetType }: StatsDashboardProps)
   const setSplit = useFilterStore((s) => s.setSplit);
   const { data: facets } = useFilterFacets(datasetId);
   const { data: stats, isLoading, error } = useStatistics(datasetId, split);
-  const [activeTab, setActiveTab] = useState<SubTab>("overview");
+  const activeTab = useUIStore((s) => s.statsSubTab);
+  const setActiveTab = useUIStore((s) => s.setStatsSubTab);
   const [excludedClasses, setExcludedClasses] = useState<Set<string>>(new Set());
 
   const availableSplits = facets?.splits.map((s) => s.name) ?? [];
diff --git a/frontend/src/components/triage/worst-images-panel.tsx b/frontend/src/components/triage/worst-images-panel.tsx
index fdfc4b3..88fde0d 100644
--- a/frontend/src/components/triage/worst-images-panel.tsx
+++ b/frontend/src/components/triage/worst-images-panel.tsx
@@ -42,23 +42,26 @@ export function WorstImagesPanel({ datasetId, split }: WorstImagesPanelProps) {
     [facets],
   );
 
-  const [source, setSource] = useState("prediction");
+  const source = useUIStore((s) => s.statsSource);
+  const setSource = useUIStore((s) => s.setStatsSource);
   const [iouThreshold, setIouThreshold] = useState(0.5);
   const [confThreshold, setConfThreshold] = useState(0.25);
 
   // Auto-select first available source
   useEffect(() => {
-    if (predSources.length > 0 && !predSources.includes(source)) {
+    if (predSources.length > 0 && (!source || !predSources.includes(source))) {
       setSource(predSources[0]);
     }
-  }, [predSources, source]);
+  }, [predSources, source, setSource]);
+
+  const effectiveSource = source ?? "prediction";
 
   const debouncedIou = useDebouncedValue(iouThreshold, 300);
   const debouncedConf = useDebouncedValue(confThreshold, 300);
 
   const { data, isLoading } = useWorstImages(
     datasetId,
-    source,
+    effectiveSource,
     debouncedIou,
     debouncedConf,
     split,
@@ -77,7 +80,7 @@ export function WorstImagesPanel({ datasetId, split }: WorstImagesPanelProps) {
             Source:
           </label>
           <select
-            value={source}
+            value={effectiveSource}
             onChange={(e) => setSource(e.target.value)}
             className="rounded border border-zinc-300 bg-white px-2 py-1 text-sm text-zinc-900 dark:border-zinc-600 dark:bg-zinc-800 dark:text-zinc-100"
           >
diff --git a/frontend/src/hooks/use-delete-predictions.ts b/frontend/src/hooks/use-delete-predictions.ts
new file mode 100644
index 0000000..5d3ce2c
--- /dev/null
+++ b/frontend/src/hooks/use-delete-predictions.ts
@@ -0,0 +1,20 @@
+import { useMutation, useQueryClient } from "@tanstack/react-query";
+
+import { apiDelete } from "@/lib/api";
+/** Mutation hook: calls `apiDelete` for one prediction run (by name) of `datasetId`, then invalidates the query keys listed in `onSuccess`. */
+export function useDeletePredictions(datasetId: string) {
+  const qc = useQueryClient();
+
+  return useMutation({
+    mutationFn: (runName: string) =>
+      apiDelete(`/datasets/${datasetId}/predictions/${encodeURIComponent(runName)}`), // runName is URI-encoded; datasetId is interpolated as-is
+    onSuccess: () => {
+      qc.invalidateQueries({ queryKey: ["dataset", datasetId] });
+      qc.invalidateQueries({ queryKey: ["filter-facets", datasetId] });
+      qc.invalidateQueries({ queryKey: ["annotations-batch"] }); // key carries no datasetId
+      qc.invalidateQueries({ queryKey: ["evaluation"] }); // key carries no datasetId
+      qc.invalidateQueries({ queryKey: ["statistics"] }); // key carries no datasetId
+      qc.invalidateQueries({ queryKey: ["embedding-coordinates", datasetId] });
+    },
+  });
+}
diff --git a/frontend/src/stores/ui-store.ts b/frontend/src/stores/ui-store.ts
index 0a0e540..174572f 100644
--- a/frontend/src/stores/ui-store.ts
+++ b/frontend/src/stores/ui-store.ts
@@ -13,6 +13,9 @@ import { DEFAULT_COLUMNS } from "@/lib/constants";
 /** Which tab is active on the dataset page. */
 export type DatasetTab = "grid" | "statistics" | "embeddings";
 
+/** Which sub-tab is active within the Statistics dashboard (the type of `UIState.statsSubTab`). */
+export type StatsSubTab = "overview" | "evaluation" | "error_analysis" | "worst_images" | "near_duplicates" | "intelligence";
+
 interface UIState {
   /** Currently selected sample ID for the detail modal. */
   selectedSampleId: string | null;
@@ -36,6 +39,10 @@ interface UIState {
   focusedGridIndex: number | null;
   /** Whether the keyboard shortcuts help overlay is open. */
   isHelpOverlayOpen: boolean;
+  /** Which statistics sub-tab is active. */
+  statsSubTab: StatsSubTab;
+  /** Selected prediction source across statistics sub-tabs. null = not yet selected. */
+  statsSource: string | null;
 
   /** Open the detail modal for a given sample. */
   openDetailModal: (sampleId: string) => void;
@@ -61,6 +68,10 @@ interface UIState {
   setFocusedGridIndex: (index: number | null) => void;
   /** Toggle the keyboard shortcuts help overlay. */
   toggleHelpOverlay: () => void;
+  /** Set the active statistics sub-tab. */
+  setStatsSubTab: (tab: StatsSubTab) => void;
+  /** Set the selected prediction source for statistics panels. */
+  setStatsSource: (source: string | null) => void;
 }
 
 export const useUIStore = create<UIState>((set) => ({
@@ -75,6 +86,8 @@ export const useUIStore = create<UIState>((set) => ({
   isHighlightMode: false,
   focusedGridIndex: null,
   isHelpOverlayOpen: false,
+  statsSubTab: "overview",
+  statsSource: null,
 
   openDetailModal: (sampleId) =>
     set({ selectedSampleId: sampleId, isDetailModalOpen: true }),
@@ -126,4 +139,6 @@ export const useUIStore = create<UIState>((set) => ({
   setFocusedGridIndex: (index) => set({ focusedGridIndex: index }),
   toggleHelpOverlay: () =>
     set((state) => ({ isHelpOverlayOpen: !state.isHelpOverlayOpen })),
+  setStatsSubTab: (tab) => set({ statsSubTab: tab }),
+  setStatsSource: (source) => set({ statsSource: source }),
 }));