diff --git a/codi-rs/docs/PRODUCTION_READINESS_PLAN.md b/codi-rs/docs/PRODUCTION_READINESS_PLAN.md new file mode 100644 index 0000000..dd1891b --- /dev/null +++ b/codi-rs/docs/PRODUCTION_READINESS_PLAN.md @@ -0,0 +1,284 @@ +# Codi-RS Production Readiness Plan + +**Objective:** Bring codi/codi-rs from 88% to 100% production readiness +**Timeline:** 2-3 weeks +**Branch:** `feat/production-readiness-phase-1` +**Priority:** Fix critical panics first, then polish + +--- + +## Executive Summary + +This plan addresses all production readiness issues identified in the assessment: +- **Critical:** 100+ panic/unwrap/expect calls in production code +- **Medium:** 7 outstanding TODOs, minor warnings +- **Low:** Documentation, monitoring, performance validation + +--- + +## Phase 1: Critical Issues (Week 1) - ELIMINATE PRODUCTION PANICS + +### 1.1 Priority Files (Fix First) + +**File 1: `src/tui/app.rs:1683-1687`** +- **Issue:** Two `panic!` calls in production app logic +- **Fix:** Replace with proper error handling and user notification +- **Lines:** 1683, 1687 + +**File 2: `src/orchestrate/ipc/transport.rs:133-152`** +- **Issue:** 9 `.expect()` calls in IPC transport layer +- **Fix:** Convert to `Result` with descriptive errors +- **Impact:** IPC failures crash the entire application + +**File 3: `src/orchestrate/ipc/client.rs:347, 488`** +- **Issue:** `.unwrap()` and `.expect()` in message handling +- **Fix:** Graceful degradation on malformed messages +- **Impact:** Worker communication failures + +### 1.2 Implementation Strategy + +```rust +// BEFORE (BAD): +.expect("bind failed"); + +// AFTER (GOOD): +.map_err(|e| { + tracing::error!("IPC bind failed: {}", e); + IpcError::Transport(format!("Failed to bind: {}", e)) +})?; +``` + +**Pattern:** +1. Identify all panics/unwraps/expects (100 total) +2. Replace with proper error types +3. Add context with `tracing::error!` +4. Ensure graceful degradation +5. Test error paths + +### 1.3 Error Type Design + +Add comprehensive error types to affected modules: + +```rust +// src/orchestrate/ipc/error.rs +#[derive(Debug, thiserror::Error)] +pub enum IpcError { + #[error("Transport error: {0}")] + Transport(String), + #[error("Bind failed: {0}")] + BindFailed(String), + #[error("Accept failed: {0}")] + AcceptFailed(String), + #[error("Read failed: {0}")] + ReadFailed(String), + #[error("Write failed: {0}")] + WriteFailed(String), + #[error("Connection failed: {0}")] + ConnectFailed(String), + #[error("Handshake failed: {0}")] + HandshakeFailed(String), + #[error("Permission request failed: {0}")] + PermissionFailed(String), +} +``` + +### 1.4 Acceptance Criteria + +- [ ] Zero `panic!` calls in production code (tests OK) +- [ ] Zero `expect()` calls in production code +- [ ] Zero `unwrap()` calls in production code paths +- [ ] All errors properly propagated with context +- [ ] No behavioral regressions + +--- + +## Phase 2: Code Quality (Week 1-2) + +### 2.1 Clean Up Warnings + +**Issues:** +- Unused import `Stylize` in `src/tui/terminal_ui.rs:18` +- Unused function `load_session` in `src/main.rs:505` + +**Action:** Remove or implement + +### 2.2 Address TODOs by Priority + +**HIGH (Complete in Phase 2):** +1. `src/symbol_index/indexer.rs:561` - File cleanup for deleted/renamed files +2. `src/symbol_index/service.rs:206, 229` - Usage detection & dependency graph +3. `src/tui/syntax/highlighter.rs:49` - Tree-sitter-markdown compatibility + +**LOW (Defer to Phase 4):** +- `src/tui/app.rs:1355` - Worktree listing exposure +- `src/cli/models.rs:84` - Error collection +- `src/rag/embeddings/mod.rs:47` - Model map integration + +**Strategy:** Create GitHub issues for low-priority TODOs + +--- + +## Phase 3: Testing & Validation (Week 2) + +### 3.1 Error Path Tests + +**Missing Coverage:** +- IPC failure scenarios (bind, accept, read, write failures) +- Provider API failures (timeouts, auth errors) +- Tool execution errors (file not found, permissions) +- Cancellation mid-operation + +**Implementation:** +```rust +#[tokio::test] +async fn test_ipc_bind_failure() { + let result = server.bind("/invalid/path").await; + assert!(matches!(result, Err(IpcError::BindFailed(_)))); +} +``` + +**Target:** Error path coverage >80% + +### 3.2 Performance Benchmarking + +**Benchmarks:** +- Cold start time (target: < 2 seconds) +- Tool execution latency +- TUI responsiveness (target: < 16ms) +- Memory usage under load +- Context compaction performance + +**Implementation:** +- Use existing `criterion` benchmarks +- Add CI performance regression detection +- Document baseline metrics + +--- + +## Phase 4: Documentation & Polish (Week 2-3) + +### 4.1 Production Deployment Guide + +**Create:** `docs/DEPLOYMENT.md` +- Environment variables reference +- Configuration file examples +- Security best practices +- Performance tuning +- Monitoring setup + +### 4.2 Security Audit + +**Actions:** +- Review bash dangerous patterns +- Audit file path validation +- Check for directory traversal +- Verify tool auto-approval logic +- Document security model + +**Output:** `docs/SECURITY.md` + +--- + +## Phase 5: Monitoring & Observability (Week 3) + +### 5.1 Health Check + +**Command:** `codi --health` or `/health` in TUI +- Provider connectivity +- Tool availability +- System status + +### 5.2 Telemetry Enhancements + +**Metrics to Add:** +- Per-tool execution metrics (count, latency, errors) +- Error rate tracking by category +- Performance histograms +- Export formats (Prometheus, StatsD) + +--- + +## Risk Assessment + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Panic fixes introduce regressions | High | Comprehensive testing, gradual rollout, feature flags | +| IPC error handling changes behavior | Medium | Extensive testing, backward compatibility checks | +| Performance degradation | Medium | Benchmarks, performance budgets, A/B testing | +| Documentation outdated | Low | Regular reviews, user feedback loop | + +--- + +## Success Criteria + +### Phase 1 (Critical) ✅ +- [ ] Zero `panic!` in production code +- [ ] Zero `expect()` in production code +- [ ] Zero `unwrap()` in production code paths +- [ ] All IPC errors handled gracefully +- [ ] Comprehensive error types implemented + +### Phase 2 (Quality) ✅ +- [ ] Clean build with zero warnings +- [ ] High-priority TODOs resolved +- [ ] Remaining TODOs documented in GitHub issues + +### Phase 3 (Testing) ✅ +- [ ] Error path coverage >80% +- [ ] Performance benchmarks established +- [ ] Performance budgets defined + +### Phase 4 (Docs) ✅ +- [ ] Deployment guide complete +- [ ] Security audit passed +- [ ] Configuration reference complete + +### Phase 5 (Monitoring) ✅ +- [ ] Health check implemented +- [ ] Metrics collection comprehensive +- [ ] Production-ready telemetry + +--- + +## Implementation Log + +### Day 1: Setup & Priority Files +- [ ] Create error types for IPC layer +- [ ] Fix `src/orchestrate/ipc/transport.rs` +- [ ] Fix `src/orchestrate/ipc/client.rs` + +### Day 2: TUI & App Layer +- [ ] Fix `src/tui/app.rs` panics +- [ ] Fix remaining IPC unwraps +- [ ] Add error context with tracing + +### Day 3: Remaining Unwraps +- [ ] Audit all 100 unwrap/panic locations +- [ ] Fix remaining production code issues +- [ ] Verify tests still pass + +### Day 4: Testing & Validation +- [ ] Add error path tests +- [ ] Run full test suite +- [ ] Performance sanity check + +### Day 5: Documentation & Cleanup +- [ ] Update docs +- [ ] Create GitHub issues for TODOs +- [ ] Final review + +--- + +## Notes + +- **Rust Edition:** 2024 +- **MSRV:** 1.85 +- **Test Command:** `cargo test` +- **Lint Command:** `cargo clippy -- -D warnings` +- **Format Command:** `cargo fmt --check` + +--- + +**Last Updated:** 2026-02-07 +**Author:** Codi AI Assistant +**Branch:** feat/production-readiness-phase-1 diff --git a/codi-rs/src/orchestrate/commander.rs b/codi-rs/src/orchestrate/commander.rs index 0fb19ef..37ac00b 100644 --- a/codi-rs/src/orchestrate/commander.rs +++ b/codi-rs/src/orchestrate/commander.rs @@ -46,7 +46,7 @@ use tracing::{debug, error, info, warn}; use super::isolation::{detect_isolator, IsolationError, WorkspaceIsolator}; use super::ipc::{ - CommanderMessage, IpcServer, PermissionResult, WorkerMessage, + CommanderMessage, IpcError, IpcServer, PermissionResult, WorkerMessage, }; use super::types::{ CommanderConfig, WorkerConfig, WorkerResult, WorkerState, WorkerStatus, @@ -56,7 +56,7 @@ use super::types::{ #[derive(Debug, thiserror::Error)] pub enum CommanderError { #[error("IPC error: {0}")] - Ipc(#[from] super::ipc::server::IpcError), + Ipc(#[from] IpcError), #[error("Isolation error: {0}")] Isolation(#[from] IsolationError), @@ -194,7 +194,7 @@ impl Commander { let process = Command::new(&exe) .arg("--child-mode") .arg("--socket-path") - .arg(self.config.socket_path.to_str().unwrap()) + .arg(self.config.socket_path.as_os_str()) .arg("--child-id") .arg(&config.id) .arg("--child-task") diff --git a/codi-rs/src/orchestrate/griptree.rs b/codi-rs/src/orchestrate/griptree.rs index daad067..dc35e69 100644 --- a/codi-rs/src/orchestrate/griptree.rs +++ b/codi-rs/src/orchestrate/griptree.rs @@ -313,15 +313,16 @@ impl WorkspaceIsolator for GriptreeIsolator { } // Create worktree + let worktree_path_str = worktree_path.to_string_lossy().to_string(); let create_args = if self.branch_exists(&repo_path, branch).await { - vec!["worktree", "add", worktree_path.to_str().unwrap(), branch] + vec!["worktree", "add", &worktree_path_str, branch] } else { vec![ "worktree", "add", "-b", branch, - worktree_path.to_str().unwrap(), + &worktree_path_str, base_branch, ] }; @@ -400,6 +401,7 @@ impl WorkspaceIsolator for GriptreeIsolator { if let Some(repo_config) = manifest.repos.get(&repo.name) { let main_repo_path = self.main_workspace.join(&repo_config.path); + let worktree_path_str = repo.worktree_path.to_string_lossy().to_string(); let result = self .git( &main_repo_path, @@ -407,7 +409,7 @@ impl WorkspaceIsolator for GriptreeIsolator { "worktree", "remove", "--force", - repo.worktree_path.to_str().unwrap(), + &worktree_path_str, ], ) .await; @@ -546,6 +548,7 @@ impl GriptreeIsolator { if let Ok(manifest) = self.get_manifest().await { if let Some(repo_config) = manifest.repos.get(&repo.name) { let main_repo_path = self.main_workspace.join(&repo_config.path); + let worktree_path_str = repo.worktree_path.to_string_lossy().to_string(); let _ = self .git( &main_repo_path, @@ -553,7 +556,7 @@ impl GriptreeIsolator { "worktree", "remove", "--force", - repo.worktree_path.to_str().unwrap(), + &worktree_path_str, ], ) .await; diff --git a/codi-rs/src/orchestrate/ipc/client.rs b/codi-rs/src/orchestrate/ipc/client.rs index 0a8d776..c241892 100644 --- a/codi-rs/src/orchestrate/ipc/client.rs +++ b/codi-rs/src/orchestrate/ipc/client.rs @@ -56,6 +56,9 @@ pub enum IpcClientError { #[error("Cancelled")] Cancelled, + + #[error("Invalid message: {0}")] + InvalidMessage(String), } /// Handshake acknowledgment from commander. @@ -344,7 +347,12 @@ impl IpcClient { // Create permission request message let msg = WorkerMessage::permission_request(confirmation); - let request_id = msg.request_id().unwrap().to_string(); + let request_id = msg.request_id() + .ok_or_else(|| { + tracing::error!("Failed to get request_id from permission message"); + IpcClientError::InvalidMessage("Permission message missing request_id".to_string()) + })? + .to_string(); // Set up response channel let (tx, rx) = oneshot::channel(); diff --git a/codi-rs/src/orchestrate/ipc/error.rs b/codi-rs/src/orchestrate/ipc/error.rs new file mode 100644 index 0000000..465e81a --- /dev/null +++ b/codi-rs/src/orchestrate/ipc/error.rs @@ -0,0 +1,135 @@ +// Copyright 2026 Layne Penney +// SPDX-License-Identifier: AGPL-3.0-or-later + +//! IPC error types for multi-agent orchestration. + +use std::io; +use thiserror::Error; + +/// Errors that can occur in the IPC subsystem. +#[derive(Debug, Error)] +pub enum IpcError { + /// Failed to bind to the socket/pipe. + #[error("Failed to bind IPC endpoint: {0}")] + BindFailed(String), + + /// Failed to accept an incoming connection. + #[error("Failed to accept IPC connection: {0}")] + AcceptFailed(String), + + /// Failed to connect to the IPC endpoint. + #[error("Failed to connect to IPC endpoint: {0}")] + ConnectFailed(String), + + /// Failed to read from the IPC stream. + #[error("Failed to read from IPC stream: {0}")] + ReadFailed(String), + + /// Failed to write to the IPC stream. + #[error("Failed to write to IPC stream: {0}")] + WriteFailed(String), + + /// Failed to flush the IPC stream. + #[error("Failed to flush IPC stream: {0}")] + FlushFailed(String), + + /// Handshake with peer failed. + #[error("IPC handshake failed: {0}")] + HandshakeFailed(String), + + /// Permission request failed. + #[error("Permission request failed: {0}")] + PermissionFailed(String), + + /// Server failed to start. + #[error("IPC server failed to start: {0}")] + ServerStartFailed(String), + + /// Server task panicked or failed. + #[error("IPC server task failed: {0}")] + ServerTaskFailed(String), + + /// Failed to receive message. + #[error("Failed to receive IPC message: {0}")] + ReceiveFailed(String), + + /// Failed to send message. + #[error("Failed to send IPC message: {0}")] + SendFailed(String), + + /// Receiver already taken. + #[error("IPC receiver already taken")] + ReceiverAlreadyTaken, + + /// Invalid message received. + #[error("Invalid IPC message: {0}")] + InvalidMessage(String), + + /// Connection closed unexpectedly. + #[error("IPC connection closed unexpectedly")] + ConnectionClosed, + + /// Timeout waiting for response. + #[error("IPC operation timed out")] + Timeout, + + /// Platform-specific error. + #[error("Platform error: {0}")] + Platform(String), + + /// General transport error. + #[error("Transport error: {0}")] + Transport(String), + + /// Worker is not connected. + #[error("Worker not connected: {0}")] + WorkerNotConnected(String), + + /// Invalid handshake message received. + #[error("Invalid handshake")] + InvalidHandshake, + + /// Channel closed unexpectedly. + #[error("Channel closed")] + ChannelClosed, + + /// Server has not been started. + #[error("Server not started")] + NotStarted, + + /// Serialization error. + #[error("Serialization error: {0}")] + Serialization(String), +} + +impl IpcError { + /// Create an IPC error from an IO error with context. + pub fn from_io_error(context: &str, err: io::Error) -> Self { + IpcError::Transport(format!("{}: {}", context, err)) + } +} + +/// Result type for IPC operations. +pub type IpcResult = Result; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_ipc_error_display() { + let err = IpcError::BindFailed("permission denied".to_string()); + assert_eq!( + err.to_string(), + "Failed to bind IPC endpoint: permission denied" + ); + } + + #[test] + fn test_from_io_error() { + let io_err = io::Error::new(io::ErrorKind::NotFound, "file not found"); + let err = IpcError::from_io_error("opening socket", io_err); + assert!(err.to_string().contains("opening socket")); + assert!(err.to_string().contains("file not found")); + } +} diff --git a/codi-rs/src/orchestrate/ipc/mod.rs b/codi-rs/src/orchestrate/ipc/mod.rs index ae2caab..fdfd8f2 100644 --- a/codi-rs/src/orchestrate/ipc/mod.rs +++ b/codi-rs/src/orchestrate/ipc/mod.rs @@ -49,6 +49,9 @@ pub mod protocol; pub mod server; pub mod client; pub mod transport; +pub mod error; + +pub use error::{IpcError, IpcResult}; pub use protocol::{ WorkerMessage, CommanderMessage, PermissionResult, diff --git a/codi-rs/src/orchestrate/ipc/server.rs b/codi-rs/src/orchestrate/ipc/server.rs index 490f846..7ebd999 100644 --- a/codi-rs/src/orchestrate/ipc/server.rs +++ b/codi-rs/src/orchestrate/ipc/server.rs @@ -18,28 +18,7 @@ use super::protocol::{ decode, encode, CommanderMessage, WorkerMessage, }; use super::transport::{self, IpcListener, IpcStream}; - -/// Error type for IPC operations. -#[derive(Debug, thiserror::Error)] -pub enum IpcError { - #[error("IO error: {0}")] - Io(#[from] std::io::Error), - - #[error("Serialization error: {0}")] - Serialization(#[from] serde_json::Error), - - #[error("Worker not connected: {0}")] - WorkerNotConnected(String), - - #[error("Invalid handshake")] - InvalidHandshake, - - #[error("Channel closed")] - ChannelClosed, - - #[error("Server not started")] - NotStarted, -} +use super::error::IpcError; /// A connected worker client. struct ConnectedWorker { @@ -83,7 +62,9 @@ impl IpcServer { /// Start the server. pub async fn start(&mut self) -> Result<(), IpcError> { - let listener = transport::bind(&self.socket_path).await?; + let listener = transport::bind(&self.socket_path) + .await + .map_err(|e| IpcError::from_io_error("binding socket", e))?; info!("IPC server listening on {:?}", self.socket_path); self.listener = Some(listener); @@ -96,7 +77,8 @@ impl IpcServer { let mut workers = self.workers.write().await; workers.clear(); - transport::cleanup(&self.socket_path)?; + transport::cleanup(&self.socket_path) + .map_err(|e| IpcError::from_io_error("cleaning up socket", e))?; self.listener = None; info!("IPC server stopped"); @@ -117,7 +99,9 @@ impl IpcServer { pub async fn accept(&self) -> Result { let listener = self.listener.as_ref().ok_or(IpcError::NotStarted)?; - let stream = listener.accept().await?; + let stream = listener.accept() + .await + .map_err(|e| IpcError::from_io_error("accepting connection", e))?; debug!("New connection accepted"); let (read_half, write_half) = tokio::io::split(stream); @@ -125,9 +109,12 @@ impl IpcServer { // Read handshake message let mut line = String::new(); - reader.read_line(&mut line).await?; + reader.read_line(&mut line) + .await + .map_err(|e| IpcError::from_io_error("reading handshake", e))?; - let msg: WorkerMessage = decode(&line)?; + let msg: WorkerMessage = decode(&line) + .map_err(|e| IpcError::InvalidMessage(format!("handshake decode failed: {}", e)))?; if let WorkerMessage::Handshake { worker_id, .. } = &msg { let worker_id = worker_id.clone(); @@ -210,17 +197,23 @@ impl IpcServer { .get(worker_id) .ok_or_else(|| IpcError::WorkerNotConnected(worker_id.to_string()))?; - let encoded = encode(msg)?; + let encoded = encode(msg) + .map_err(|e| IpcError::InvalidMessage(format!("encode failed: {}", e)))?; let mut worker = worker.lock().await; - worker.writer.write_all(encoded.as_bytes()).await?; - worker.writer.flush().await?; + worker.writer.write_all(encoded.as_bytes()) + .await + .map_err(|e| IpcError::from_io_error("sending message", e))?; + worker.writer.flush() + .await + .map_err(|e| IpcError::from_io_error("flushing writer", e))?; Ok(()) } /// Broadcast a message to all connected workers. pub async fn broadcast(&self, msg: &CommanderMessage) -> Result<(), IpcError> { - let encoded = encode(msg)?; + let encoded = encode(msg) + .map_err(|e| IpcError::InvalidMessage(format!("encode failed: {}", e)))?; let workers = self.workers.read().await; for (worker_id, worker) in workers.iter() { diff --git a/codi-rs/src/orchestrate/worktree.rs b/codi-rs/src/orchestrate/worktree.rs index fc8d7d3..ea875b2 100644 --- a/codi-rs/src/orchestrate/worktree.rs +++ b/codi-rs/src/orchestrate/worktree.rs @@ -218,12 +218,15 @@ impl WorkspaceIsolator for GitWorktreeIsolator { // git worktree add -b info!("Creating worktree for {} at {:?}", branch, worktree_path); + // Convert path to string for git command + let worktree_path_str = worktree_path.to_string_lossy().to_string(); + let result = if self.branch_exists(branch).await { // Branch exists, just add worktree self.git(&[ "worktree", "add", - worktree_path.to_str().unwrap(), + &worktree_path_str, branch, ]) .await @@ -234,7 +237,7 @@ impl WorkspaceIsolator for GitWorktreeIsolator { "add", "-b", branch, - worktree_path.to_str().unwrap(), + &worktree_path_str, base_branch, ]) .await @@ -269,8 +272,9 @@ impl WorkspaceIsolator for GitWorktreeIsolator { info!("Removing worktree for {} at {:?}", branch, path); // Remove worktree + let path_str = path.to_string_lossy().to_string(); let result = self - .git(&["worktree", "remove", "--force", path.to_str().unwrap()]) + .git(&["worktree", "remove", "--force", &path_str]) .await; if let Err(e) = result { diff --git a/codi-rs/src/tui/terminal_ui.rs b/codi-rs/src/tui/terminal_ui.rs index 1236190..0885609 100644 --- a/codi-rs/src/tui/terminal_ui.rs +++ b/codi-rs/src/tui/terminal_ui.rs @@ -15,7 +15,7 @@ use std::sync::Arc; use std::time::Instant; use crossterm::{ - style::{Color, Print, ResetColor, SetForegroundColor, Stylize}, + style::{Color, Print, ResetColor, SetForegroundColor}, ExecutableCommand, };