14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,20 @@ All notable changes to CLX will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/),
and this project adheres to [Semantic Versioning](https://semver.org/).

## [0.7.2] - 2026-05-02

### Fixed
- Auto-recall (`UserPromptSubmit` hook) silently produced no semantic
  context when embeddings were routed to Azure OpenAI. The recall path
  called `embed(query, None)`; Azure rejects `None` with
  `DeploymentNotFound` (only Ollama tolerated it via its own baked-in
  default). `RecallEngine` now accepts an explicit embedding model via
  `with_embedding_model(...)`, and both production callers (`clx-hook`
  auto-recall and the `clx-mcp` `clx_recall` tool) pass the configured
  `llm.embeddings.model`. The FTS5 fallback worked all along, but the
  headline semantic-recall feature was silently unavailable on
  Azure-routed embeddings until this fix.
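
  A self-contained model of the fix's shape (the type and method names mirror
  the diff in this PR, but the `embed` backend here is a stand-in assumption,
  not the real client):

  ```rust
  // Minimal model of the 0.7.2 fix: the engine stores an optional embedding
  // model, and callers opt in via a consuming builder method.
  struct RecallEngine {
      embedding_model: Option<String>,
  }

  impl RecallEngine {
      fn new() -> Self {
          // None preserves the pre-fix behavior for Ollama's baked-in default.
          Self { embedding_model: None }
      }

      fn with_embedding_model(mut self, model: impl Into<String>) -> Self {
          self.embedding_model = Some(model.into());
          self
      }

      // Stand-in for an Azure-style backend that rejects a missing model.
      fn embed(&self, _query: &str) -> Result<Vec<f32>, String> {
          match self.embedding_model.as_deref() {
              Some(_) => Ok(vec![0.0; 4]),
              None => Err("DeploymentNotFound".to_string()),
          }
      }
  }

  fn main() {
      // Pre-fix call pattern: no model configured, Azure-style backend errors.
      assert!(RecallEngine::new().embed("query").is_err());

      // Post-fix: the configured model is plumbed through the builder.
      let engine = RecallEngine::new().with_embedding_model("text-embedding-3-small");
      assert!(engine.embed("query").is_ok());
  }
  ```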

## [0.7.1] - 2026-05-02

### Fixed
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -8,7 +8,7 @@ members = [
]

[workspace.package]
version = "0.7.1"
version = "0.7.2"
edition = "2024"
license = "MPL-2.0"
authors = ["CLX Contributors"]
47 changes: 45 additions & 2 deletions crates/clx-core/src/recall.rs
@@ -69,6 +69,11 @@ pub struct RecallEngine<'a> {
/// active: `check_model_mismatch` returns the stored vs. configured pair
/// when they differ.
configured_model_ident: Option<String>,
/// The bare embedding model / deployment name to pass to the backend
/// when generating the query embedding. Required for backends that do
/// not have a baked-in default model (e.g., `AzureOpenAIBackend`). Optional because
/// Ollama tolerates `None` by falling back to its configured default.
embedding_model: Option<String>,
}

impl<'a> RecallEngine<'a> {
@@ -84,9 +89,20 @@ impl<'a> RecallEngine<'a> {
ollama,
embedding_store,
configured_model_ident: None,
embedding_model: None,
}
}

/// Attach the bare embedding model / deployment name. Required for
/// Azure-routed embeddings (the Azure backend errors with
/// `DeploymentNotFound` when called with `None`); Ollama tolerates a
/// missing model and falls back to its config default.
#[must_use]
pub fn with_embedding_model(mut self, model: impl Into<String>) -> Self {
self.embedding_model = Some(model.into());
self
}

/// Attach the configured embedding model identifier so that mismatch
/// detection works. The identifier should be `"<provider>:<model>"`.
#[must_use]
@@ -153,8 +169,9 @@ impl<'a> RecallEngine<'a> {
emb_store: &EmbeddingStore,
config: &RecallQueryConfig,
) -> Vec<RecallHit> {
// Generate embedding for the query
let embedding = match ollama.embed(query, None).await {
// Generate embedding for the query. Pass the configured embedding
// model so backends without a baked-in default (Azure) work.
let embedding = match ollama.embed(query, self.embedding_model.as_deref()).await {
Ok(emb) => emb,
Err(e) => {
warn!("Recall semantic embedding failed: {e}");
@@ -886,4 +903,30 @@ mod tests {
search_type,
}
}

/// Regression for the 0.7.1 bug: `auto_recall` passed `None` for the
/// embeddings model, which Azure rejects with `DeploymentNotFound`.
/// 0.7.2 plumbs the configured model through `with_embedding_model`.
/// This test asserts the builder stores the model so `try_semantic`
/// will pass it to the backend.
#[test]
fn embedding_model_builder_persists_value() {
let storage = Storage::open_in_memory().unwrap();
let engine =
RecallEngine::new(&storage, None, None).with_embedding_model("text-embedding-3-small");
assert_eq!(
engine.embedding_model.as_deref(),
Some("text-embedding-3-small")
);
}

#[test]
fn embedding_model_default_is_none_for_back_compat() {
let storage = Storage::open_in_memory().unwrap();
let engine = RecallEngine::new(&storage, None, None);
assert!(
engine.embedding_model.is_none(),
"default must be None so existing callers keep relying on Ollama's baked-in default"
);
}
}
5 changes: 4 additions & 1 deletion crates/clx-hook/src/hooks/subagent.rs
@@ -119,8 +119,11 @@ async fn do_recall(prompt: &str, config: &clx_core::config::Config) -> Option<St
include_key_facts: config.auto_recall.include_key_facts,
};

let engine =
let mut engine =
clx_core::recall::RecallEngine::new(&storage, ollama.as_ref(), embedding_store.as_ref());
if let Ok(route) = config.capability_route(clx_core::config::Capability::Embeddings) {
engine = engine.with_embedding_model(route.model.clone());
}
let hits = engine.query(prompt, &recall_config).await;

if hits.is_empty() {
3 changes: 2 additions & 1 deletion crates/clx-mcp/src/tools/recall.rs
@@ -30,7 +30,8 @@ impl McpServer {
&self.storage,
self.ollama_client.as_ref(),
self.embedding_store.as_ref(),
);
)
.with_embedding_model(self.embed_model.clone());

// MCP recall uses a more permissive threshold (0.25) than auto-recall (0.35)
// because it is user-invoked and benefits from broader results.
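
The threshold split described in that comment can be illustrated with a small sketch (the 0.25 / 0.35 values come from the comment; the `Hit` type and `filter_hits` helper are illustrative stand-ins, not the real `clx_core::recall` API):

```rust
// Illustrative only: keep hits whose similarity score clears a threshold.
struct Hit {
    score: f32,
}

fn filter_hits(hits: &[Hit], threshold: f32) -> usize {
    hits.iter().filter(|h| h.score >= threshold).count()
}

fn main() {
    let hits = [Hit { score: 0.30 }, Hit { score: 0.40 }];
    // User-invoked MCP recall (0.25) keeps both; auto-recall (0.35) keeps one.
    assert_eq!(filter_hits(&hits, 0.25), 2);
    assert_eq!(filter_hits(&hits, 0.35), 1);
}
```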