From 30fc379d71c49b9bf6307934c2c5c206da407e83 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Wed, 1 Jul 2026 00:08:10 -0700
Subject: [PATCH 1/3] fix(reasoning): don't persist request-scoped
 reasoning_effort into model config

When a model sets `reasoning_effort: none` (or any default) in its YAML
without an explicit `reasoning.disable`, ApplyReasoningEffort resolves that
default at request time and sets ReasoningConfig.DisableReasoning on the
request-scoped config copy. The post-load thinking/marker probe then wrote
that request-scoped value back into the loader's persistent config via
UpdateModelConfig, making it look as though the operator had explicitly set
reasoning.disable=true. From then on, per-request `reasoning_effort` overrides
were silently ignored (an explicit operator disable wins over a request
asking to think).

DetectThinkingSupportFromBackend only fills reasoning slots that are still
nil, so a slot that is already set before the probe runs came from
ApplyReasoningEffort (or from the operator's own config), not from the probe.
Snapshot which slots were nil before the probe and only persist those, so the
probe's genuine backend detection is still saved while request-time reasoning
effort never leaks into the persistent config.

Fixes #10622

Signed-off-by: Tai An <antai12232931@outlook.com>
---
 core/backend/llm.go | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/core/backend/llm.go b/core/backend/llm.go
index 053e984e8a77..0d46c3889cdc 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -110,11 +110,25 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 	needsMarkerProbe := c.MediaMarker == ""
 	if shouldProbeThinking || needsMarkerProbe {
 		modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
+		// DetectThinkingSupportFromBackend only fills reasoning slots that are
+		// still nil, so a slot that already carries a value here was populated by
+		// request-time ApplyReasoningEffort (e.g. a `reasoning_effort: none`
+		// default), not by backend detection. Persisting such a request-scoped
+		// value would masquerade as an operator's explicit reasoning.disable and
+		// permanently defeat future per-request reasoning_effort overrides
+		// (see #10622). Only persist the slots the probe is actually allowed to
+		// fill.
+		persistDisableReasoning := c.ReasoningConfig.DisableReasoning == nil
+		persistDisableTagPrefill := c.ReasoningConfig.DisableReasoningTagPrefill == nil
 		config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
 		// Update the config in the loader so it persists for future requests
 		cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
-			cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
-			cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
+			if persistDisableReasoning {
+				cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
+			}
+			if persistDisableTagPrefill {
+				cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
+			}
 			if c.MediaMarker != "" {
 				cfg.MediaMarker = c.MediaMarker
 			}

From fabaf3ecb41c5e93f18a7d24445ceae98f42c459 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Thu, 2 Jul 2026 09:11:56 +0000
Subject: [PATCH 2/3] test(reasoning): cover persist-guard added in this PR,
 extract for testability

ModelInference's post-probe persistence of ReasoningConfig.DisableReasoning /
DisableReasoningTagPrefill had no test: the guard logic lived inline in a
closure only reachable through a live gRPC backend. Extract it into
persistProbedReasoning (pure refactor, no behavior change) so it can be
exercised directly against a ModelConfigLoader, then add specs covering:

- a probe-filled slot (nil beforehand) gets persisted
- a slot that already carried a request-scoped value (e.g. from
  reasoning_effort: none) is left alone, i.e. the #10622 regression stays
  fixed
- an operator's explicit persisted disable is preserved when the guard is
  false
- the media marker still persists unconditionally

Verified red/green: reverting persistProbedReasoning to the old unconditional
copy fails exactly the two guard specs.

Assisted-by: Claude:claude-sonnet-5 go vet
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/backend/llm.go            | 34 ++++++++-----
 core/backend/llm_probe_test.go | 89 ++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 11 deletions(-)

diff --git a/core/backend/llm.go b/core/backend/llm.go
index 0d46c3889cdc..39fce5c9d977 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -46,6 +46,28 @@ func needsThinkingProbe(c *config.ModelConfig) bool {
 			c.ReasoningConfig.DisableReasoningTagPrefill == nil)
 }
 
+// persistProbedReasoning writes the post-probe reasoning slots (and media
+// marker) from probed back into the loader's persisted config for modelName,
+// skipping any reasoning slot the probe was not actually allowed to fill.
+// persistDisableReasoning/persistDisableTagPrefill must be snapshotted from
+// probed's reasoning slots *before* the probe ran: a slot that already
+// carried a value at that point was populated by request-time
+// ApplyReasoningEffort, not by backend detection, and persisting it would
+// masquerade as an operator's explicit reasoning.disable (see #10622).
+func persistProbedReasoning(cl *config.ModelConfigLoader, modelName string, probed *config.ModelConfig, persistDisableReasoning, persistDisableTagPrefill bool) {
+	cl.UpdateModelConfig(modelName, func(cfg *config.ModelConfig) {
+		if persistDisableReasoning {
+			cfg.ReasoningConfig.DisableReasoning = probed.ReasoningConfig.DisableReasoning
+		}
+		if persistDisableTagPrefill {
+			cfg.ReasoningConfig.DisableReasoningTagPrefill = probed.ReasoningConfig.DisableReasoningTagPrefill
+		}
+		if probed.MediaMarker != "" {
+			cfg.MediaMarker = probed.MediaMarker
+		}
+	})
+}
+
 // HasChatDeltaContent returns true if any chat delta carries content or reasoning text.
 // Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction.
 func (t TokenUsage) HasChatDeltaContent() bool {
@@ -122,17 +144,7 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 		persistDisableTagPrefill := c.ReasoningConfig.DisableReasoningTagPrefill == nil
 		config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
 		// Update the config in the loader so it persists for future requests
-		cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
-			if persistDisableReasoning {
-				cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
-			}
-			if persistDisableTagPrefill {
-				cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
-			}
-			if c.MediaMarker != "" {
-				cfg.MediaMarker = c.MediaMarker
-			}
-		})
+		persistProbedReasoning(cl, c.Name, c, persistDisableReasoning, persistDisableTagPrefill)
 	}
 
 	var protoMessages []*proto.Message
diff --git a/core/backend/llm_probe_test.go b/core/backend/llm_probe_test.go
index 73ed9f967b85..0ae8e3718a86 100644
--- a/core/backend/llm_probe_test.go
+++ b/core/backend/llm_probe_test.go
@@ -1,6 +1,8 @@
 package backend
 
 import (
+	"os"
+
 	"github.com/mudler/LocalAI/core/config"
 
 	"github.com/gpustack/gguf-parser-go/util/ptr"
@@ -27,3 +29,90 @@ var _ = Describe("thinking probe gating", func() {
 		Expect(needsThinkingProbe(cfg)).To(BeFalse())
 	})
 })
+
+var _ = Describe("persistProbedReasoning", func() {
+	const modelName = "probe-test"
+
+	// newLoaderWithConfig seeds a ModelConfigLoader with a single model config
+	// parsed from yamlBody, mirroring how the loader is populated from disk.
+	newLoaderWithConfig := func(yamlBody string) *config.ModelConfigLoader {
+		tmp, err := os.CreateTemp("", "persist-probed-reasoning-*.yaml")
+		Expect(err).ToNot(HaveOccurred())
+		defer os.Remove(tmp.Name())
+
+		_, err = tmp.WriteString(yamlBody)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(tmp.Close()).To(Succeed())
+
+		cl := config.NewModelConfigLoader("")
+		Expect(cl.ReadModelConfig(tmp.Name())).To(Succeed())
+		return cl
+	}
+
+	It("persists a reasoning slot the probe was allowed to fill (was nil beforehand)", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		probed.ReasoningConfig.DisableReasoning = ptr.To(false) // backend detected: supports thinking
+		probed.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
+
+		persistProbedReasoning(cl, modelName, probed, true, true)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeFalse())
+		Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue())
+	})
+
+	It("does not persist a slot that already carried a request-scoped value before the probe ran", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		// Simulates ApplyReasoningEffort("none") having set this on the
+		// request-scoped copy before the probe ran - not a genuine backend
+		// detection, so it must never reach the persisted config (#10622).
+		probed.ReasoningConfig.DisableReasoning = ptr.To(true)
+
+		persistProbedReasoning(cl, modelName, probed, false, false)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.ReasoningConfig.DisableReasoning).To(BeNil())
+		Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeNil())
+	})
+
+	It("preserves an operator's explicit persisted disable when the guard is false", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\nreasoning:\n  disable: true\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		// Even if the request-scoped copy ends up holding a different value,
+		// persistDisableReasoning=false must keep the operator's own setting.
+		probed.ReasoningConfig.DisableReasoning = ptr.To(false)
+
+		persistProbedReasoning(cl, modelName, probed, false, false)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("persists the media marker regardless of the reasoning guards", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		probed.MediaMarker = "<__media__>"
+
+		persistProbedReasoning(cl, modelName, probed, false, false)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.MediaMarker).To(Equal("<__media__>"))
+	})
+})

From 30b2fe95aea1bf1ab96c0b9fefdd189bcaf2353e Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Thu, 2 Jul 2026 10:09:45 +0000
Subject: [PATCH 3/3] test(reasoning): ignore os.Remove error in temp file
 cleanup (errcheck)

Signed-off-by: Tai An <antai12232931@outlook.com>
---
 core/backend/llm_probe_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/backend/llm_probe_test.go b/core/backend/llm_probe_test.go
index 0ae8e3718a86..29b68f5f52a0 100644
--- a/core/backend/llm_probe_test.go
+++ b/core/backend/llm_probe_test.go
@@ -38,7 +38,7 @@ var _ = Describe("persistProbedReasoning", func() {
 	newLoaderWithConfig := func(yamlBody string) *config.ModelConfigLoader {
 		tmp, err := os.CreateTemp("", "persist-probed-reasoning-*.yaml")
 		Expect(err).ToNot(HaveOccurred())
-		defer os.Remove(tmp.Name())
+		defer func() { _ = os.Remove(tmp.Name()) }()
 
 		_, err = tmp.WriteString(yamlBody)
 		Expect(err).ToNot(HaveOccurred())