From 30fc379d71c49b9bf6307934c2c5c206da407e83 Mon Sep 17 00:00:00 2001 From: Tai An Date: Wed, 1 Jul 2026 00:08:10 -0700 Subject: [PATCH 1/3] fix(reasoning): don't persist request-scoped reasoning_effort into model config When a model sets `reasoning_effort: none` (or any default) in its YAML without an explicit `reasoning.disable`, ApplyReasoningEffort resolves that default at request time and sets ReasoningConfig.DisableReasoning on the request-scoped config copy. The post-load thinking/marker probe then wrote that request-scoped value back into the loader's persistent config via UpdateModelConfig, making it look as though the operator had explicitly set reasoning.disable=true. From then on, per-request `reasoning_effort` overrides were silently ignored (an explicit operator disable wins over a request asking to think). DetectThinkingSupportFromBackend only fills reasoning slots that are still nil, so a slot already set here came from ApplyReasoningEffort, not the probe. Snapshot which slots were nil before the probe and only persist those, so the probe's genuine backend detection is still saved while request-time reasoning effort never leaks into the persistent config. Fixes #10622 Signed-off-by: Tai An --- core/backend/llm.go | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index 053e984e8a77..0d46c3889cdc 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -110,11 +110,25 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima needsMarkerProbe := c.MediaMarker == "" if shouldProbeThinking || needsMarkerProbe { modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath) + // DetectThinkingSupportFromBackend only fills reasoning slots that are + // still nil, so a slot that already carries a value here was populated by + // request-time ApplyReasoningEffort (e.g. a `reasoning_effort: none` + // default), not by backend detection. Persisting such a request-scoped + // value would masquerade as an operator's explicit reasoning.disable and + // permanently defeat future per-request reasoning_effort overrides + // (see #10622). Only persist the slots the probe is actually allowed to + // fill. + persistDisableReasoning := c.ReasoningConfig.DisableReasoning == nil + persistDisableTagPrefill := c.ReasoningConfig.DisableReasoningTagPrefill == nil config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts) // Update the config in the loader so it persists for future requests cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) { - cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning - cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill + if persistDisableReasoning { + cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning + } + if persistDisableTagPrefill { + cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill + } if c.MediaMarker != "" { cfg.MediaMarker = c.MediaMarker } From fabaf3ecb41c5e93f18a7d24445ceae98f42c459 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 2 Jul 2026 09:11:56 +0000 Subject: [PATCH 2/3] test(reasoning): cover persist-guard added in this PR, extract for testability ModelInference's post-probe persistence of ReasoningConfig.DisableReasoning / DisableReasoningTagPrefill had no test: the guard logic lived inline in a closure only reachable through a live gRPC backend. Extract it into persistProbedReasoning (pure refactor, no behavior change) so it can be exercised directly against a ModelConfigLoader, then add specs covering: - a probe-filled slot (nil beforehand) gets persisted - a slot that already carried a request-scoped value (e.g. from reasoning_effort: none) is left alone, i.e. the #10622 regression stays fixed - an operator's explicit persisted disable is preserved when the guard is false - the media marker still persists unconditionally Verified red/green: reverting persistProbedReasoning to the old unconditional copy fails exactly the two guard specs. Assisted-by: Claude:claude-sonnet-5 go vet Signed-off-by: Ettore Di Giacinto --- core/backend/llm.go | 34 ++++++++----- core/backend/llm_probe_test.go | 89 ++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 11 deletions(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index 0d46c3889cdc..39fce5c9d977 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -46,6 +46,28 @@ func needsThinkingProbe(c *config.ModelConfig) bool { c.ReasoningConfig.DisableReasoningTagPrefill == nil) } +// persistProbedReasoning writes the post-probe reasoning slots (and media +// marker) from probed back into the loader's persisted config for modelName, +// skipping any reasoning slot the probe was not actually allowed to fill. +// persistDisableReasoning/persistDisableTagPrefill must be snapshotted from +// probed's reasoning slots *before* the probe ran: a slot that already +// carried a value at that point was populated by request-time +// ApplyReasoningEffort, not by backend detection, and persisting it would +// masquerade as an operator's explicit reasoning.disable (see #10622). +func persistProbedReasoning(cl *config.ModelConfigLoader, modelName string, probed *config.ModelConfig, persistDisableReasoning, persistDisableTagPrefill bool) { + cl.UpdateModelConfig(modelName, func(cfg *config.ModelConfig) { + if persistDisableReasoning { + cfg.ReasoningConfig.DisableReasoning = probed.ReasoningConfig.DisableReasoning + } + if persistDisableTagPrefill { + cfg.ReasoningConfig.DisableReasoningTagPrefill = probed.ReasoningConfig.DisableReasoningTagPrefill + } + if probed.MediaMarker != "" { + cfg.MediaMarker = probed.MediaMarker + } + }) +} + // HasChatDeltaContent returns true if any chat delta carries content or reasoning text. // Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction. func (t TokenUsage) HasChatDeltaContent() bool { @@ -122,17 +144,7 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima persistDisableTagPrefill := c.ReasoningConfig.DisableReasoningTagPrefill == nil config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts) // Update the config in the loader so it persists for future requests - cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) { - if persistDisableReasoning { - cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning - } - if persistDisableTagPrefill { - cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill - } - if c.MediaMarker != "" { - cfg.MediaMarker = c.MediaMarker - } - }) + persistProbedReasoning(cl, c.Name, c, persistDisableReasoning, persistDisableTagPrefill) } var protoMessages []*proto.Message diff --git a/core/backend/llm_probe_test.go b/core/backend/llm_probe_test.go index 73ed9f967b85..0ae8e3718a86 100644 --- a/core/backend/llm_probe_test.go +++ b/core/backend/llm_probe_test.go @@ -1,6 +1,8 @@ package backend import ( + "os" + "github.com/mudler/LocalAI/core/config" "github.com/gpustack/gguf-parser-go/util/ptr" @@ -27,3 +29,90 @@ var _ = Describe("thinking probe gating", func() { Expect(needsThinkingProbe(cfg)).To(BeFalse()) }) }) + +var _ = Describe("persistProbedReasoning", func() { + const modelName = "probe-test" + + // newLoaderWithConfig seeds a ModelConfigLoader with a single model config + // parsed from yamlBody, mirroring how the loader is populated from disk. + newLoaderWithConfig := func(yamlBody string) *config.ModelConfigLoader { + tmp, err := os.CreateTemp("", "persist-probed-reasoning-*.yaml") + Expect(err).ToNot(HaveOccurred()) + defer os.Remove(tmp.Name()) + + _, err = tmp.WriteString(yamlBody) + Expect(err).ToNot(HaveOccurred()) + Expect(tmp.Close()).To(Succeed()) + + cl := config.NewModelConfigLoader("") + Expect(cl.ReadModelConfig(tmp.Name())).To(Succeed()) + return cl + } + + It("persists a reasoning slot the probe was allowed to fill (was nil beforehand)", func() { + cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n") + + probed := &config.ModelConfig{} + probed.Name = modelName + probed.ReasoningConfig.DisableReasoning = ptr.To(false) // backend detected: supports thinking + probed.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true) + + persistProbedReasoning(cl, modelName, probed, true, true) + + cfg, ok := cl.GetModelConfig(modelName) + Expect(ok).To(BeTrue()) + Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeFalse()) + Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue()) + }) + + It("does not persist a slot that already carried a request-scoped value before the probe ran", func() { + cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n") + + probed := &config.ModelConfig{} + probed.Name = modelName + // Simulates ApplyReasoningEffort("none") having set this on the + // request-scoped copy before the probe ran - not a genuine backend + // detection, so it must never reach the persisted config (#10622). + probed.ReasoningConfig.DisableReasoning = ptr.To(true) + + persistProbedReasoning(cl, modelName, probed, false, false) + + cfg, ok := cl.GetModelConfig(modelName) + Expect(ok).To(BeTrue()) + Expect(cfg.ReasoningConfig.DisableReasoning).To(BeNil()) + Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeNil()) + }) + + It("preserves an operator's explicit persisted disable when the guard is false", func() { + cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\nreasoning:\n disable: true\n") + + probed := &config.ModelConfig{} + probed.Name = modelName + // Even if the request-scoped copy ends up holding a different value, + // persistDisableReasoning=false must keep the operator's own setting. + probed.ReasoningConfig.DisableReasoning = ptr.To(false) + + persistProbedReasoning(cl, modelName, probed, false, false) + + cfg, ok := cl.GetModelConfig(modelName) + Expect(ok).To(BeTrue()) + Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue()) + }) + + It("persists the media marker regardless of the reasoning guards", func() { + cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n") + + probed := &config.ModelConfig{} + probed.Name = modelName + probed.MediaMarker = "<__media__>" + + persistProbedReasoning(cl, modelName, probed, false, false) + + cfg, ok := cl.GetModelConfig(modelName) + Expect(ok).To(BeTrue()) + Expect(cfg.MediaMarker).To(Equal("<__media__>")) + }) +}) From 30b2fe95aea1bf1ab96c0b9fefdd189bcaf2353e Mon Sep 17 00:00:00 2001 From: Tai An Date: Thu, 2 Jul 2026 10:09:45 +0000 Subject: [PATCH 3/3] test(reasoning): ignore os.Remove error in temp file cleanup (errcheck) Signed-off-by: Tai An --- core/backend/llm_probe_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/backend/llm_probe_test.go b/core/backend/llm_probe_test.go index 0ae8e3718a86..29b68f5f52a0 100644 --- a/core/backend/llm_probe_test.go +++ b/core/backend/llm_probe_test.go @@ -38,7 +38,7 @@ var _ = Describe("persistProbedReasoning", func() { newLoaderWithConfig := func(yamlBody string) *config.ModelConfigLoader { tmp, err := os.CreateTemp("", "persist-probed-reasoning-*.yaml") Expect(err).ToNot(HaveOccurred()) - defer os.Remove(tmp.Name()) + defer func() { _ = os.Remove(tmp.Name()) }() _, err = tmp.WriteString(yamlBody) Expect(err).ToNot(HaveOccurred())