diff --git a/roofit/roofitcore/test/TestStatistics/testLikelihoodGradientJob.cxx b/roofit/roofitcore/test/TestStatistics/testLikelihoodGradientJob.cxx index 296ec5f7d6d10..5335a8f322ff8 100644 --- a/roofit/roofitcore/test/TestStatistics/testLikelihoodGradientJob.cxx +++ b/roofit/roofitcore/test/TestStatistics/testLikelihoodGradientJob.cxx @@ -63,6 +63,7 @@ ValAndError getValAndError(RooArgSet const &parsFinal, const char *name) return {var.getVal(), var.getError()}; }; +#ifdef ROOFIT_LEGACY_EVAL_BACKEND std::vector<double> getParamVals(RooAbsMinimizerFcn &fcn) { std::vector<double> values(fcn.getNDim()); @@ -73,6 +74,7 @@ std::vector<double> getParamVals(RooAbsMinimizerFcn &fcn) return values; } +#endif std::unique_ptr<RooFitResult> runMinimizer(RooAbsReal &nll, bool offsetting) { @@ -235,6 +237,7 @@ TEST(LikelihoodGradientJob, RepeatMigrad) m1.minimize("Minuit2", "migrad"); } +#ifdef ROOFIT_LEGACY_EVAL_BACKEND TEST_P(LikelihoodGradientJobTest, GaussianND) { // do a minimization, but now using GradMinimizer and its MP version @@ -315,6 +318,7 @@ TEST_P(LikelihoodGradientJobTest, GaussianND) EXPECT_EQ(std0[ix], std1[ix]); } } +#endif INSTANTIATE_TEST_SUITE_P(NworkersSeed, LikelihoodGradientJobTest, ::testing::Combine(::testing::Values(1, 2, 3), // number of workers @@ -579,6 +583,7 @@ TEST_P(LikelihoodGradientJobTest, Gaussian1DAlsoWithLikelihoodJob) } #undef EXPECT_NEAR_REL +#ifdef ROOFIT_LEGACY_EVAL_BACKEND class LikelihoodGradientJobErrorTest : public ::testing::TestWithParam> { void SetUp() override @@ -967,3 +972,4 @@ TEST(MinuitFcnGrad, DISABLED_CompareToRooMinimizerFcn) RFMP::Config::LikelihoodJob::defaultNEventTasks = RFMP::Config::LikelihoodJob::automaticNEventTasks; RFMP::Config::LikelihoodJob::defaultNComponentTasks = RFMP::Config::LikelihoodJob::automaticNComponentTasks; } +#endif diff --git a/roofit/roofitcore/test/testRooAbsPdf.cxx b/roofit/roofitcore/test/testRooAbsPdf.cxx index 6edec25cbcf5c..d217dbb23bf9d 100644 --- a/roofit/roofitcore/test/testRooAbsPdf.cxx +++ 
b/roofit/roofitcore/test/testRooAbsPdf.cxx @@ -96,9 +96,6 @@ TEST_P(FitTest, AsymptoticallyCorrectErrors) // evaluated in batch mode and data size is greater than one, the batch mode // will inform that a batched evaluation function is missing. // -// This test is disabled if the legacy backend is not available, because then -// we don't have any reference to compare to. -#ifdef ROOFIT_LEGACY_EVAL_BACKEND TEST(RooAbsPdf, ConditionalFitBatchMode) { using namespace RooFit; @@ -132,41 +129,67 @@ TEST(RooAbsPdf, ConditionalFitBatchMode) auto data = makeFakeDataXY(); + // The model x range is wider than the support of the data so that the + // Poisson normalisation integral over x is unity to high precision for any + // mean value encountered. With that simplification, the conditional MLE for + // `factor` has a closed form (see below) and the legacy evaluation backend + // is no longer needed as a reference. RooWorkspace ws; ws.factory("Product::mean1({factor[1.0, 0.0, 10.0], y[1.0, 5]})"); ws.factory("Product::mean2({factor})"); - ws.factory("Poisson::model1(x[0, 10], mean1)"); + ws.factory("Poisson::model1(x[0, 30], mean1)"); ws.factory("Poisson::model2(x, mean2)"); RooRealVar &factor = *ws.var("factor"); RooRealVar &y = *ws.var("y"); - std::vector expectFastEvaluationsWarnings{true, false}; + double sumX = 0.0; + double sumY = 0.0; + for (int i = 0; i < data->numEntries(); ++i) { + const RooArgSet *row = data->get(i); + sumX += row->getRealValue("x"); + sumY += row->getRealValue("y"); + } + const double nEntries = data->numEntries(); + + // For each event, the conditional log-likelihood term is + // log Poisson(x_i; factor * y_i) = x_i log(factor*y_i) - factor*y_i + const + // (the Poisson normalisation integral over x is unity by construction of + // the wide x range). 
Setting d(NLL)/d(factor) = 0 gives: + // model1: factor_MLE = sum_i x_i / sum_i y_i (mean depends on y) + // model2: factor_MLE = sum_i x_i / N (mean is just factor) + // and the standard error from the inverse Hessian is in both cases + // sigma(factor) = sqrt(sum_i x_i) / (sum_i y_i or N). + const std::vector<double> expectedFactor{sumX / sumY, sumX / nEntries}; + const std::vector<double> expectedFactorErr{std::sqrt(sumX) / sumY, std::sqrt(sumX) / nEntries}; + const std::vector<bool> expectFastEvaluationsWarnings{true, false}; int iMean = 0; for (RooAbsPdf *model : {ws.pdf("model1"), ws.pdf("model2")}) { - std::vector<std::unique_ptr<RooFitResult>> fitResults; RooHelpers::HijackMessageStream hijack(RooFit::INFO, RooFit::FastEvaluations); - for (auto evalBackend : {EvalBackend::Legacy(), EvalBackend::Cpu()}) { - factor.setVal(1.0); - factor.setError(0.0); - fitResults.emplace_back(model->fitTo(*data, ConditionalObservables(y), Save(), PrintLevel(-1), evalBackend)); - if (verbose) { - fitResults.back()->Print(); - } + factor.setVal(1.0); + factor.setError(0.0); + auto fitResult = std::unique_ptr<RooFitResult>{ + model->fitTo(*data, ConditionalObservables(y), Save(), PrintLevel(-1), EvalBackend::Cpu())}; + if (verbose) { + fitResult->Print(); } - EXPECT_TRUE(fitResults[1]->isIdentical(*fitResults[0])); + auto *factorFinal = static_cast<RooRealVar *>(fitResult->floatParsFinal().find("factor")); + ASSERT_NE(factorFinal, nullptr); + EXPECT_NEAR(factorFinal->getVal(), expectedFactor[iMean], 1e-4 * expectedFactor[iMean]) + << "value mismatch for " << model->GetName(); + EXPECT_NEAR(factorFinal->getError(), expectedFactorErr[iMean], 1e-3 * expectedFactorErr[iMean]) + << "error mismatch for " << model->GetName(); + EXPECT_EQ(hijack.str().find("does not implement the faster batch") != std::string::npos, expectFastEvaluationsWarnings[iMean]) << "Stream contents: " << hijack.str(); ++iMean; } } -#endif // ROOT-9530: RooFit side-band fit inconsistent with fit to full range TEST_P(FitTest, MultiRangeFit) diff --git 
a/roofit/roofitcore/test/testRooSimultaneous.cxx b/roofit/roofitcore/test/testRooSimultaneous.cxx index 548d695dc3ff3..a9b2f34ccb880 100644 --- a/roofit/roofitcore/test/testRooSimultaneous.cxx +++ b/roofit/roofitcore/test/testRooSimultaneous.cxx @@ -25,6 +25,7 @@ #include "gtest_wrapper.h" +#include #include /// Forum issue @@ -519,6 +520,12 @@ TEST_P(TestStatisticTest, RooSimultaneousSingleChannelCrossCheckWithCondVar) /// GitHub issue #18718. /// Make sure that we can do a ranged fit on an extended RooAddPdf in a /// RooSimultaneous with the new CPU backend. +/// +/// The reference value is computed analytically: each channel has a single +/// fixed-shape exponential, so the extended-MLE for the yield reduces to +/// `N_obs_in_range * I_full / I_range`, where the integrals run over the full +/// observable range and the fit range respectively. This avoids depending on +/// the legacy evaluation backend, which is not always built. TEST(RooSimultaneous, RangedExtendedRooAddPdf) { RooHelpers::LocalChangeMsgLevel changeMsgLevel{RooFit::WARNING}; @@ -526,16 +533,23 @@ TEST(RooSimultaneous, RangedExtendedRooAddPdf) const double nBkgA_nom = 9000; const double nBkgB_nom = 10000; - RooRealVar x("x", "Observable", 100, 150); - x.setRange("fitRange", 100, 130); + const double cA = -0.06; + const double cB = -0.09; + + const double xMin = 100; + const double xMax = 150; + const double xFitMax = 130; + + RooRealVar x("x", "Observable", xMin, xMax); + x.setRange("fitRange", xMin, xFitMax); RooRealVar nBkgA("nBkgA", "", nBkgA_nom, 0.8 * nBkgA_nom, 1.2 * nBkgA_nom); RooRealVar nBkgB("nBkgB", "", nBkgB_nom, 0.8 * nBkgB_nom, 1.2 * nBkgB_nom); - RooExponential expA("expA", "", x, RooFit::RooConst(-0.06)); + RooExponential expA("expA", "", x, RooFit::RooConst(cA)); RooAddPdf modelA("modelA", "", {expA}, {nBkgA}); - RooExponential expB("expB", "", x, RooFit::RooConst(-0.09)); + RooExponential expB("expB", "", x, RooFit::RooConst(cB)); RooAddPdf modelB("modelB", "", {expB}, {nBkgB}); 
RooCategory runCat("runCat", "", {{"RunA", 0}, {"RunB", 1}}); @@ -546,21 +560,26 @@ TEST(RooSimultaneous, RangedExtendedRooAddPdf) std::unique_ptr combData{simPdf.generate(RooArgSet(x, runCat), Extended())}; - using Res = std::unique_ptr; + std::unique_ptr fitResult{ + simPdf.fitTo(*combData, Save(), Range("fitRange"), EvalBackend(EvalBackend::Cpu()), PrintLevel(-1))}; - RooArgSet params; - RooArgSet paramsSnap; - simPdf.getParameters(combData->get(), params); - params.snapshot(paramsSnap); + ASSERT_NE(fitResult, nullptr); + EXPECT_EQ(fitResult->status(), 0); + + auto integ = [](double c, double a, double b) { return (std::exp(c * b) - std::exp(c * a)) / c; }; - Res fitResult{simPdf.fitTo(*combData, Save(), Range("fitRange"), EvalBackend(EvalBackend::Cpu()), PrintLevel(-1))}; + const double ratioA = integ(cA, xMin, xMax) / integ(cA, xMin, xFitMax); + const double ratioB = integ(cB, xMin, xMax) / integ(cB, xMin, xFitMax); - params.assign(paramsSnap); + const double nBkgA_ref = combData->sumEntries("runCat==0", "fitRange") * ratioA; + const double nBkgB_ref = combData->sumEntries("runCat==1", "fitRange") * ratioB; - Res fitResultRef{ - simPdf.fitTo(*combData, Save(), Range("fitRange"), EvalBackend(EvalBackend::Legacy()), PrintLevel(-1))}; + auto getFinal = [&](const char *name) { return static_cast(fitResult->floatParsFinal().find(name)); }; - EXPECT_TRUE(fitResult->isIdentical(*fitResultRef)); + // Tolerance accounts for MINUIT's default convergence precision. The + // fit-vs-analytical mismatch caused by the bug in #18718 was several percent. + EXPECT_NEAR(getFinal("nBkgA")->getVal(), nBkgA_ref, 1e-3 * nBkgA_ref); + EXPECT_NEAR(getFinal("nBkgB")->getVal(), nBkgB_ref, 1e-3 * nBkgB_ref); } /// GitHub issue #20383. 
diff --git a/roofit/roofitcore/test/testSumW2Error.cxx b/roofit/roofitcore/test/testSumW2Error.cxx index 1a6338996712d..87f706e784f7c 100644 --- a/roofit/roofitcore/test/testSumW2Error.cxx +++ b/roofit/roofitcore/test/testSumW2Error.cxx @@ -13,10 +13,49 @@ #include -// These tests are disabled if the legacy backend is not available, because -// then we don't have any reference to compare to. -#ifdef ROOFIT_LEGACY_EVAL_BACKEND -// GitHub issue 9118: Problem running weighted binned fit in batch mode +#include +#include +#include + +namespace { + +// Compare two fit results parameter-by-parameter with relative tolerances on +// values and errors. The error of each parameter in `ref` is multiplied by +// `errorScale[name]` (default 1.0) before being compared to the same parameter +// in `r`. +void expectFitsCompatible(const RooFitResult &ref, const RooFitResult &r, double valTol, double errTol, + const std::map &errorScale = {}) +{ + ASSERT_EQ(ref.floatParsFinal().size(), r.floatParsFinal().size()); + + for (auto *p : ref.floatParsFinal()) { + const std::string name = p->GetName(); + auto *vRef = static_cast(p); + auto *vNew = static_cast(r.floatParsFinal().find(name.c_str())); + ASSERT_NE(vNew, nullptr) << "missing parameter " << name; + + const double valScale = std::max(std::abs(vRef->getVal()), 1.0); + EXPECT_NEAR(vRef->getVal(), vNew->getVal(), valTol * valScale) << "value mismatch for " << name; + + const double scale = errorScale.count(name) ? errorScale.at(name) : 1.0; + EXPECT_NEAR(vRef->getError() * scale, vNew->getError(), errTol * vRef->getError() * scale) + << "error mismatch for " << name; + } +} + +} // namespace + +// GitHub issue 9118: Problem running weighted binned fit in batch mode. +// +// Test the SumW2Error correction for a non-extended fit by checking the +// defining property of the correction: applied to data with a uniform weight +// w, it should reproduce the parameter values and errors of the equivalent +// unweighted fit. 
In a non-extended fit, all parameters are shape parameters +// that are invariant under uniform reweighting, so: +// - Values are unchanged whether the data is weighted or not. +// - Errors with SumW2(true) match the unweighted-fit errors. +// - Errors with SumW2(false) on weighted data scale as 1/sqrt(w), because +// the (unrescaled) Hessian of the likelihood scales linearly with w. TEST(SumW2Error, BatchMode) { RooHelpers::LocalChangeMsgLevel changeMsgLvl(RooFit::WARNING); @@ -37,11 +76,11 @@ TEST(SumW2Error, BatchMode) model.getParameters(dataSet->get(), params); params.snapshot(initialParams); - // these datasets will be filled with a weight that is not unity - RooDataSet dataSetWeighted("dataSetWeighted", "dataSetWeighted", *dataSet->get(), RooFit::WeightVar()); + const double w = 0.5; + RooDataSet dataSetWeighted("dataSetWeighted", "dataSetWeighted", *dataSet->get(), RooFit::WeightVar()); for (int i = 0; i < dataSet->numEntries(); ++i) { - dataSetWeighted.add(*dataSet->get(i), 0.5); + dataSetWeighted.add(*dataSet->get(i), w); } std::unique_ptr dataHist{dataSet->binnedClone()}; @@ -49,57 +88,49 @@ TEST(SumW2Error, BatchMode) using namespace RooFit; - auto fit = [&](RooAbsData &data, bool sumw2, EvalBackend evalBackend, std::string const &minimizer, - int printLevel = -1) { + auto fit = [&](RooAbsData &data, bool sumw2) { params.assign(initialParams); - - return std::unique_ptr{model.fitTo(data, Save(), SumW2Error(sumw2), Strategy(1), evalBackend, - Minimizer(minimizer.c_str()), PrintLevel(printLevel))}; + return std::unique_ptr{ + model.fitTo(data, Save(), SumW2Error(sumw2), Strategy(1), EvalBackend::Cpu(), PrintLevel(-1))}; }; - auto scalar = EvalBackend::Legacy(); - auto batchMode = EvalBackend::Cpu(); - - // Compare batch mode vs. 
scalar mode for non-SumW2 fits on UNWEIGHTED datasets - EXPECT_TRUE(fit(*dataSet, 0, scalar, "Minuit")->isIdentical(*fit(*dataSet, 0, batchMode, "Minuit"))) - << " different results for Minuit fit to RooDataSet without SumW2Error correction."; - EXPECT_TRUE(fit(*dataHist, 0, scalar, "Minuit")->isIdentical(*fit(*dataHist, 0, batchMode, "Minuit"))) - << " different results for Minuit fit to RooDataHist without SumW2Error correction."; - EXPECT_TRUE(fit(*dataSet, 0, scalar, "Minuit2")->isIdentical(*fit(*dataSet, 0, batchMode, "Minuit2"))) - << " different results for Minuit2 fit to RooDataSet without SumW2Error correction."; - EXPECT_TRUE(fit(*dataHist, 0, scalar, "Minuit2")->isIdentical(*fit(*dataHist, 0, batchMode, "Minuit2"))) - << " different results for Minuit2 fit to RooDataHist without SumW2Error correction."; - - // We can't compare the covariance matrix in these next cases, because it is - // externally provided. Still, it's okay because the parameter values and - // errors are compared, where the errors are inferred from the external - // covariance matrix. - - // Compare batch mode vs. scalar mode for SumW2 fits on UNWEIGHTED datasets - EXPECT_TRUE(fit(*dataSet, 1, scalar, "Minuit")->isIdenticalNoCov(*fit(*dataSet, 1, batchMode, "Minuit"))) - << " different results for Minuit fit to RooDataSet with SumW2Error correction."; - EXPECT_TRUE(fit(*dataHist, 1, scalar, "Minuit")->isIdenticalNoCov(*fit(*dataHist, 1, batchMode, "Minuit"))) - << " different results for Minuit fit to RooDataHist with SumW2Error correction."; - EXPECT_TRUE(fit(*dataSet, 1, scalar, "Minuit2")->isIdenticalNoCov(*fit(*dataSet, 1, batchMode, "Minuit2"))) - << " different results for Minuit2 fit to RooDataSet with SumW2Error correction."; - EXPECT_TRUE(fit(*dataHist, 1, scalar, "Minuit2")->isIdenticalNoCov(*fit(*dataHist, 1, batchMode, "Minuit2"))) - << " different results for Minuit2 fit to RooDataHist with SumW2Error correction."; - - // Compare batch mode vs. 
scalar mode for SumW2 fits on WEIGHTED datasets - EXPECT_TRUE( - fit(dataSetWeighted, 1, scalar, "Minuit")->isIdenticalNoCov(*fit(dataSetWeighted, 1, batchMode, "Minuit"))) - << " different results for Minuit fit to weighted RooDataSet with SumW2Error correction."; - EXPECT_TRUE( - fit(*dataHistWeighted, 1, scalar, "Minuit")->isIdenticalNoCov(*fit(*dataHistWeighted, 1, batchMode, "Minuit"))) - << " different results for Minuit fit to weighted RooDataHist with SumW2Error correction."; - EXPECT_TRUE( - fit(dataSetWeighted, 1, scalar, "Minuit2")->isIdenticalNoCov(*fit(dataSetWeighted, 1, batchMode, "Minuit2"))) - << " different results for Minuit2 fit to weighted RooDataSet with SumW2Error correction."; - EXPECT_TRUE( - fit(*dataHistWeighted, 1, scalar, "Minuit2")->isIdenticalNoCov(*fit(*dataHistWeighted, 1, batchMode, "Minuit2"))) - << " different results for Minuit2 fit to weighted RooDataHist with SumW2Error correction."; + const double valTol = 1e-4; + const double errTol = 1e-2; + + for (const auto &dataPair : std::vector>{ + {dataSet.get(), &dataSetWeighted}, {dataHist.get(), dataHistWeighted.get()}}) { + RooAbsData &dataUw = *dataPair.first; + RooAbsData &dataW = *dataPair.second; + + // Reference: fit the unweighted dataset without SumW2Error. + auto refFit = fit(dataUw, /*sumw2=*/false); + + // SumW2Error on unweighted data should be a near no-op (the internal + // sqrt(w)-weighted refit is the same fit), so values and errors + // should match the reference. + auto fitUwSw2 = fit(dataUw, /*sumw2=*/true); + expectFitsCompatible(*refFit, *fitUwSw2, valTol, errTol); + + // SumW2Error on weighted data should reproduce the unweighted errors. + auto fitWSw2 = fit(dataW, /*sumw2=*/true); + expectFitsCompatible(*refFit, *fitWSw2, valTol, errTol); + + // Without SumW2Error, the errors on a uniformly weighted dataset + // scale as 1/sqrt(w) for shape parameters. 
+ auto fitWNoSw2 = fit(dataW, /*sumw2=*/false); + std::map scale; + for (auto *p : refFit->floatParsFinal()) { + scale[p->GetName()] = 1.0 / std::sqrt(w); + } + expectFitsCompatible(*refFit, *fitWNoSw2, valTol, errTol, scale); + } } +// Test the SumW2Error correction in an extended fit, in both the full range +// and a subrange. The defining property of SumW2Error is the same as in the +// non-extended case (it should reproduce the unweighted-fit errors), but for +// extended fits the yield parameters scale linearly with the uniform weight +// w under reweighting, and so do their errors. TEST(SumW2Error, ExtendedFit) { RooHelpers::LocalChangeMsgLevel changeMsgLvl(RooFit::WARNING); @@ -116,60 +147,77 @@ TEST(SumW2Error, ExtendedFit) auto *shp = ws.pdf("shp"); std::unique_ptr dataNoWeights{shp->generate(RooArgSet(*x))}; - // For this test, use a uniform non-unity weight of 1.5. It was set to 0.1 - // in the past, but then there were fourth-digit differences between the - // scalar mode and the batch mode. However, this is most likeliy not - // pointing towards a flaw in the batch mode, which is why a value was - // handpicked for which the differences disappear. Any residual problems are - // most likely caused by the unnecessarily complicated implementation of the - // RooAddPdf extended term in the scalar mode: the coefficients are - // projected to the subrange by cached scale factors, while the batch mode - // just uses the same scaling factor as for the full likelihood. 
+ const double w = 1.5; auto *wFunc = ws.factory("w[1.5]"); - - auto *w = dataNoWeights->addColumn(*wFunc); - RooDataSet data{dataNoWeights->GetName(), - dataNoWeights->GetTitle(), - *dataNoWeights->get(), - RooFit::Import(*dataNoWeights), - RooFit::WeightVar(w->GetName())}; - RooDataHist datahist{"datahist", "datahist", *data.get(), data}; + auto *wcol = dataNoWeights->addColumn(*wFunc); + RooDataSet dataWeighted{dataNoWeights->GetName(), dataNoWeights->GetTitle(), *dataNoWeights->get(), + RooFit::Import(*dataNoWeights), RooFit::WeightVar(wcol->GetName())}; + RooDataHist datahistUw{"datahistUw", "", *dataNoWeights->get(), *dataNoWeights}; + RooDataHist datahistW{"datahistW", "", *dataWeighted.get(), dataWeighted}; RooArgSet params; RooArgSet initialParams; - shp->getParameters(dataNoWeights->get(), params); params.snapshot(initialParams); - auto doFit = [&](RooFit::EvalBackend evalBackend, bool sumW2Error, const char *range) { + auto doFit = [&](RooAbsData &data, bool sumw2, const char *range) { params.assign(initialParams); - return std::unique_ptr{shp->fitTo(datahist, Extended(), Range(range), Save(), - SumW2Error(sumW2Error), Strategy(1), PrintLevel(-1), evalBackend, - Minimizer("Minuit2", "migrad"))}; + return std::unique_ptr{shp->fitTo(data, Extended(), Range(range), Save(), SumW2Error(sumw2), + Strategy(1), PrintLevel(-1), EvalBackend::Cpu())}; }; - // compare batch mode and scalar mode fit results for full range - { - auto yy = doFit(EvalBackend::Cpu(), true, nullptr); - auto yn = doFit(EvalBackend::Cpu(), false, nullptr); - auto ny = doFit(EvalBackend::Legacy(), true, nullptr); - auto nn = doFit(EvalBackend::Legacy(), false, nullptr); - - EXPECT_TRUE(yy->isIdenticalNoCov(*ny)) << "different results for extended fit with SumW2Error in BatchMode"; - EXPECT_TRUE(yn->isIdenticalNoCov(*nn)) << "different results for extended fit without SumW2Error in BatchMode"; - } + // Build a "yield-scaled" reference where yield values and errors are + // multiplied by 
`factor`. Shape parameters are left unchanged. This + // produces the expected fit result for a uniformly weighted dataset under + // the SumW2 correction. + auto scaleYields = [&](RooFitResult const &r, double factor) { + auto out = std::unique_ptr(static_cast(r.Clone())); + for (auto *p : out->floatParsFinal()) { + const std::string name = p->GetName(); + if (name == "Nsig" || name == "Nbkg") { + auto *v = static_cast(p); + v->setVal(v->getVal() * factor); + v->setError(v->getError() * factor); + } + } + return out; + }; - // compare batch mode and scalar mode fit results for subrange - { - auto yy = doFit(EvalBackend::Cpu(), true, "subrange"); - auto yn = doFit(EvalBackend::Cpu(), false, "subrange"); - auto ny = doFit(EvalBackend::Legacy(), true, "subrange"); - auto nn = doFit(EvalBackend::Legacy(), false, "subrange"); - - EXPECT_TRUE(yy->isIdenticalNoCov(*ny)) - << "different results for extended fit in subrange with SumW2Error in BatchMode"; - EXPECT_TRUE(yn->isIdenticalNoCov(*nn)) - << "different results for extended fit in subrange without SumW2Error in BatchMode"; + for (const char *range : {static_cast(nullptr), "subrange"}) { + // The unweighted reference fit. Yield parameters in the model are + // defined over the full range of x (the model integrates over the full + // range internally), so for both `nullptr` and `"subrange"` the yield + // values represent total counts in [-10, 10]. + auto refFit = doFit(datahistUw, /*sumw2=*/false, range); + + // Tolerances are looser in the subrange because the fit is harder + // (smaller effective sample size and stronger correlations). + const bool isSubrange = range != nullptr; + const double valTol = isSubrange ? 5e-3 : 1e-3; + const double errTol = isSubrange ? 5e-2 : 2e-2; + + // The reference scaled by w in the yield values and errors. 
With + // SumW2(true) on uniformly weighted data, this matches the actual fit: + // - yield values scale by w (more weighted "counts" by factor w) + // - yield errors also scale by w (fluctuations of weighted counts) + // - shape parameters are invariant under uniform reweighting + auto refScaledByW = scaleYields(*refFit, w); + + auto fitWSw2 = doFit(datahistW, /*sumw2=*/true, range); + expectFitsCompatible(*refScaledByW, *fitWSw2, valTol, errTol); + + // Without SumW2, the yield values still scale by w, but the errors of + // *all* parameters get an additional factor 1/sqrt(w) relative to the + // yield-scaled reference. For yields this means the actual error scales + // as sqrt(w) (instead of w). For shape parameters this means the error + // scales as 1/sqrt(w) (instead of 1). This follows from the fact that + // the weighted-likelihood Hessian scales linearly with the weight. + std::map scaleNoSw2; + for (auto *p : refFit->floatParsFinal()) { + scaleNoSw2[p->GetName()] = 1.0 / std::sqrt(w); + } + + auto fitWNoSw2 = doFit(datahistW, /*sumw2=*/false, range); + expectFitsCompatible(*refScaledByW, *fitWNoSw2, valTol, errTol, scaleNoSw2); } } -#endif