diff --git a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md
index ba868b0f0..40f7a1eed 100644
--- a/benchmarks/microbenchmarks/README.md
+++ b/benchmarks/microbenchmarks/README.md
@@ -29,7 +29,16 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv
 ```
 
 The samples CSV contains one row per timing sample with columns for all
-benchmark parameters plus `label`, `sample_idx`, and `time_ms`.
+benchmark parameters plus `label`, `sample_idx`, `time_ms`, `throughput`, and
+`unit`. The per-sample `throughput` is derived from the sample time (it is blank
+for samples-only records such as `Forward+Backward`, which carry no throughput).
+
+PyTorch's autorange records only a few raw timing blocks per metric. To get
+enough samples for statistical comparison, `--min-samples N` (default 12)
+ensures at least `N` raw blocks are recorded, topping up any shortfall with
+additional equal-sized blocks rather than re-averaging the whole measurement.
+Statistical comparison (see below) needs about 10 or more samples, so keep the
+default (or higher) when producing samples CSVs for `--stats`.
 
 ## Shared configuration
 
@@ -75,3 +84,36 @@ python compare_results.py baseline.csv candidate.csv --bench-name GEMM
 
 The script auto-detects metric columns, computes speedups for overlapping rows,
 and reports rows that exist only in the baseline or only in the candidate.
+
+### Statistical comparison (`--stats`)
+
+The ratio comparison above uses point estimates and cannot tell a real
+regression from measurement noise. To test whether timing differences are
+statistically significant, run the benchmark with `--csv-samples` on both
+sides and compare the samples CSVs with `--stats`:
+
+```bash
+pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
+
+# baseline checkout
+python benchmark_gemm.py --csv-samples baseline_samples.csv --min-samples 12
+# candidate checkout
+python benchmark_gemm.py --csv-samples candidate_samples.csv --min-samples 12
+
+python compare_results.py baseline_samples.csv candidate_samples.csv --stats
+```
+
+`--stats` uses the [benchstats](https://github.com/Arech/benchstats) package to
+apply a Brunner-Munzel test (override with `--method`) at significance level
+`--alpha` (default `0.001`) to each `(config, label)` pair. It prints a table
+marking each benchmark as faster (`<`), slower (`>`), or not significantly
+different (`~`), and exits `1` when a significant difference in the timing
+metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`,
+`.txt`) to save the report, and `--always-show-pvalues` to show p-values for
+non-significant rows.
+
+Time (the main metric, which drives the exit code) is shown alongside the
+calculated throughput/bandwidth (`TFLOPS` or `GB/s`), reported as a secondary
+metric. Because the table requires a uniform metric set, samples-only composite
+rows that carry no throughput (e.g. `Forward+Backward`) are omitted from the
+comparison; their raw samples remain in the CSV.
diff --git a/benchmarks/microbenchmarks/compare_results.py b/benchmarks/microbenchmarks/compare_results.py
index 4a7e1dab8..1c231af86 100755
--- a/benchmarks/microbenchmarks/compare_results.py
+++ b/benchmarks/microbenchmarks/compare_results.py
@@ -7,12 +7,22 @@
 """
 Compare two CSVs from the same benchmark suite.
 
-Auto-detects metric columns (containing "TFLOPS" or "GB/s") and key columns.
-Outputs a markdown <details> block to stdout with per-config results,
-and optionally appends a summary table row to --summary-file.
+Two modes:
+
+1. Default (ratio) mode: compares two *aggregate* CSVs. Auto-detects metric
+   columns (containing "TFLOPS" or "GB/s") and key columns, computes throughput
+   speedups, and emits a markdown <details> block (optionally appending a
+   summary row to --summary-file).
+
+2. --stats mode: compares two *samples* CSVs (written with --csv-samples) using
+   a statistical test (Brunner-Munzel by default) via the benchstats package.
+   Reports whether per-config timing differences are significant and exits 1
+   when a significant timing difference is found (for CI gating). Requires
+   ``pip install -r requirements.txt``.
 
 Usage:
     python compare_results.py baseline.csv candidate.csv --bench-name NAME --summary-file FILE
+    python compare_results.py base_samples.csv cand_samples.csv --stats
 """
 
 import argparse
@@ -49,6 +59,75 @@ def print_key_table(title, rows_df, key_cols):
     print()
 
 
+def run_stats(args):
+    """Compare two samples CSVs with a statistical test via benchstats.
+
+    Timing (``time_ms``) is the main metric and drives the exit code; throughput
+    / bandwidth (``TFLOPS`` / ``GB/s``) is reported as a secondary metric when
+    present in the CSV.
+
+    Returns a process exit code: 1 if a significant difference is found in the
+    main (timing) metric, else 0.
+    """
+    import os
+
+    import rich.table  # noqa: F401  benchstats 3.4.0 render uses rich.table.Table without importing it
+    from parser_TEsamples import parser_TEsamples
+    from benchstats.compare import compareStats
+    from benchstats.render import renderComparisonResults
+    from benchstats.common import LoggingConsole, detectExportFormat
+
+    main_metrics = ["time_s"]  # must match the time metric key exposed by parser_TEsamples
+
+    export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None  # None: no export requested
+    if export_fmt is not None and os.path.isfile(args.export_to):
+        os.remove(args.export_to)  # delete an existing report file before a new one is written
+
+    console = LoggingConsole(
+        record=export_fmt is not None,  # record console output only when it will be exported
+        log_level=LoggingConsole.LogLevel.Warning,
+    )
+
+    # metrics=None exposes every metric the CSV carries (time + throughput).
+    s1 = parser_TEsamples(args.baseline_csv, None, None, debug_log=console).getStats()  # positional Nones: filter, metrics
+    s2 = parser_TEsamples(args.candidate_csv, None, None, debug_log=console).getStats()
+
+    cr = compareStats(
+        s1, s2,
+        method=args.method,
+        alpha=args.alpha,
+        main_metrics=main_metrics,
+        debug_log=console,
+    )
+
+    # Throughput metrics (e.g. TFLOPS / GB/s) are not times; blank benchstats'
+    # default per-value "s" suffix for them (the column header names the unit).
+    secondary = [m for m in cr.getMetrics() if m not in main_metrics]
+    style_overrides = {f"metric_{m}_unit": "" for m in secondary}
+
+    renderComparisonResults(
+        cr, console,
+        main_metrics=main_metrics,
+        style_overrides=style_overrides or None,  # empty dict (no secondary metrics) is passed as None
+        always_show_pvalues=args.always_show_pvalues,
+    )
+
+    if export_fmt is not None:
+        if export_fmt == "txt":
+            console.save_text(args.export_to)
+        elif export_fmt == "svg":
+            console.save_svg(args.export_to, title="")
+        elif export_fmt == "html":
+            console.save_html(args.export_to)
+
+    if cr.at_least_one_differs:  # NOTE(review): presumably set for speedups as well as slowdowns -- confirm
+        console.warning(
+            "At least one significant timing difference was detected (exit 1)."
+        )
+        return 1
+    return 0
+
+
 def main():
     parser = argparse.ArgumentParser(description="Compare benchmark CSVs")
     parser.add_argument("baseline_csv", help="Baseline CSV")
@@ -67,8 +146,35 @@ def main():
             "Set to 0 to disable the filter."
         ),
     )
+
+    stats_group = parser.add_argument_group(
+        "statistical comparison (--stats mode; operates on --csv-samples CSVs)"
+    )
+    stats_group.add_argument(
+        "--stats", action="store_true",
+        help="Compare per-sample CSVs with a statistical test via benchstats.",
+    )
+    stats_group.add_argument(
+        "--alpha", type=float, default=0.001,
+        help="Significance level for the test (default: 0.001).",
+    )
+    stats_group.add_argument(
+        "--method", default="brunnermunzel",
+        help="Statistical test to use (default: brunnermunzel).",
+    )
+    stats_group.add_argument(
+        "--always-show-pvalues", action="store_true",
+        help="Always show p-values, including for non-significant results.",
+    )
+    stats_group.add_argument(
+        "--export-to", default=None, metavar="FILE",
+        help="Export the report to a .txt/.svg/.html file (format from extension).",
+    )
     args = parser.parse_args()
 
+    if args.stats:
+        return run_stats(args)
+
     baseline_df = pd.read_csv(args.baseline_csv)
     candidate_df = pd.read_csv(args.candidate_csv)
 
diff --git a/benchmarks/microbenchmarks/parser_TEsamples.py b/benchmarks/microbenchmarks/parser_TEsamples.py
new file mode 100644
index 000000000..44d472158
--- /dev/null
+++ b/benchmarks/microbenchmarks/parser_TEsamples.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""benchstats parser for Transformer Engine microbenchmark samples CSVs.
+
+Reads the per-sample CSV produced by ``run_benchmarks(...)`` with the
+``--csv-samples`` flag and turns it into the ``{benchmark_name: {metric: ndarray}}``
+structure consumed by ``benchstats.compare.compareStats``.
+
+Columns of the samples CSV: the benchmark parameter columns, plus ``label``,
+``sample_idx``, ``time_ms``, ``throughput`` and ``unit``. A benchmark name is
+built by joining every parameter column and ``label``, so each unique
+(parameters, label) combination becomes one benchmark.
+
+Two metrics are exposed:
+
+- ``time_s`` (seconds, lower is better) -- always present; intended as the
+  *main* metric. Exposed in seconds because benchstats' renderer auto-scales
+  time values (to ms/us/ns) assuming a seconds base unit.
+- the throughput metric, keyed by its unit (e.g. ``TFLOPS`` or ``GB/s``; higher
+  is better) -- present when the CSV carries throughput values.
+
+``benchstats``' renderer requires every benchmark to expose the same metric set.
+Records without throughput (the samples-only ``Forward+Backward`` composites) are
+therefore dropped from the comparison when throughput is available for the other
+benchmarks; their raw samples remain in the CSV for other downstream analysis.
+
+The class name matches the file name (``parser_TEsamples``) so it can also be
+loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``.
+"""
+
+import re
+
+import numpy as np
+import pandas as pd
+
+from benchstats.common import ParserBase, LoggingConsole
+
+_TIME_COL = "time_ms"          # column name in the samples CSV (milliseconds)
+_TIME_KEY = "time_s"           # metric key exposed to benchstats (seconds)
+_THR_COL = "throughput"
+_UNIT_COL = "unit"
+_GENERIC_THR = "throughput"
+_NON_NAME_COLS = ("sample_idx", _TIME_COL, _THR_COL, _UNIT_COL)
+_NAME_DELIM = " | "
+
+
+class parser_TEsamples(ParserBase):
+    def __init__(self, csv_file_path, filter, metrics=None, debug_log=True) -> None:
+        assert isinstance(csv_file_path, str)  # NOTE: asserts are skipped under ``python -O``
+        assert filter is None or isinstance(filter, (str, re.Pattern))
+        assert metrics is None or (
+            isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics)
+        )
+        # debug_log: None/False disables logging, True creates a Debug-level console, else it is the logger.
+        if debug_log is None or (isinstance(debug_log, bool) and not debug_log):
+            self.debug_log = False  # self.logger is deliberately left unset in this mode
+        elif isinstance(debug_log, bool) and debug_log:
+            self.debug_log = True
+            self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug)
+        else:
+            self.debug_log = True
+            self.logger = debug_log  # caller-supplied logger object
+        # A string filter is compiled once here; _build() applies it with .search() to benchmark names.
+        self.file = csv_file_path
+        self.filter = (
+            filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
+        )
+        self._requested_metrics = list(metrics) if metrics is not None else None  # None: expose every metric
+        self._stats = self._build()  # the CSV is parsed eagerly, at construction time
+
+    def getStats(self) -> dict[str, dict[str, np.ndarray]]:  # {benchmark_name: {metric_key: samples}}
+        return self._stats  # built once in __init__ by _build()
+
+    def _log(self, level, msg):  # level: name of a logger method, e.g. "warning"
+        if self.debug_log:  # self.logger is not set when logging is disabled, so guard first
+            getattr(self.logger, level)(f"parser_TEsamples: {msg}")
+
+    def _build(self) -> dict[str, dict[str, np.ndarray]]:
+        df = pd.read_csv(self.file)
+
+        if _TIME_COL not in df.columns or "sample_idx" not in df.columns:
+            raise ValueError(
+                f"'{self.file}' is missing 'time_ms'/'sample_idx' columns. "
+                "Was it written with --csv-samples?"
+            )
+
+        name_cols = [c for c in df.columns if c not in _NON_NAME_COLS]
+        if not name_cols:
+            raise ValueError(f"No benchmark-name columns found in '{self.file}'.")
+
+        df[_TIME_COL] = pd.to_numeric(df[_TIME_COL], errors="coerce")
+        has_thr_col = _THR_COL in df.columns
+        if has_thr_col:
+            df[_THR_COL] = pd.to_numeric(df[_THR_COL], errors="coerce")
+
+        # First pass: collect per-benchmark time samples, throughput samples and unit.
+        per_bm = {}  # name -> {"time": ndarray, "thr": ndarray|None, "unit": str|None}
+        for key_vals, group in df.groupby(name_cols, sort=False):
+            if not isinstance(key_vals, tuple):
+                key_vals = (key_vals,)
+            bm_name = _NAME_DELIM.join(str(v) for v in key_vals)
+
+            if self.filter is not None and self.filter.search(bm_name) is None:
+                continue
+
+            time_ms = group[_TIME_COL].to_numpy(dtype=np.float64)
+            time_ms = time_ms[np.isfinite(time_ms)]
+            if time_ms.size == 0:
+                self._log("warning", f"benchmark '{bm_name}' has no finite time samples; skipping.")
+                continue
+            # benchstats' renderer auto-scales assuming seconds, so expose seconds.
+            time_s = time_ms / 1e3
+
+            thr_s, unit = None, None
+            if has_thr_col:
+                thr_s = group[_THR_COL].to_numpy(dtype=np.float64)
+                thr_s = thr_s[np.isfinite(thr_s) & (thr_s > 0)]
+                if thr_s.size == 0:
+                    thr_s = None
+                else:
+                    units = [u for u in group[_UNIT_COL].astype(str).unique()
+                             if u and u.lower() != "nan"] if _UNIT_COL in df.columns else []
+                    unit = units[0] if len(units) == 1 else (_GENERIC_THR if units else None)
+
+            if self.debug_log and time_s.size < 10:
+                self._log(
+                    "warning",
+                    f"benchmark '{bm_name}' has only {time_s.size} samples (>= 10 recommended); "
+                    "re-run with a larger --min-samples.",
+                )
+            per_bm[bm_name] = {"time": time_s, "thr": thr_s, "unit": unit}
+
+        if not per_bm:
+            self._log("warning", f"no benchmarks read from '{self.file}'.")
+            return {}
+
+        # Decide on a uniform metric set across all benchmarks.
+        with_thr = {n: d for n, d in per_bm.items() if d["thr"] is not None}
+        thr_key = self._resolve_throughput_key(with_thr)
+
+        if thr_key is not None and 0 < len(with_thr) < len(per_bm):
+            dropped = sorted(set(per_bm) - set(with_thr))
+            self._log(
+                "warning",
+                f"excluding {len(dropped)} benchmark(s) without throughput from the comparison so "
+                f"throughput can be shown uniformly: {', '.join(dropped)}",
+            )
+            per_bm = with_thr
+
+        # Build the result, honoring an explicit metric request if given.
+        stats = {}
+        for bm_name, d in per_bm.items():
+            entry = {}
+            if self._metric_requested(_TIME_KEY, thr_key):
+                entry[_TIME_KEY] = d["time"]
+            if thr_key is not None and d["thr"] is not None and self._metric_requested(thr_key, thr_key):
+                entry[thr_key] = d["thr"]
+            if entry:
+                stats[bm_name] = entry
+        return stats
+
+    def _resolve_throughput_key(self, with_thr):
+        """Return the throughput metric key shared by every entry of *with_thr*, or None if empty."""
+        if not with_thr:
+            return None
+        units = {d["unit"] for d in with_thr.values()}
+        if len(units) == 1:
+            return next(iter(units)) or _GENERIC_THR  # sole unit; generic name when that unit is None/empty
+        return _GENERIC_THR  # mixed units in one file (atypical) -> generic header
+
+    def _metric_requested(self, key, thr_key):  # thr_key is accepted but not used in the body
+        """Return whether metric *key* should be exposed given the optional explicit metrics= request."""
+        if self._requested_metrics is None:
+            return True  # no explicit request -> expose every metric
+        req = self._requested_metrics
+        if key == _TIME_KEY:
+            return any(t in req for t in (_TIME_KEY, _TIME_COL, "time"))  # exposed key, CSV column, or "time"
+        # throughput: match the unit key or 'throughput' (_GENERIC_THR and _THR_COL are the same string)
+        return key in req or _GENERIC_THR in req or _THR_COL in req
diff --git a/benchmarks/microbenchmarks/requirements.txt b/benchmarks/microbenchmarks/requirements.txt
new file mode 100644
index 000000000..32aa6a635
--- /dev/null
+++ b/benchmarks/microbenchmarks/requirements.txt
@@ -0,0 +1,3 @@
+# Extra dependencies for statistical benchmark comparison
+# (compare_results.py --stats). benchstats pulls in rich, scipy and numpy.
+benchstats>=3.4
diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py
index 99ae6a244..929019ee8 100644
--- a/benchmarks/microbenchmarks/utils.py
+++ b/benchmarks/microbenchmarks/utils.py
@@ -21,6 +21,12 @@
 
 DEFAULT_MIN_RUN_TIME_SECONDS = 0.2
 
+# Minimum number of raw timing samples (blocks) ``time_func`` ensures when a
+# caller passes ``min_samples=None``. ``run_benchmarks`` sets this from the
+# ``--min-samples`` CLI flag so every benchmark script inherits the knob without
+# per-script edits. ``None`` leaves torch's autorange result untouched.
+_ACTIVE_MIN_SAMPLES = None
+
 # ---------------------------------------------------------------------------
 # Model configurations
 # ---------------------------------------------------------------------------
@@ -88,22 +94,60 @@ def generate_gemm_test_cases(configs=None, m_sizes=None, dtypes=None):
 # Timing helpers
 # ---------------------------------------------------------------------------
 
-def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS):
+class _RawSamples:
+    """Minimal ``torch...benchmark.Measurement`` stand-in holding raw block times.
+
+    Exposes the ``times`` (per-run seconds, one entry per recorded timing block),
+    ``number_per_run`` and ``mean`` attributes that ``time_func`` callers and the
+    samples-CSV writer rely on.
+    """
+
+    def __init__(self, times, number_per_run, mean):
+        self.times = times
+        self.number_per_run = number_per_run
+        self.mean = mean
+
+
+def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS,
+              min_samples=None):
     """Time *fn* and return ``(mean_ms, measurement)``.
 
-    The ``Measurement`` object carries per-sample times accessible via
-    ``measurement.times`` (total wall time per run) and
-    ``measurement.number_per_run``.
+    The returned measurement exposes per-run sample times via
+    ``measurement.times`` -- one entry per recorded timing block (each block is
+    an average over ``measurement.number_per_run`` executions, as chosen by
+    torch to amortize timer overhead).
 
     method: "adaptive" uses adaptive_autorange (good for compute-bound),
             "blocked"  uses blocked_autorange  (good for memory-bound).
+
+    min_samples: ensure at least this many raw timing blocks are recorded, so the
+        per-sample data is large enough for statistical comparison
+        (compare_results.py --stats). torch's autorange usually records only a
+        few blocks; any shortfall is topped up with additional equal-sized blocks
+        rather than re-running and re-averaging the whole measurement. ``None``
+        falls back to the module-level ``_ACTIVE_MIN_SAMPLES`` (set from
+        ``--min-samples``); ``None`` there too leaves the autorange result as-is.
     """
+    if min_samples is None:
+        min_samples = _ACTIVE_MIN_SAMPLES
+
     timer = benchmark.Timer(stmt="fn()", globals={"fn": fn})
     if method == "blocked":
         m = timer.blocked_autorange(min_run_time=min_run_time)
     else:
         m = timer.adaptive_autorange(min_run_time=min_run_time)
-    return m.mean * 1e3, m
+
+    if min_samples is None or len(m.times) >= min_samples:
+        return m.mean * 1e3, m
+
+    # Top up with additional equal-sized blocks (each timeit() records one block
+    # averaged over number_per_run runs) until enough raw samples are collected.
+    times = list(m.times)  # per-run seconds
+    number = m.number_per_run
+    while len(times) < min_samples:
+        times.append(timer.timeit(number).mean)
+    mean_s = sum(times) / len(times)
+    return mean_s * 1e3, _RawSamples(times=times, number_per_run=number, mean=mean_s)
 
 
 # ---------------------------------------------------------------------------
@@ -266,6 +310,16 @@ def make_parser(**kwargs):
             "Optional filename; default derived from script name."
         ),
     )
+    parser.add_argument(
+        "--min-samples", type=int, default=12, metavar="N",
+        help=(
+            "Ensure at least N raw timing samples (blocks) are recorded per "
+            "metric for statistical comparison (compare_results.py --stats). "
+            "torch's autorange records only a few; any shortfall is topped up "
+            "with additional equal-sized blocks. Use a small value (e.g. 2) to "
+            "effectively disable top-up. Default: 12."
+        ),
+    )
     return parser
 
 
@@ -301,6 +355,11 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None,
     if args is None:
         args = make_parser().parse_args()
 
+    # Let time_func (called by bench_fns without an explicit min_samples arg)
+    # inherit the CLI value without editing every benchmark script.
+    global _ACTIVE_MIN_SAMPLES
+    _ACTIVE_MIN_SAMPLES = getattr(args, "min_samples", None)
+
     rows = []
     all_case_metrics = []
     resolved_metric_columns = None
@@ -355,16 +414,36 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None,
                 if measurement is None:
                     continue
                 lbl = metric["label"]
+                unit = metric.get("unit")
+                thr_mean = metric.get("throughput") or 0.0
+                ms_mean = metric.get("ms") or 0.0
+                # Throughput is a deterministic function of time for a given
+                # config (throughput = C / time), so a per-sample throughput is
+                # recovered from the aggregate as thr_mean * ms_mean / sample_ms.
+                # samples_only records (e.g. Forward+Backward) carry no
+                # throughput and are left blank.
+                has_thr = (
+                    not metric.get("samples_only") and thr_mean > 0 and ms_mean > 0
+                )
                 for i, t in enumerate(measurement.times):
+                    # measurement.times entries are already per-run (seconds).
+                    sample_ms = t * 1e3
                     sr = dict(case_params)
                     sr["label"] = lbl
                     sr["sample_idx"] = i
-                    sr["time_ms"] = t / measurement.number_per_run * 1e3
+                    sr["time_ms"] = sample_ms
+                    sr["throughput"] = (
+                        thr_mean * ms_mean / sample_ms
+                        if has_thr and sample_ms > 0
+                        else ""
+                    )
+                    sr["unit"] = unit if has_thr else ""
                     sample_rows.append(sr)
         if sample_rows:
             df = pd.DataFrame(
                 sample_rows,
-                columns=param_columns + ["label", "sample_idx", "time_ms"],
+                columns=param_columns
+                + ["label", "sample_idx", "time_ms", "throughput", "unit"],
             )
             df.to_csv(samples_csv, index=False)
             print(f"Samples saved to {samples_csv}")