ROCm · Micky774 · Jun 5, 2026 · Jun 5, 2026
@@ -29,7 +29,16 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv
 ```
 
 The samples CSV contains one row per timing sample with columns for all
-benchmark parameters plus `label`, `sample_idx`, and `time_ms`.
+benchmark parameters plus `label`, `sample_idx`, `time_ms`, `throughput`, and
+`unit`. The per-sample `throughput` is derived from the sample time (it is blank
+for samples-only records such as `Forward+Backward`, which carry no throughput).
+
+torch's autorange records only a few raw timing blocks per metric. To get
+enough samples for statistical comparison, `--min-samples N` (default 12)
+ensures at least `N` raw blocks are recorded, topping up any shortfall with
+additional equal-sized blocks rather than re-averaging the whole measurement.
+Statistical comparison (see below) needs at least ~10 samples, so keep the
+default (or higher) when producing samples CSVs for `--stats`.
 
 ## Shared configuration
 
@@ -75,3 +84,36 @@ python compare_results.py baseline.csv candidate.csv --bench-name GEMM
 
 The script auto-detects metric columns, computes speedups for overlapping rows,
 and reports rows that exist only in the baseline or only in the candidate.
+
+### Statistical comparison (`--stats`)
+
+The ratio comparison above uses point estimates and cannot tell a real
+regression from measurement noise. To test whether timing differences are
+statistically significant, run the benchmark with `--csv-samples` on both
+sides and compare the samples CSVs with `--stats`:
+
+```bash
+pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
+
+# baseline checkout
+python benchmark_gemm.py --csv-samples baseline_samples.csv --min-samples 12
+# candidate checkout
+python benchmark_gemm.py --csv-samples candidate_samples.csv --min-samples 12
+
+python compare_results.py baseline_samples.csv candidate_samples.csv --stats
+```
+
+`--stats` uses the [benchstats](https://github.com/Arech/benchstats) package to
+apply a Brunner-Munzel test (override with `--method`) at significance level
+`--alpha` (default `0.001`) to each `(config, label)` pair. It prints a table
+marking each benchmark as faster (`<`), slower (`>`), or not significantly
+different (`~`), and exits `1` when a significant difference in the timing
+metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`,
+`.txt`) to save the report, and `--always-show-pvalues` to show p-values for
+non-significant rows.
+
+Time (the main metric, which drives the exit code) is shown alongside the
+calculated throughput/bandwidth (`TFLOPS` or `GB/s`), reported as a secondary
+metric. Because the table requires a uniform metric set, samples-only composite
+rows that carry no throughput (e.g. `Forward+Backward`) are omitted from the
+comparison; their raw samples remain in the CSV.
@@ -7,12 +7,22 @@
 """
 Compare two CSVs from the same benchmark suite.
 
-Auto-detects metric columns (containing "TFLOPS" or "GB/s") and key columns.
-Outputs a markdown <details> block to stdout with per-config results,
-and optionally appends a summary table row to --summary-file.
+Two modes:
+
+1. Default (ratio) mode: compares two *aggregate* CSVs. Auto-detects metric
+   columns (containing "TFLOPS" or "GB/s") and key columns, computes throughput
+   speedups, and emits a markdown <details> block (optionally appending a
+   summary row to --summary-file).
+
+2. --stats mode: compares two *samples* CSVs (written with --csv-samples) using
+   a statistical test (Brunner-Munzel by default) via the benchstats package.
+   Reports whether per-config timing differences are significant and exits 1
+   when a significant regression is found (for CI gating). Requires
+   ``pip install -r requirements.txt``.
 
 Usage:
     python compare_results.py baseline.csv candidate.csv --bench-name NAME --summary-file FILE
+    python compare_results.py base_samples.csv cand_samples.csv --stats
 """
 
 import argparse
@@ -49,6 +59,75 @@ def print_key_table(title, rows_df, key_cols):
     print()
 
 
+def run_stats(args):
+    """Compare two samples CSVs with a statistical test via benchstats.
+
+    Timing (``time_ms``) is the main metric and drives the exit code; throughput
+    / bandwidth (``TFLOPS`` / ``GB/s``) is reported as a secondary metric when
+    present in the CSV.
+
+    Returns a process exit code: 1 if a significant difference is found in the
+    main (timing) metric, else 0.
+    """
+    import os
+
+    import rich.table  # noqa: F401  benchstats 3.4.0 render uses rich.table.Table without importing it
+    from parser_TEsamples import parser_TEsamples
+    from benchstats.compare import compareStats
+    from benchstats.render import renderComparisonResults
+    from benchstats.common import LoggingConsole, detectExportFormat
+
+    main_metrics = ["time_s"]
+
+    export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None
+    if export_fmt is not None and os.path.isfile(args.export_to):
+        os.remove(args.export_to)
+
+    console = LoggingConsole(
+        record=export_fmt is not None,
+        log_level=LoggingConsole.LogLevel.Warning,
+    )
+
+    # metrics=None exposes every metric the CSV carries (time + throughput).
+    s1 = parser_TEsamples(args.baseline_csv, None, None, debug_log=console).getStats()
+    s2 = parser_TEsamples(args.candidate_csv, None, None, debug_log=console).getStats()
+
+    cr = compareStats(
+        s1, s2,
+        method=args.method,
+        alpha=args.alpha,
+        main_metrics=main_metrics,
+        debug_log=console,
+    )
+
+    # Throughput metrics (e.g. TFLOPS / GB/s) are not times; blank benchstats'
+    # default per-value "s" suffix for them (the column header names the unit).
+    secondary = [m for m in cr.getMetrics() if m not in main_metrics]
+    style_overrides = {f"metric_{m}_unit": "" for m in secondary}
+
+    renderComparisonResults(
+        cr, console,
+        main_metrics=main_metrics,
+        style_overrides=style_overrides or None,
+        always_show_pvalues=args.always_show_pvalues,
+    )
+
+    if export_fmt is not None:
+        if export_fmt == "txt":
+            console.save_text(args.export_to)
+        elif export_fmt == "svg":
+            console.save_svg(args.export_to, title="")
+        elif export_fmt == "html":
+            console.save_html(args.export_to)
+
+    if cr.at_least_one_differs:
+        console.warning(
+            "At least one significant timing difference was detected (exit 1)."
+        )
+        return 1
+    return 0
+
+
 def main():
     parser = argparse.ArgumentParser(description="Compare benchmark CSVs")
     parser.add_argument("baseline_csv", help="Baseline CSV")
@@ -67,8 +146,35 @@ def main():
             "Set to 0 to disable the filter."
         ),
     )
+
+    stats_group = parser.add_argument_group(
+        "statistical comparison (--stats mode; operates on --csv-samples CSVs)"
+    )
+    stats_group.add_argument(
+        "--stats", action="store_true",
+        help="Compare per-sample CSVs with a statistical test via benchstats.",
+    )
+    stats_group.add_argument(
+        "--alpha", type=float, default=0.001,
+        help="Significance level for the test (default: 0.001).",
+    )
+    stats_group.add_argument(
+        "--method", default="brunnermunzel",
+        help="Statistical test to use (default: brunnermunzel).",
+    )
+    stats_group.add_argument(
+        "--always-show-pvalues", action="store_true",
+        help="Always show p-values, including for non-significant results.",
+    )
+    stats_group.add_argument(
+        "--export-to", default=None, metavar="FILE",
+        help="Export the report to a .txt/.svg/.html file (format from extension).",
+    )
     args = parser.parse_args()
 
+    if args.stats:
+        return run_stats(args)
+
     baseline_df = pd.read_csv(args.baseline_csv)
     candidate_df = pd.read_csv(args.candidate_csv)
 

@@ -0,0 +1,183 @@
+#!/usr/bin/env python
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""benchstats parser for Transformer Engine microbenchmark samples CSVs.
+
+Reads the per-sample CSV produced by ``run_benchmarks(...)`` with the
+``--csv-samples`` flag and turns it into the ``{benchmark_name: {metric: ndarray}}``
+structure consumed by ``benchstats.compare.compareStats``.
+
+Columns of the samples CSV: the benchmark parameter columns, plus ``label``,
+``sample_idx``, ``time_ms``, ``throughput`` and ``unit``. A benchmark name is
+built by joining every parameter column and ``label``, so each unique
+(parameters, label) combination becomes one benchmark.
+
+Two metrics are exposed:
+
+- ``time_s`` (seconds, lower is better) -- always present; intended as the
+  *main* metric. Exposed in seconds because benchstats' renderer auto-scales
+  time values (to ms/us/ns) assuming a seconds base unit.
+- the throughput metric, keyed by its unit (e.g. ``TFLOPS`` or ``GB/s``; higher
+  is better) -- present when the CSV carries throughput values.
+
+``benchstats``' renderer requires every benchmark to expose the same metric set.
+Records without throughput (the samples-only ``Forward+Backward`` composites) are
+therefore dropped from the comparison when throughput is available for the other
+benchmarks; their raw samples remain in the CSV for other downstream analysis.
+
+The class name matches the file name (``parser_TEsamples``) so it can also be
+loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``.
+"""
+
+import re
+
+import numpy as np
+import pandas as pd
+
+from benchstats.common import ParserBase, LoggingConsole
+
+_TIME_COL = "time_ms"          # column name in the samples CSV (milliseconds)
+_TIME_KEY = "time_s"           # metric key exposed to benchstats (seconds)
+_THR_COL = "throughput"
+_UNIT_COL = "unit"
+_GENERIC_THR = "throughput"
+_NON_NAME_COLS = ("sample_idx", _TIME_COL, _THR_COL, _UNIT_COL)
+_NAME_DELIM = " | "
+
+
+class parser_TEsamples(ParserBase):
+    def __init__(self, csv_file_path, filter, metrics=None, debug_log=True) -> None:
+        assert isinstance(csv_file_path, str)
+        assert filter is None or isinstance(filter, (str, re.Pattern))
+        assert metrics is None or (
+            isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics)
+        )
+
+        if debug_log is None or (isinstance(debug_log, bool) and not debug_log):
+            self.debug_log = False
+        elif isinstance(debug_log, bool) and debug_log:
+            self.debug_log = True
+            self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug)
+        else:
+            self.debug_log = True
+            self.logger = debug_log
+
+        self.file = csv_file_path
+        self.filter = (
+            filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
+        )
+        self._requested_metrics = list(metrics) if metrics is not None else None
+        self._stats = self._build()
+
+    def getStats(self) -> dict[str, dict[str, np.ndarray]]:
+        return self._stats
+
+    def _log(self, level, msg):
+        if self.debug_log:
+            getattr(self.logger, level)(f"parser_TEsamples: {msg}")
+
+    def _build(self) -> dict[str, dict[str, np.ndarray]]:
+        df = pd.read_csv(self.file)
+
+        if _TIME_COL not in df.columns or "sample_idx" not in df.columns:
+            raise ValueError(
+                f"'{self.file}' is missing 'time_ms'/'sample_idx' columns. "
+                "Was it written with --csv-samples?"
+            )
+
+        name_cols = [c for c in df.columns if c not in _NON_NAME_COLS]
+        if not name_cols:
+            raise ValueError(f"No benchmark-name columns found in '{self.file}'.")
+
+        df[_TIME_COL] = pd.to_numeric(df[_TIME_COL], errors="coerce")
+        has_thr_col = _THR_COL in df.columns
+        if has_thr_col:
+            df[_THR_COL] = pd.to_numeric(df[_THR_COL], errors="coerce")
+
+        # First pass: collect per-benchmark time samples, throughput samples and unit.
+        per_bm = {}  # name -> {"time": ndarray, "thr": ndarray|None, "unit": str|None}
+        for key_vals, group in df.groupby(name_cols, sort=False):
+            if not isinstance(key_vals, tuple):
+                key_vals = (key_vals,)
+            bm_name = _NAME_DELIM.join(str(v) for v in key_vals)
+
+            if self.filter is not None and self.filter.search(bm_name) is None:
+                continue
+
+            time_ms = group[_TIME_COL].to_numpy(dtype=np.float64)
+            time_ms = time_ms[np.isfinite(time_ms)]
+            if time_ms.size == 0:
+                self._log("warning", f"benchmark '{bm_name}' has no finite time samples; skipping.")
+                continue
+            # benchstats' renderer auto-scales assuming seconds, so expose seconds.
+            time_s = time_ms / 1e3
+
+            thr_s, unit = None, None
+            if has_thr_col:
+                thr_s = group[_THR_COL].to_numpy(dtype=np.float64)
+                thr_s = thr_s[np.isfinite(thr_s) & (thr_s > 0)]
+                if thr_s.size == 0:
+                    thr_s = None
+                else:
+                    units = [u for u in group[_UNIT_COL].astype(str).unique()
+                             if u and u.lower() != "nan"] if _UNIT_COL in df.columns else []
+                    unit = units[0] if len(units) == 1 else (_GENERIC_THR if units else None)
+
+            if self.debug_log and time_s.size < 10:
+                self._log(
+                    "warning",
+                    f"benchmark '{bm_name}' has only {time_s.size} samples (>= 10 recommended); "
+                    "re-run with a larger --min-samples.",
+                )
+            per_bm[bm_name] = {"time": time_s, "thr": thr_s, "unit": unit}
+
+        if not per_bm:
+            self._log("warning", f"no benchmarks read from '{self.file}'.")
+            return {}
+
+        # Decide on a uniform metric set across all benchmarks.
+        with_thr = {n: d for n, d in per_bm.items() if d["thr"] is not None}
+        thr_key = self._resolve_throughput_key(with_thr)
+
+        if thr_key is not None and 0 < len(with_thr) < len(per_bm):
+            dropped = sorted(set(per_bm) - set(with_thr))
+            self._log(
+                "warning",
+                f"excluding {len(dropped)} benchmark(s) without throughput from the comparison so "
+                f"throughput can be shown uniformly: {', '.join(dropped)}",
+            )
+            per_bm = with_thr
+
+        # Build the result, honoring an explicit metric request if given.
+        stats = {}
+        for bm_name, d in per_bm.items():
+            entry = {}
+            if self._metric_requested(_TIME_KEY, thr_key):
+                entry[_TIME_KEY] = d["time"]
+            if thr_key is not None and d["thr"] is not None and self._metric_requested(thr_key, thr_key):
+                entry[thr_key] = d["thr"]
+            if entry:
+                stats[bm_name] = entry
+        return stats
+
+    def _resolve_throughput_key(self, with_thr):
+        """Return a single throughput metric key shared by all throughput-bearing benchmarks."""
+        if not with_thr:
+            return None
+        units = {d["unit"] for d in with_thr.values()}
+        if len(units) == 1:
+            return next(iter(units)) or _GENERIC_THR
+        return _GENERIC_THR  # mixed units in one file (atypical) -> generic header
+
+    def _metric_requested(self, key, thr_key):
+        """Honor an explicit metrics= request (benchstats CLI), else expose everything."""
+        if self._requested_metrics is None:
+            return True
+        req = self._requested_metrics
+        if key == _TIME_KEY:
+            return any(t in req for t in (_TIME_KEY, _TIME_COL, "time"))
+        # throughput: match the unit key, the generic name, or literal 'throughput'
+        return key in req or _GENERIC_THR in req or _THR_COL in req
@@ -0,0 +1,3 @@
+# Extra dependencies for statistical benchmark comparison
+# (compare_results.py --stats). benchstats pulls in rich, scipy and numpy.
+benchstats>=3.4