From 02df3c75980a23263b1bc3089db63b55bac31041 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Fri, 5 Jun 2026 16:31:07 +0000
Subject: [PATCH 1/2] Add statistical benchmark comparison via benchstats

Adds a statistically-rigorous comparison path to the microbenchmark suite,
on top of the existing throughput-ratio comparison.

- utils.py: time_func gains a repetitions param; --repetitions flag (default
  15) threaded through run_benchmarks so all bench scripts collect enough
  per-sample data with no per-script edits. repetitions<=1 preserves the
  original single-measurement behavior.
- parser_TEsamples.py: benchstats ParserBase reading the --csv-samples CSV
  into {bench_name: {time_ms: ndarray}}.
- compare_results.py: new --stats mode running a Brunner-Munzel test (via
  benchstats) on two samples CSVs; exits 1 on a significant timing difference
  for CI gating. Default ratio path unchanged; benchstats lazy-imported.
- requirements.txt: benchstats>=3.4.
- README: document the repetitions knob and --stats workflow.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/microbenchmarks/README.md          |  33 ++++++
 benchmarks/microbenchmarks/compare_results.py | 101 ++++++++++++++++-
 .../microbenchmarks/parser_TEsamples.py       | 107 ++++++++++++++++++
 benchmarks/microbenchmarks/requirements.txt   |   3 +
 benchmarks/microbenchmarks/utils.py           |  70 ++++++++++--
 5 files changed, 302 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/microbenchmarks/parser_TEsamples.py
 create mode 100644 benchmarks/microbenchmarks/requirements.txt

diff --git a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md
index ba868b0f0..67e5ccc2a 100644
--- a/benchmarks/microbenchmarks/README.md
+++ b/benchmarks/microbenchmarks/README.md
@@ -31,6 +31,12 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv
 The samples CSV contains one row per timing sample with columns for all
 benchmark parameters plus `label`, `sample_idx`, and `time_ms`.
 
+By default each metric is measured over 15 repetitions (one sample per
+repetition). Use `--repetitions N` to change this; `--repetitions 1` reproduces
+the original single-measurement behavior. Statistical comparison (see below)
+needs at least ~10 samples, so keep the default (or higher) when producing
+samples CSVs for `--stats`.
+
 ## Shared configuration
 
 Common benchmark settings live in `utils.py`.
@@ -75,3 +81,30 @@ python compare_results.py baseline.csv candidate.csv --bench-name GEMM
 
 The script auto-detects metric columns, computes speedups for overlapping rows,
 and reports rows that exist only in the baseline or only in the candidate.
+
+### Statistical comparison (`--stats`)
+
+The ratio comparison above uses point estimates and cannot tell a real
+regression from measurement noise. To test whether timing differences are
+statistically significant, run the benchmark with `--csv-samples` on both
+sides and compare the samples CSVs with `--stats`:
+
+```bash
+pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
+
+# baseline checkout
+python benchmark_gemm.py --csv-samples baseline_samples.csv --repetitions 15
+# candidate checkout
+python benchmark_gemm.py --csv-samples candidate_samples.csv --repetitions 15
+
+python compare_results.py baseline_samples.csv candidate_samples.csv --stats
+```
+
+`--stats` uses the [benchstats](https://github.com/Arech/benchstats) package to
+apply a Brunner-Munzel test (override with `--method`) at significance level
+`--alpha` (default `0.001`) to each `(config, label)` pair. It prints a table
+marking each benchmark as faster (`<`), slower (`>`), or not significantly
+different (`~`), and exits `1` when a significant difference in the timing
+metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`,
+`.txt`) to save the report, and `--always-show-pvalues` to show p-values for
+non-significant rows.
diff --git a/benchmarks/microbenchmarks/compare_results.py b/benchmarks/microbenchmarks/compare_results.py
index 4a7e1dab8..540a9e433 100755
--- a/benchmarks/microbenchmarks/compare_results.py
+++ b/benchmarks/microbenchmarks/compare_results.py
@@ -7,12 +7,22 @@
 """
 Compare two CSVs from the same benchmark suite.
 
-Auto-detects metric columns (containing "TFLOPS" or "GB/s") and key columns.
-Outputs a markdown <details> block to stdout with per-config results,
-and optionally appends a summary table row to --summary-file.
+Two modes:
+
+1. Default (ratio) mode: compares two *aggregate* CSVs. Auto-detects metric
+   columns (containing "TFLOPS" or "GB/s") and key columns, computes throughput
+   speedups, and emits a markdown <details> block (optionally appending a
+   summary row to --summary-file).
+
+2. --stats mode: compares two *samples* CSVs (written with --csv-samples) using
+   a statistical test (Brunner-Munzel by default) via the benchstats package.
+   Reports whether per-config timing differences are significant and exits 1
+   when any significant timing difference is found (for CI gating). Requires
+   ``pip install -r requirements.txt``.
 
 Usage:
     python compare_results.py baseline.csv candidate.csv --bench-name NAME --summary-file FILE
+    python compare_results.py base_samples.csv cand_samples.csv --stats
 """
 
 import argparse
@@ -49,6 +59,64 @@ def print_key_table(title, rows_df, key_cols):
     print()
 
 
+def run_stats(args):
+    """Compare two samples CSVs with a statistical test via benchstats.
+
+    Returns a process exit code: 1 if a significant difference is found in the
+    main metric (timing), else 0.
+    """
+    import os
+
+    import rich.table  # noqa: F401  benchstats 3.4.0 render uses rich.table.Table without importing it
+    from parser_TEsamples import parser_TEsamples
+    from benchstats.compare import compareStats
+    from benchstats.render import renderComparisonResults
+    from benchstats.common import LoggingConsole, detectExportFormat
+
+    metrics = ["time_ms"]
+
+    export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None
+    if export_fmt is not None and os.path.isfile(args.export_to):
+        os.remove(args.export_to)
+
+    console = LoggingConsole(
+        record=export_fmt is not None,
+        log_level=LoggingConsole.LogLevel.Warning,
+    )
+
+    s1 = parser_TEsamples(args.baseline_csv, None, metrics, debug_log=console).getStats()
+    s2 = parser_TEsamples(args.candidate_csv, None, metrics, debug_log=console).getStats()
+
+    cr = compareStats(
+        s1, s2,
+        method=args.method,
+        alpha=args.alpha,
+        main_metrics=metrics,
+        debug_log=console,
+    )
+
+    renderComparisonResults(
+        cr, console,
+        main_metrics=metrics,
+        always_show_pvalues=args.always_show_pvalues,
+    )
+
+    if export_fmt is not None:
+        if export_fmt == "txt":
+            console.save_text(args.export_to)
+        elif export_fmt == "svg":
+            console.save_svg(args.export_to, title="")
+        elif export_fmt == "html":
+            console.save_html(args.export_to)
+
+    if cr.at_least_one_differs:
+        console.warning(
+            "At least one significant timing difference was detected (exit 1)."
+        )
+        return 1
+    return 0
+
+
 def main():
     parser = argparse.ArgumentParser(description="Compare benchmark CSVs")
     parser.add_argument("baseline_csv", help="Baseline CSV")
@@ -67,8 +135,35 @@ def main():
             "Set to 0 to disable the filter."
         ),
     )
+
+    stats_group = parser.add_argument_group(
+        "statistical comparison (--stats mode; operates on --csv-samples CSVs)"
+    )
+    stats_group.add_argument(
+        "--stats", action="store_true",
+        help="Compare per-sample CSVs with a statistical test via benchstats.",
+    )
+    stats_group.add_argument(
+        "--alpha", type=float, default=0.001,
+        help="Significance level for the test (default: 0.001).",
+    )
+    stats_group.add_argument(
+        "--method", default="brunnermunzel",
+        help="Statistical test to use (default: brunnermunzel).",
+    )
+    stats_group.add_argument(
+        "--always-show-pvalues", action="store_true",
+        help="Always show p-values, including for non-significant results.",
+    )
+    stats_group.add_argument(
+        "--export-to", default=None, metavar="FILE",
+        help="Export the report to a .txt/.svg/.html file (format from extension).",
+    )
     args = parser.parse_args()
 
+    if args.stats:
+        return run_stats(args)
+
     baseline_df = pd.read_csv(args.baseline_csv)
     candidate_df = pd.read_csv(args.candidate_csv)
 
diff --git a/benchmarks/microbenchmarks/parser_TEsamples.py b/benchmarks/microbenchmarks/parser_TEsamples.py
new file mode 100644
index 000000000..dc6a07c0e
--- /dev/null
+++ b/benchmarks/microbenchmarks/parser_TEsamples.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""benchstats parser for Transformer Engine microbenchmark samples CSVs.
+
+Reads the per-sample CSV produced by ``run_benchmarks(... )`` with the
+``--csv-samples`` flag (columns: the benchmark parameter columns plus
+``label``, ``sample_idx``, ``time_ms``) and turns it into the
+``{benchmark_name: {metric: ndarray}}`` structure consumed by
+``benchstats.compare.compareStats``.
+
+A benchmark name is built by joining every column except ``sample_idx`` and the
+metric column, so each unique (parameters, label) combination becomes one
+benchmark. The single available metric is ``time_ms`` (lower is better).
+
+The class name matches the file name (``parser_TEsamples``) so it can also be
+loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``.
+"""
+
+import re
+
+import numpy as np
+import pandas as pd
+
+from benchstats.common import ParserBase, LoggingConsole
+
+_METRIC = "time_ms"
+_NON_NAME_COLS = ("sample_idx", _METRIC)
+_NAME_DELIM = " | "
+
+
+class parser_TEsamples(ParserBase):
+    def __init__(self, csv_file_path, filter, metrics, debug_log=True) -> None:
+        assert isinstance(csv_file_path, str)
+        assert filter is None or isinstance(filter, (str, re.Pattern))
+        assert isinstance(metrics, (list, tuple)) and len(metrics) > 0
+        assert all(isinstance(m, str) for m in metrics)
+
+        if debug_log is None or (isinstance(debug_log, bool) and not debug_log):
+            self.debug_log = False
+        elif isinstance(debug_log, bool) and debug_log:
+            self.debug_log = True
+            self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug)
+        else:
+            self.debug_log = True
+            self.logger = debug_log
+
+        unsupported = [m for m in metrics if m != _METRIC]
+        if unsupported:
+            raise ValueError(
+                f"parser_TEsamples only supports the '{_METRIC}' metric, got: {unsupported}. "
+                "The samples CSV produced by --csv-samples carries per-run times only."
+            )
+
+        self.file = csv_file_path
+        self.filter = filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
+        self._stats = self._build()
+
+    def getStats(self) -> dict[str, dict[str, np.ndarray]]:
+        return self._stats
+
+    def _build(self) -> dict[str, dict[str, np.ndarray]]:
+        df = pd.read_csv(self.file)
+
+        for col in _NON_NAME_COLS:
+            if col not in df.columns:
+                raise ValueError(
+                    f"'{col}' column not found in '{self.file}'. Was the CSV written with "
+                    "--csv-samples?"
+                )
+
+        name_cols = [c for c in df.columns if c not in _NON_NAME_COLS]
+        if not name_cols:
+            raise ValueError(f"No benchmark-name columns found in '{self.file}'.")
+
+        df[_METRIC] = pd.to_numeric(df[_METRIC], errors="coerce")
+
+        stats: dict[str, dict[str, np.ndarray]] = {}
+        for key_vals, group in df.groupby(name_cols, sort=False):
+            if not isinstance(key_vals, tuple):
+                key_vals = (key_vals,)
+            bm_name = _NAME_DELIM.join(str(v) for v in key_vals)
+
+            if self.filter is not None and self.filter.search(bm_name) is None:
+                continue
+
+            samples = group[_METRIC].to_numpy(dtype=np.float64)
+            samples = samples[np.isfinite(samples)]
+            if samples.size == 0:
+                if self.debug_log:
+                    self.logger.warning(
+                        f"parser_TEsamples: benchmark '{bm_name}' has no finite samples; skipping."
+                    )
+                continue
+            if self.debug_log and samples.size < 10:
+                self.logger.warning(
+                    f"parser_TEsamples: benchmark '{bm_name}' has only {samples.size} samples "
+                    "(>= 10 recommended). Re-run the benchmark with a larger --repetitions."
+                )
+            stats[bm_name] = {_METRIC: samples}
+
+        if not stats and self.debug_log:
+            self.logger.warning(f"parser_TEsamples: no benchmarks read from '{self.file}'.")
+        return stats
diff --git a/benchmarks/microbenchmarks/requirements.txt b/benchmarks/microbenchmarks/requirements.txt
new file mode 100644
index 000000000..32aa6a635
--- /dev/null
+++ b/benchmarks/microbenchmarks/requirements.txt
@@ -0,0 +1,3 @@
+# Extra dependencies for statistical benchmark comparison
+# (compare_results.py --stats). benchstats pulls in rich, scipy and numpy.
+benchstats>=3.4
diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py
index 99ae6a244..825a08924 100644
--- a/benchmarks/microbenchmarks/utils.py
+++ b/benchmarks/microbenchmarks/utils.py
@@ -21,6 +21,12 @@
 
 DEFAULT_MIN_RUN_TIME_SECONDS = 0.2
 
+# Number of repetitions used by ``time_func`` when a caller passes
+# ``repetitions=None``. ``run_benchmarks`` sets this from the ``--repetitions``
+# CLI flag so every benchmark script inherits the knob without per-script edits.
+# ``None`` (the default) preserves the original single-measurement behavior.
+_ACTIVE_REPETITIONS = None
+
 # ---------------------------------------------------------------------------
 # Model configurations
 # ---------------------------------------------------------------------------
@@ -88,22 +94,54 @@ def generate_gemm_test_cases(configs=None, m_sizes=None, dtypes=None):
 # Timing helpers
 # ---------------------------------------------------------------------------
 
-def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS):
+class _RepeatedMeasurement:
+    """Minimal ``torch...benchmark.Measurement`` stand-in for repeated runs.
+
+    Exposes the ``times`` / ``number_per_run`` / ``mean`` attributes the
+    samples-CSV writer relies on. ``times`` holds one per-run mean (in seconds)
+    for each repetition, so ``--csv-samples`` emits one row per repetition.
+    """
+
+    def __init__(self, times, mean):
+        self.times = times
+        self.number_per_run = 1
+        self.mean = mean
+
+
+def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS,
+              repetitions=None):
     """Time *fn* and return ``(mean_ms, measurement)``.
 
-    The ``Measurement`` object carries per-sample times accessible via
-    ``measurement.times`` (total wall time per run) and
-    ``measurement.number_per_run``.
+    The returned measurement carries per-sample times accessible via
+    ``measurement.times`` and ``measurement.number_per_run``.
 
     method: "adaptive" uses adaptive_autorange (good for compute-bound),
             "blocked"  uses blocked_autorange  (good for memory-bound).
+
+    repetitions: number of independent autorange measurements to collect. Each
+        repetition contributes one per-run mean to ``measurement.times`` (the
+        form expected by statistical comparison via ``compare_results.py
+        --stats``). ``None`` falls back to the module-level ``_ACTIVE_REPETITIONS``
+        (set from ``--repetitions``); a value <= 1 reproduces the original
+        single-measurement behavior and returns the native ``Measurement``.
     """
+    if repetitions is None:
+        repetitions = _ACTIVE_REPETITIONS
+
     timer = benchmark.Timer(stmt="fn()", globals={"fn": fn})
-    if method == "blocked":
-        m = timer.blocked_autorange(min_run_time=min_run_time)
-    else:
-        m = timer.adaptive_autorange(min_run_time=min_run_time)
-    return m.mean * 1e3, m
+
+    def _measure():
+        if method == "blocked":
+            return timer.blocked_autorange(min_run_time=min_run_time)
+        return timer.adaptive_autorange(min_run_time=min_run_time)
+
+    if repetitions is None or repetitions <= 1:
+        m = _measure()
+        return m.mean * 1e3, m
+
+    means = [_measure().mean for _ in range(repetitions)]  # per-run seconds
+    mean_s = sum(means) / len(means)
+    return mean_s * 1e3, _RepeatedMeasurement(times=means, mean=mean_s)
 
 
 # ---------------------------------------------------------------------------
@@ -266,6 +304,15 @@ def make_parser(**kwargs):
             "Optional filename; default derived from script name."
         ),
     )
+    parser.add_argument(
+        "--repetitions", type=int, default=15, metavar="N",
+        help=(
+            "Number of independent timing repetitions per metric. Each "
+            "repetition yields one sample for statistical comparison "
+            "(compare_results.py --stats). Use 1 for the original "
+            "single-measurement behavior. Default: 15."
+        ),
+    )
     return parser
 
 
@@ -301,6 +348,11 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None,
     if args is None:
         args = make_parser().parse_args()
 
+    # Let time_func (called by bench_fns without an explicit repetitions arg)
+    # inherit the CLI value without editing every benchmark script.
+    global _ACTIVE_REPETITIONS
+    _ACTIVE_REPETITIONS = getattr(args, "repetitions", None)
+
     rows = []
     all_case_metrics = []
     resolved_metric_columns = None

From e592a1cbb4f222c833f5e955fbf74bd2210a9be5 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Fri, 5 Jun 2026 19:36:52 +0000
Subject: [PATCH 2/2] Show throughput in --stats and ensure >10 raw samples

Refines the benchstats integration:

- compare_results.py --stats now reports calculated throughput/bandwidth
  (TFLOPS / GB/s) alongside timing. Time (seconds) is the main metric and
  drives the exit code; throughput is a secondary metric. The throughput
  unit suffix is blanked (the column header names the unit) and time is fed
  in seconds so benchstats' auto-scaling (ms/us/ns) is correct.
- utils.py: --csv-samples now also writes per-sample throughput + unit
  columns, derived from the sample time (blank for samples-only records).
- parser_TEsamples.py exposes time_s (main) and a unit-keyed throughput
  metric, dropping throughput-less composite rows so the metric set stays
  uniform for benchstats' renderer.
- utils.py: replace --repetitions with --min-samples (default 12). Instead
  of re-running autorange and averaging, time_func keeps the raw per-block
  samples and tops up any shortfall with additional equal-sized blocks.
  Also fix the samples writer to not re-divide already-per-run times.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/microbenchmarks/README.md          |  25 ++-
 benchmarks/microbenchmarks/compare_results.py |  23 ++-
 .../microbenchmarks/parser_TEsamples.py       | 164 +++++++++++++-----
 benchmarks/microbenchmarks/utils.py           | 113 +++++++-----
 4 files changed, 224 insertions(+), 101 deletions(-)

diff --git a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md
index 67e5ccc2a..40f7a1eed 100644
--- a/benchmarks/microbenchmarks/README.md
+++ b/benchmarks/microbenchmarks/README.md
@@ -29,13 +29,16 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv
 ```
 
 The samples CSV contains one row per timing sample with columns for all
-benchmark parameters plus `label`, `sample_idx`, and `time_ms`.
+benchmark parameters plus `label`, `sample_idx`, `time_ms`, `throughput`, and
+`unit`. The per-sample `throughput` is derived from the sample time (it is blank
+for samples-only records such as `Forward+Backward`, which carry no throughput).
 
-By default each metric is measured over 15 repetitions (one sample per
-repetition). Use `--repetitions N` to change this; `--repetitions 1` reproduces
-the original single-measurement behavior. Statistical comparison (see below)
-needs at least ~10 samples, so keep the default (or higher) when producing
-samples CSVs for `--stats`.
+torch's autorange records only a few raw timing blocks per metric. To get
+enough samples for statistical comparison, `--min-samples N` (default 12)
+ensures at least `N` raw blocks are recorded, topping up any shortfall with
+additional equal-sized blocks rather than re-averaging the whole measurement.
+Statistical comparison (see below) needs at least ~10 samples, so keep the
+default (or higher) when producing samples CSVs for `--stats`.
 
 ## Shared configuration
 
@@ -93,9 +96,9 @@ sides and compare the samples CSVs with `--stats`:
 pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
 
 # baseline checkout
-python benchmark_gemm.py --csv-samples baseline_samples.csv --repetitions 15
+python benchmark_gemm.py --csv-samples baseline_samples.csv --min-samples 12
 # candidate checkout
-python benchmark_gemm.py --csv-samples candidate_samples.csv --repetitions 15
+python benchmark_gemm.py --csv-samples candidate_samples.csv --min-samples 12
 
 python compare_results.py baseline_samples.csv candidate_samples.csv --stats
 ```
@@ -108,3 +111,9 @@ different (`~`), and exits `1` when a significant difference in the timing
 metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`,
 `.txt`) to save the report, and `--always-show-pvalues` to show p-values for
 non-significant rows.
+
+Time (the main metric, which drives the exit code) is shown alongside the
+calculated throughput/bandwidth (`TFLOPS` or `GB/s`), reported as a secondary
+metric. Because the table requires a uniform metric set, samples-only composite
+rows that carry no throughput (e.g. `Forward+Backward`) are omitted from the
+comparison; their raw samples remain in the CSV.
diff --git a/benchmarks/microbenchmarks/compare_results.py b/benchmarks/microbenchmarks/compare_results.py
index 540a9e433..1c231af86 100755
--- a/benchmarks/microbenchmarks/compare_results.py
+++ b/benchmarks/microbenchmarks/compare_results.py
@@ -62,8 +62,12 @@ def print_key_table(title, rows_df, key_cols):
 def run_stats(args):
     """Compare two samples CSVs with a statistical test via benchstats.
 
+    Timing (``time_s``, converted from the CSV ``time_ms`` column) is the main
+    metric and drives the exit code; throughput / bandwidth (``TFLOPS`` /
+    ``GB/s``) is reported as a secondary metric when present in the CSV.
+
     Returns a process exit code: 1 if a significant difference is found in the
-    main metric (timing), else 0.
+    main (timing) metric, else 0.
     """
     import os
 
@@ -73,7 +77,7 @@ def run_stats(args):
     from benchstats.render import renderComparisonResults
     from benchstats.common import LoggingConsole, detectExportFormat
 
-    metrics = ["time_ms"]
+    main_metrics = ["time_s"]
 
     export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None
     if export_fmt is not None and os.path.isfile(args.export_to):
@@ -84,20 +88,27 @@ def run_stats(args):
         log_level=LoggingConsole.LogLevel.Warning,
     )
 
-    s1 = parser_TEsamples(args.baseline_csv, None, metrics, debug_log=console).getStats()
-    s2 = parser_TEsamples(args.candidate_csv, None, metrics, debug_log=console).getStats()
+    # metrics=None exposes every metric the CSV carries (time + throughput).
+    s1 = parser_TEsamples(args.baseline_csv, None, None, debug_log=console).getStats()
+    s2 = parser_TEsamples(args.candidate_csv, None, None, debug_log=console).getStats()
 
     cr = compareStats(
         s1, s2,
         method=args.method,
         alpha=args.alpha,
-        main_metrics=metrics,
+        main_metrics=main_metrics,
         debug_log=console,
     )
 
+    # Throughput metrics (e.g. TFLOPS / GB/s) are not times; blank benchstats'
+    # default per-value "s" suffix for them (the column header names the unit).
+    secondary = [m for m in cr.getMetrics() if m not in main_metrics]
+    style_overrides = {f"metric_{m}_unit": "" for m in secondary}
+
     renderComparisonResults(
         cr, console,
-        main_metrics=metrics,
+        main_metrics=main_metrics,
+        style_overrides=style_overrides or None,
         always_show_pvalues=args.always_show_pvalues,
     )
 
diff --git a/benchmarks/microbenchmarks/parser_TEsamples.py b/benchmarks/microbenchmarks/parser_TEsamples.py
index dc6a07c0e..44d472158 100644
--- a/benchmarks/microbenchmarks/parser_TEsamples.py
+++ b/benchmarks/microbenchmarks/parser_TEsamples.py
@@ -6,15 +6,27 @@
 ###############################################################################
 """benchstats parser for Transformer Engine microbenchmark samples CSVs.
 
-Reads the per-sample CSV produced by ``run_benchmarks(... )`` with the
-``--csv-samples`` flag (columns: the benchmark parameter columns plus
-``label``, ``sample_idx``, ``time_ms``) and turns it into the
-``{benchmark_name: {metric: ndarray}}`` structure consumed by
-``benchstats.compare.compareStats``.
+Reads the per-sample CSV produced by ``run_benchmarks(...)`` with the
+``--csv-samples`` flag and turns it into the ``{benchmark_name: {metric: ndarray}}``
+structure consumed by ``benchstats.compare.compareStats``.
 
-A benchmark name is built by joining every column except ``sample_idx`` and the
-metric column, so each unique (parameters, label) combination becomes one
-benchmark. The single available metric is ``time_ms`` (lower is better).
+Columns of the samples CSV: the benchmark parameter columns, plus ``label``,
+``sample_idx``, ``time_ms``, ``throughput`` and ``unit``. A benchmark name is
+built by joining every parameter column and ``label``, so each unique
+(parameters, label) combination becomes one benchmark.
+
+Two metrics are exposed:
+
+- ``time_s`` (seconds, lower is better) -- always present; intended as the
+  *main* metric. Exposed in seconds because benchstats' renderer auto-scales
+  time values (to ms/us/ns) assuming a seconds base unit.
+- the throughput metric, keyed by its unit (e.g. ``TFLOPS`` or ``GB/s``; higher
+  is better) -- present when the CSV carries throughput values.
+
+``benchstats``' renderer requires every benchmark to expose the same metric set.
+Records without throughput (the samples-only ``Forward+Backward`` composites) are
+therefore dropped from the comparison when throughput is available for the other
+benchmarks; their raw samples remain in the CSV for other downstream analysis.
 
 The class name matches the file name (``parser_TEsamples``) so it can also be
 loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``.
@@ -27,17 +39,22 @@
 
 from benchstats.common import ParserBase, LoggingConsole
 
-_METRIC = "time_ms"
-_NON_NAME_COLS = ("sample_idx", _METRIC)
+_TIME_COL = "time_ms"          # column name in the samples CSV (milliseconds)
+_TIME_KEY = "time_s"           # metric key exposed to benchstats (seconds)
+_THR_COL = "throughput"
+_UNIT_COL = "unit"
+_GENERIC_THR = "throughput"
+_NON_NAME_COLS = ("sample_idx", _TIME_COL, _THR_COL, _UNIT_COL)
 _NAME_DELIM = " | "
 
 
 class parser_TEsamples(ParserBase):
-    def __init__(self, csv_file_path, filter, metrics, debug_log=True) -> None:
+    def __init__(self, csv_file_path, filter, metrics=None, debug_log=True) -> None:
         assert isinstance(csv_file_path, str)
         assert filter is None or isinstance(filter, (str, re.Pattern))
-        assert isinstance(metrics, (list, tuple)) and len(metrics) > 0
-        assert all(isinstance(m, str) for m in metrics)
+        assert metrics is None or (
+            isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics)
+        )
 
         if debug_log is None or (isinstance(debug_log, bool) and not debug_log):
             self.debug_log = False
@@ -48,37 +65,40 @@ def __init__(self, csv_file_path, filter, metrics, debug_log=True) -> None:
             self.debug_log = True
             self.logger = debug_log
 
-        unsupported = [m for m in metrics if m != _METRIC]
-        if unsupported:
-            raise ValueError(
-                f"parser_TEsamples only supports the '{_METRIC}' metric, got: {unsupported}. "
-                "The samples CSV produced by --csv-samples carries per-run times only."
-            )
-
         self.file = csv_file_path
-        self.filter = filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
+        self.filter = (
+            filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
+        )
+        self._requested_metrics = list(metrics) if metrics is not None else None
         self._stats = self._build()
 
     def getStats(self) -> dict[str, dict[str, np.ndarray]]:
         return self._stats
 
+    def _log(self, level, msg):
+        if self.debug_log:
+            getattr(self.logger, level)(f"parser_TEsamples: {msg}")
+
     def _build(self) -> dict[str, dict[str, np.ndarray]]:
         df = pd.read_csv(self.file)
 
-        for col in _NON_NAME_COLS:
-            if col not in df.columns:
-                raise ValueError(
-                    f"'{col}' column not found in '{self.file}'. Was the CSV written with "
-                    "--csv-samples?"
-                )
+        if _TIME_COL not in df.columns or "sample_idx" not in df.columns:
+            raise ValueError(
+                f"'{self.file}' is missing 'time_ms'/'sample_idx' columns. "
+                "Was it written with --csv-samples?"
+            )
 
         name_cols = [c for c in df.columns if c not in _NON_NAME_COLS]
         if not name_cols:
             raise ValueError(f"No benchmark-name columns found in '{self.file}'.")
 
-        df[_METRIC] = pd.to_numeric(df[_METRIC], errors="coerce")
+        df[_TIME_COL] = pd.to_numeric(df[_TIME_COL], errors="coerce")
+        has_thr_col = _THR_COL in df.columns
+        if has_thr_col:
+            df[_THR_COL] = pd.to_numeric(df[_THR_COL], errors="coerce")
 
-        stats: dict[str, dict[str, np.ndarray]] = {}
+        # First pass: collect per-benchmark time samples, throughput samples and unit.
+        per_bm = {}  # name -> {"time": ndarray, "thr": ndarray|None, "unit": str|None}
         for key_vals, group in df.groupby(name_cols, sort=False):
             if not isinstance(key_vals, tuple):
                 key_vals = (key_vals,)
@@ -87,21 +107,77 @@ def _build(self) -> dict[str, dict[str, np.ndarray]]:
             if self.filter is not None and self.filter.search(bm_name) is None:
                 continue
 
-            samples = group[_METRIC].to_numpy(dtype=np.float64)
-            samples = samples[np.isfinite(samples)]
-            if samples.size == 0:
-                if self.debug_log:
-                    self.logger.warning(
-                        f"parser_TEsamples: benchmark '{bm_name}' has no finite samples; skipping."
-                    )
+            time_ms = group[_TIME_COL].to_numpy(dtype=np.float64)
+            time_ms = time_ms[np.isfinite(time_ms)]
+            if time_ms.size == 0:
+                self._log("warning", f"benchmark '{bm_name}' has no finite time samples; skipping.")
                 continue
-            if self.debug_log and samples.size < 10:
-                self.logger.warning(
-                    f"parser_TEsamples: benchmark '{bm_name}' has only {samples.size} samples "
-                    "(>= 10 recommended). Re-run the benchmark with a larger --repetitions."
+            # benchstats' renderer auto-scales assuming seconds, so expose seconds.
+            time_s = time_ms / 1e3
+
+            thr_s, unit = None, None
+            if has_thr_col:
+                thr_s = group[_THR_COL].to_numpy(dtype=np.float64)
+                thr_s = thr_s[np.isfinite(thr_s) & (thr_s > 0)]
+                if thr_s.size == 0:
+                    thr_s = None
+                else:
+                    units = [u for u in group[_UNIT_COL].astype(str).unique()
+                             if u and u.lower() != "nan"] if _UNIT_COL in df.columns else []
+                    unit = units[0] if len(units) == 1 else (_GENERIC_THR if units else None)
+
+            if self.debug_log and time_s.size < 10:
+                self._log(
+                    "warning",
+                    f"benchmark '{bm_name}' has only {time_s.size} samples (>= 10 recommended); "
+                    "re-run with a larger --min-samples.",
                 )
-            stats[bm_name] = {_METRIC: samples}
-
-        if not stats and self.debug_log:
-            self.logger.warning(f"parser_TEsamples: no benchmarks read from '{self.file}'.")
+            per_bm[bm_name] = {"time": time_s, "thr": thr_s, "unit": unit}
+
+        if not per_bm:
+            self._log("warning", f"no benchmarks read from '{self.file}'.")
+            return {}
+
+        # Decide on a uniform metric set across all benchmarks.
+        with_thr = {n: d for n, d in per_bm.items() if d["thr"] is not None}
+        thr_key = self._resolve_throughput_key(with_thr)
+
+        if thr_key is not None and 0 < len(with_thr) < len(per_bm):
+            dropped = sorted(set(per_bm) - set(with_thr))
+            self._log(
+                "warning",
+                f"excluding {len(dropped)} benchmark(s) without throughput from the comparison so "
+                f"throughput can be shown uniformly: {', '.join(dropped)}",
+            )
+            per_bm = with_thr
+
+        # Build the result, honoring an explicit metric request if given.
+        stats = {}
+        for bm_name, d in per_bm.items():
+            entry = {}
+            if self._metric_requested(_TIME_KEY, thr_key):
+                entry[_TIME_KEY] = d["time"]
+            if thr_key is not None and d["thr"] is not None and self._metric_requested(thr_key, thr_key):
+                entry[thr_key] = d["thr"]
+            if entry:
+                stats[bm_name] = entry
         return stats
+
+    def _resolve_throughput_key(self, with_thr):
+        """Return a single throughput metric key shared by all throughput-bearing benchmarks."""
+        if not with_thr:
+            return None
+        units = {d["unit"] for d in with_thr.values()}
+        if len(units) == 1:
+            return next(iter(units)) or _GENERIC_THR
+        return _GENERIC_THR  # mixed units in one file (atypical) -> generic header
+
+    def _metric_requested(self, key, thr_key):
+        """Honor an explicit metrics= request (benchstats CLI), else expose everything."""
+        if self._requested_metrics is None:
+            return True
+        req = self._requested_metrics
+        if key == _TIME_KEY:
+            return any(t in req for t in (_TIME_KEY, _TIME_COL, "time"))
+        # throughput: match the unit key, the generic name, or literal 'throughput'
+        return key in req or _GENERIC_THR in req or _THR_COL in req
diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py
index 825a08924..929019ee8 100644
--- a/benchmarks/microbenchmarks/utils.py
+++ b/benchmarks/microbenchmarks/utils.py
@@ -21,11 +21,11 @@
 
 DEFAULT_MIN_RUN_TIME_SECONDS = 0.2
 
-# Number of repetitions used by ``time_func`` when a caller passes
-# ``repetitions=None``. ``run_benchmarks`` sets this from the ``--repetitions``
-# CLI flag so every benchmark script inherits the knob without per-script edits.
-# ``None`` (the default) preserves the original single-measurement behavior.
-_ACTIVE_REPETITIONS = None
+# Minimum number of raw timing samples (blocks) ``time_func`` ensures when a
+# caller passes ``min_samples=None``. ``run_benchmarks`` sets this from the
+# ``--min-samples`` CLI flag so every benchmark script inherits the knob without
+# per-script edits. ``None`` leaves torch's autorange result untouched.
+_ACTIVE_MIN_SAMPLES = None
 
 # ---------------------------------------------------------------------------
 # Model configurations
@@ -94,54 +94,60 @@ def generate_gemm_test_cases(configs=None, m_sizes=None, dtypes=None):
 # Timing helpers
 # ---------------------------------------------------------------------------
 
-class _RepeatedMeasurement:
-    """Minimal ``torch...benchmark.Measurement`` stand-in for repeated runs.
+class _RawSamples:
+    """Minimal ``torch.utils.benchmark.Measurement`` stand-in holding raw block times.
 
-    Exposes the ``times`` / ``number_per_run`` / ``mean`` attributes the
-    samples-CSV writer relies on. ``times`` holds one per-run mean (in seconds)
-    for each repetition, so ``--csv-samples`` emits one row per repetition.
+    Exposes the ``times`` (per-run seconds, one entry per recorded timing block),
+    ``number_per_run`` and ``mean`` attributes that ``time_func`` callers and the
+    samples-CSV writer rely on.
     """
 
-    def __init__(self, times, mean):
+    def __init__(self, times, number_per_run, mean):
         self.times = times
-        self.number_per_run = 1
+        self.number_per_run = number_per_run
         self.mean = mean
 
 
 def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS,
-              repetitions=None):
+              min_samples=None):
     """Time *fn* and return ``(mean_ms, measurement)``.
 
-    The returned measurement carries per-sample times accessible via
-    ``measurement.times`` and ``measurement.number_per_run``.
+    The returned measurement exposes per-run sample times via
+    ``measurement.times`` -- one entry per recorded timing block (each block is
+    an average over ``measurement.number_per_run`` executions, as chosen by
+    torch to amortize timer overhead).
 
     method: "adaptive" uses adaptive_autorange (good for compute-bound),
             "blocked"  uses blocked_autorange  (good for memory-bound).
 
-    repetitions: number of independent autorange measurements to collect. Each
-        repetition contributes one per-run mean to ``measurement.times`` (the
-        form expected by statistical comparison via ``compare_results.py
-        --stats``). ``None`` falls back to the module-level ``_ACTIVE_REPETITIONS``
-        (set from ``--repetitions``); a value <= 1 reproduces the original
-        single-measurement behavior and returns the native ``Measurement``.
+    min_samples: ensure at least this many raw timing blocks are recorded, so the
+        per-sample data is large enough for statistical comparison
+        (compare_results.py --stats). torch's autorange usually records only a
+        few blocks; any shortfall is topped up with additional equal-sized blocks
+        rather than re-running and re-averaging the whole measurement. ``None``
+        falls back to the module-level ``_ACTIVE_MIN_SAMPLES`` (set from
+        ``--min-samples``); ``None`` there too leaves the autorange result as-is.
     """
-    if repetitions is None:
-        repetitions = _ACTIVE_REPETITIONS
+    if min_samples is None:
+        min_samples = _ACTIVE_MIN_SAMPLES
 
     timer = benchmark.Timer(stmt="fn()", globals={"fn": fn})
+    if method == "blocked":
+        m = timer.blocked_autorange(min_run_time=min_run_time)
+    else:
+        m = timer.adaptive_autorange(min_run_time=min_run_time)
 
-    def _measure():
-        if method == "blocked":
-            return timer.blocked_autorange(min_run_time=min_run_time)
-        return timer.adaptive_autorange(min_run_time=min_run_time)
-
-    if repetitions is None or repetitions <= 1:
-        m = _measure()
+    if min_samples is None or len(m.times) >= min_samples:
         return m.mean * 1e3, m
 
-    means = [_measure().mean for _ in range(repetitions)]  # per-run seconds
-    mean_s = sum(means) / len(means)
-    return mean_s * 1e3, _RepeatedMeasurement(times=means, mean=mean_s)
+    # Top up with additional equal-sized blocks (each timeit() records one block
+    # averaged over number_per_run runs) until enough raw samples are collected.
+    times = list(m.times)  # per-run seconds
+    number = m.number_per_run
+    while len(times) < min_samples:
+        times.append(timer.timeit(number).mean)
+    mean_s = sum(times) / len(times)
+    return mean_s * 1e3, _RawSamples(times=times, number_per_run=number, mean=mean_s)
 
 
 # ---------------------------------------------------------------------------
@@ -305,12 +311,13 @@ def make_parser(**kwargs):
         ),
     )
     parser.add_argument(
-        "--repetitions", type=int, default=15, metavar="N",
+        "--min-samples", type=int, default=12, metavar="N",
         help=(
-            "Number of independent timing repetitions per metric. Each "
-            "repetition yields one sample for statistical comparison "
-            "(compare_results.py --stats). Use 1 for the original "
-            "single-measurement behavior. Default: 15."
+            "Ensure at least N raw timing samples (blocks) are recorded per "
+            "metric for statistical comparison (compare_results.py --stats). "
+            "torch's autorange records only a few; any shortfall is topped up "
+            "with additional equal-sized blocks. Use a small value (e.g. 2) to "
+            "effectively disable top-up. Default: 12."
         ),
     )
     return parser
@@ -348,10 +355,10 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None,
     if args is None:
         args = make_parser().parse_args()
 
-    # Let time_func (called by bench_fns without an explicit repetitions arg)
+    # Let time_func (called by bench_fns without an explicit min_samples arg)
     # inherit the CLI value without editing every benchmark script.
-    global _ACTIVE_REPETITIONS
-    _ACTIVE_REPETITIONS = getattr(args, "repetitions", None)
+    global _ACTIVE_MIN_SAMPLES
+    _ACTIVE_MIN_SAMPLES = getattr(args, "min_samples", None)
 
     rows = []
     all_case_metrics = []
@@ -407,16 +414,36 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None,
                 if measurement is None:
                     continue
                 lbl = metric["label"]
+                unit = metric.get("unit")
+                thr_mean = metric.get("throughput") or 0.0
+                ms_mean = metric.get("ms") or 0.0
+                # Throughput is a deterministic function of time for a given
+                # config (throughput = C / time), so a per-sample throughput is
+                # recovered from the aggregate as thr_mean * ms_mean / sample_ms.
+                # samples_only records (e.g. Forward+Backward) carry no
+                # throughput and are left blank.
+                has_thr = (
+                    not metric.get("samples_only") and thr_mean > 0 and ms_mean > 0
+                )
                 for i, t in enumerate(measurement.times):
+                    # measurement.times entries are already per-run (seconds).
+                    sample_ms = t * 1e3
                     sr = dict(case_params)
                     sr["label"] = lbl
                     sr["sample_idx"] = i
-                    sr["time_ms"] = t / measurement.number_per_run * 1e3
+                    sr["time_ms"] = sample_ms
+                    sr["throughput"] = (
+                        thr_mean * ms_mean / sample_ms
+                        if has_thr and sample_ms > 0
+                        else ""
+                    )
+                    sr["unit"] = unit if has_thr else ""
                     sample_rows.append(sr)
         if sample_rows:
             df = pd.DataFrame(
                 sample_rows,
-                columns=param_columns + ["label", "sample_idx", "time_ms"],
+                columns=param_columns
+                + ["label", "sample_idx", "time_ms", "throughput", "unit"],
             )
             df.to_csv(samples_csv, index=False)
             print(f"Samples saved to {samples_csv}")