Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion benchmarks/microbenchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,16 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv
```

The samples CSV contains one row per timing sample with columns for all
benchmark parameters plus `label`, `sample_idx`, and `time_ms`.
benchmark parameters plus `label`, `sample_idx`, `time_ms`, `throughput`, and
`unit`. The per-sample `throughput` is derived from the sample time (it is blank
for samples-only records such as `Forward+Backward`, which carry no throughput).

torch's autorange records only a few raw timing blocks per metric. To get
enough samples for statistical comparison, `--min-samples N` (default 12)
ensures at least `N` raw blocks are recorded, topping up any shortfall with
additional equal-sized blocks rather than re-averaging the whole measurement.
Statistical comparison (see below) needs at least ~10 samples, so keep the
default (or higher) when producing samples CSVs for `--stats`.

## Shared configuration

Expand Down Expand Up @@ -75,3 +84,36 @@ python compare_results.py baseline.csv candidate.csv --bench-name GEMM

The script auto-detects metric columns, computes speedups for overlapping rows,
and reports rows that exist only in the baseline or only in the candidate.

### Statistical comparison (`--stats`)

The ratio comparison above uses point estimates and cannot tell a real
regression from measurement noise. To test whether timing differences are
statistically significant, run the benchmark with `--csv-samples` on both
sides and compare the samples CSVs with `--stats`:

```bash
pip install -r requirements.txt # benchstats (pulls rich, scipy, numpy)

# baseline checkout
python benchmark_gemm.py --csv-samples baseline_samples.csv --min-samples 12
# candidate checkout
python benchmark_gemm.py --csv-samples candidate_samples.csv --min-samples 12

python compare_results.py baseline_samples.csv candidate_samples.csv --stats
```

`--stats` uses the [benchstats](https://github.com/Arech/benchstats) package to
apply a Brunner-Munzel test (override with `--method`) at significance level
`--alpha` (default `0.001`) to each `(config, label)` pair. It prints a table
marking each benchmark as faster (`<`), slower (`>`), or not significantly
different (`~`), and exits `1` when a significant difference in the timing
metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`,
`.txt`) to save the report, and `--always-show-pvalues` to show p-values for
non-significant rows.

Time (the main metric, which drives the exit code) is shown alongside the
calculated throughput/bandwidth (`TFLOPS` or `GB/s`), reported as a secondary
metric. Because the table requires a uniform metric set, samples-only composite
rows that carry no throughput (e.g. `Forward+Backward`) are omitted from the
comparison; their raw samples remain in the CSV.
112 changes: 109 additions & 3 deletions benchmarks/microbenchmarks/compare_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,22 @@
"""
Compare two CSVs from the same benchmark suite.

Auto-detects metric columns (containing "TFLOPS" or "GB/s") and key columns.
Outputs a markdown <details> block to stdout with per-config results,
and optionally appends a summary table row to --summary-file.
Two modes:

1. Default (ratio) mode: compares two *aggregate* CSVs. Auto-detects metric
columns (containing "TFLOPS" or "GB/s") and key columns, computes throughput
speedups, and emits a markdown <details> block (optionally appending a
summary row to --summary-file).

2. --stats mode: compares two *samples* CSVs (written with --csv-samples) using
a statistical test (Brunner-Munzel by default) via the benchstats package.
Reports whether per-config timing differences are significant and exits 1
when a significant regression is found (for CI gating). Requires
``pip install -r requirements.txt``.

Usage:
python compare_results.py baseline.csv candidate.csv --bench-name NAME --summary-file FILE
python compare_results.py base_samples.csv cand_samples.csv --stats
"""

import argparse
Expand Down Expand Up @@ -49,6 +59,75 @@ def print_key_table(title, rows_df, key_cols):
print()


def run_stats(args):
"""Compare two samples CSVs with a statistical test via benchstats.

Timing (``time_ms``) is the main metric and drives the exit code; throughput
/ bandwidth (``TFLOPS`` / ``GB/s``) is reported as a secondary metric when
present in the CSV.

Returns a process exit code: 1 if a significant difference is found in the
main (timing) metric, else 0.
"""
import os

import rich.table # noqa: F401 benchstats 3.4.0 render uses rich.table.Table without importing it
from parser_TEsamples import parser_TEsamples
from benchstats.compare import compareStats
from benchstats.render import renderComparisonResults
from benchstats.common import LoggingConsole, detectExportFormat

main_metrics = ["time_s"]

export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None
if export_fmt is not None and os.path.isfile(args.export_to):
os.remove(args.export_to)

console = LoggingConsole(
record=export_fmt is not None,
log_level=LoggingConsole.LogLevel.Warning,
)

# metrics=None exposes every metric the CSV carries (time + throughput).
s1 = parser_TEsamples(args.baseline_csv, None, None, debug_log=console).getStats()
s2 = parser_TEsamples(args.candidate_csv, None, None, debug_log=console).getStats()

cr = compareStats(
s1, s2,
method=args.method,
alpha=args.alpha,
main_metrics=main_metrics,
debug_log=console,
)

# Throughput metrics (e.g. TFLOPS / GB/s) are not times; blank benchstats'
# default per-value "s" suffix for them (the column header names the unit).
secondary = [m for m in cr.getMetrics() if m not in main_metrics]
style_overrides = {f"metric_{m}_unit": "" for m in secondary}

renderComparisonResults(
cr, console,
main_metrics=main_metrics,
style_overrides=style_overrides or None,
always_show_pvalues=args.always_show_pvalues,
)

if export_fmt is not None:
if export_fmt == "txt":
console.save_text(args.export_to)
elif export_fmt == "svg":
console.save_svg(args.export_to, title="")
elif export_fmt == "html":
console.save_html(args.export_to)

if cr.at_least_one_differs:
console.warning(
"At least one significant timing difference was detected (exit 1)."
)
return 1
return 0


def main():
parser = argparse.ArgumentParser(description="Compare benchmark CSVs")
parser.add_argument("baseline_csv", help="Baseline CSV")
Expand All @@ -67,8 +146,35 @@ def main():
"Set to 0 to disable the filter."
),
)

stats_group = parser.add_argument_group(
"statistical comparison (--stats mode; operates on --csv-samples CSVs)"
)
stats_group.add_argument(
"--stats", action="store_true",
help="Compare per-sample CSVs with a statistical test via benchstats.",
)
stats_group.add_argument(
"--alpha", type=float, default=0.001,
help="Significance level for the test (default: 0.001).",
)
stats_group.add_argument(
"--method", default="brunnermunzel",
help="Statistical test to use (default: brunnermunzel).",
)
stats_group.add_argument(
"--always-show-pvalues", action="store_true",
help="Always show p-values, including for non-significant results.",
)
stats_group.add_argument(
"--export-to", default=None, metavar="FILE",
help="Export the report to a .txt/.svg/.html file (format from extension).",
)
args = parser.parse_args()

if args.stats:
return run_stats(args)

baseline_df = pd.read_csv(args.baseline_csv)
candidate_df = pd.read_csv(args.candidate_csv)

Expand Down
183 changes: 183 additions & 0 deletions benchmarks/microbenchmarks/parser_TEsamples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#!/usr/bin/env python
###############################################################################
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
"""benchstats parser for Transformer Engine microbenchmark samples CSVs.

Reads the per-sample CSV produced by ``run_benchmarks(...)`` with the
``--csv-samples`` flag and turns it into the ``{benchmark_name: {metric: ndarray}}``
structure consumed by ``benchstats.compare.compareStats``.

Columns of the samples CSV: the benchmark parameter columns, plus ``label``,
``sample_idx``, ``time_ms``, ``throughput`` and ``unit``. A benchmark name is
built by joining every parameter column and ``label``, so each unique
(parameters, label) combination becomes one benchmark.

Two metrics are exposed:

- ``time_s`` (seconds, lower is better) -- always present; intended as the
*main* metric. Exposed in seconds because benchstats' renderer auto-scales
time values (to ms/us/ns) assuming a seconds base unit.
- the throughput metric, keyed by its unit (e.g. ``TFLOPS`` or ``GB/s``; higher
is better) -- present when the CSV carries throughput values.

``benchstats``' renderer requires every benchmark to expose the same metric set.
Records without throughput (the samples-only ``Forward+Backward`` composites) are
therefore dropped from the comparison when throughput is available for the other
benchmarks; their raw samples remain in the CSV for other downstream analysis.

The class name matches the file name (``parser_TEsamples``) so it can also be
loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``.
"""

import re

import numpy as np
import pandas as pd

from benchstats.common import ParserBase, LoggingConsole

_TIME_COL = "time_ms" # column name in the samples CSV (milliseconds)
_TIME_KEY = "time_s" # metric key exposed to benchstats (seconds)
_THR_COL = "throughput"
_UNIT_COL = "unit"
_GENERIC_THR = "throughput"
_NON_NAME_COLS = ("sample_idx", _TIME_COL, _THR_COL, _UNIT_COL)
_NAME_DELIM = " | "


class parser_TEsamples(ParserBase):
def __init__(self, csv_file_path, filter, metrics=None, debug_log=True) -> None:
assert isinstance(csv_file_path, str)
assert filter is None or isinstance(filter, (str, re.Pattern))
assert metrics is None or (
isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics)
)

if debug_log is None or (isinstance(debug_log, bool) and not debug_log):
self.debug_log = False
elif isinstance(debug_log, bool) and debug_log:
self.debug_log = True
self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug)
else:
self.debug_log = True
self.logger = debug_log

self.file = csv_file_path
self.filter = (
filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
)
self._requested_metrics = list(metrics) if metrics is not None else None
self._stats = self._build()

def getStats(self) -> dict[str, dict[str, np.ndarray]]:
return self._stats

def _log(self, level, msg):
if self.debug_log:
getattr(self.logger, level)(f"parser_TEsamples: {msg}")

def _build(self) -> dict[str, dict[str, np.ndarray]]:
df = pd.read_csv(self.file)

if _TIME_COL not in df.columns or "sample_idx" not in df.columns:
raise ValueError(
f"'{self.file}' is missing 'time_ms'/'sample_idx' columns. "
"Was it written with --csv-samples?"
)

name_cols = [c for c in df.columns if c not in _NON_NAME_COLS]
if not name_cols:
raise ValueError(f"No benchmark-name columns found in '{self.file}'.")

df[_TIME_COL] = pd.to_numeric(df[_TIME_COL], errors="coerce")
has_thr_col = _THR_COL in df.columns
if has_thr_col:
df[_THR_COL] = pd.to_numeric(df[_THR_COL], errors="coerce")

# First pass: collect per-benchmark time samples, throughput samples and unit.
per_bm = {} # name -> {"time": ndarray, "thr": ndarray|None, "unit": str|None}
for key_vals, group in df.groupby(name_cols, sort=False):
if not isinstance(key_vals, tuple):
key_vals = (key_vals,)
bm_name = _NAME_DELIM.join(str(v) for v in key_vals)

if self.filter is not None and self.filter.search(bm_name) is None:
continue

time_ms = group[_TIME_COL].to_numpy(dtype=np.float64)
time_ms = time_ms[np.isfinite(time_ms)]
if time_ms.size == 0:
self._log("warning", f"benchmark '{bm_name}' has no finite time samples; skipping.")
continue
# benchstats' renderer auto-scales assuming seconds, so expose seconds.
time_s = time_ms / 1e3

thr_s, unit = None, None
if has_thr_col:
thr_s = group[_THR_COL].to_numpy(dtype=np.float64)
thr_s = thr_s[np.isfinite(thr_s) & (thr_s > 0)]
if thr_s.size == 0:
thr_s = None
else:
units = [u for u in group[_UNIT_COL].astype(str).unique()
if u and u.lower() != "nan"] if _UNIT_COL in df.columns else []
unit = units[0] if len(units) == 1 else (_GENERIC_THR if units else None)

if self.debug_log and time_s.size < 10:
self._log(
"warning",
f"benchmark '{bm_name}' has only {time_s.size} samples (>= 10 recommended); "
"re-run with a larger --min-samples.",
)
per_bm[bm_name] = {"time": time_s, "thr": thr_s, "unit": unit}

if not per_bm:
self._log("warning", f"no benchmarks read from '{self.file}'.")
return {}

# Decide on a uniform metric set across all benchmarks.
with_thr = {n: d for n, d in per_bm.items() if d["thr"] is not None}
thr_key = self._resolve_throughput_key(with_thr)

if thr_key is not None and 0 < len(with_thr) < len(per_bm):
dropped = sorted(set(per_bm) - set(with_thr))
self._log(
"warning",
f"excluding {len(dropped)} benchmark(s) without throughput from the comparison so "
f"throughput can be shown uniformly: {', '.join(dropped)}",
)
per_bm = with_thr

# Build the result, honoring an explicit metric request if given.
stats = {}
for bm_name, d in per_bm.items():
entry = {}
if self._metric_requested(_TIME_KEY, thr_key):
entry[_TIME_KEY] = d["time"]
if thr_key is not None and d["thr"] is not None and self._metric_requested(thr_key, thr_key):
entry[thr_key] = d["thr"]
if entry:
stats[bm_name] = entry
return stats

def _resolve_throughput_key(self, with_thr):
"""Return a single throughput metric key shared by all throughput-bearing benchmarks."""
if not with_thr:
return None
units = {d["unit"] for d in with_thr.values()}
if len(units) == 1:
return next(iter(units)) or _GENERIC_THR
return _GENERIC_THR # mixed units in one file (atypical) -> generic header

def _metric_requested(self, key, thr_key):
"""Honor an explicit metrics= request (benchstats CLI), else expose everything."""
if self._requested_metrics is None:
return True
req = self._requested_metrics
if key == _TIME_KEY:
return any(t in req for t in (_TIME_KEY, _TIME_COL, "time"))
# throughput: match the unit key, the generic name, or literal 'throughput'
return key in req or _GENERIC_THR in req or _THR_COL in req
3 changes: 3 additions & 0 deletions benchmarks/microbenchmarks/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Extra dependencies for statistical benchmark comparison
# (compare_results.py --stats). benchstats pulls in rich, scipy and numpy.
benchstats>=3.4
Loading