diff --git a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md index ba868b0f0..40f7a1eed 100644 --- a/benchmarks/microbenchmarks/README.md +++ b/benchmarks/microbenchmarks/README.md @@ -29,7 +29,16 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv ``` The samples CSV contains one row per timing sample with columns for all -benchmark parameters plus `label`, `sample_idx`, and `time_ms`. +benchmark parameters plus `label`, `sample_idx`, `time_ms`, `throughput`, and +`unit`. The per-sample `throughput` is derived from the sample time (it is blank +for samples-only records such as `Forward+Backward`, which carry no throughput). + +torch's autorange records only a few raw timing blocks per metric. To get +enough samples for statistical comparison, `--min-samples N` (default 12) +ensures at least `N` raw blocks are recorded, topping up any shortfall with +additional equal-sized blocks rather than re-averaging the whole measurement. +Statistical comparison (see below) needs at least ~10 samples, so keep the +default (or higher) when producing samples CSVs for `--stats`. ## Shared configuration @@ -75,3 +84,36 @@ python compare_results.py baseline.csv candidate.csv --bench-name GEMM The script auto-detects metric columns, computes speedups for overlapping rows, and reports rows that exist only in the baseline or only in the candidate. + +### Statistical comparison (`--stats`) + +The ratio comparison above uses point estimates and cannot tell a real +regression from measurement noise. 
To test whether timing differences are +statistically significant, run the benchmark with `--csv-samples` on both +sides and compare the samples CSVs with `--stats`: + +```bash +pip install -r requirements.txt # benchstats (pulls rich, scipy, numpy) + +# baseline checkout +python benchmark_gemm.py --csv-samples baseline_samples.csv --min-samples 12 +# candidate checkout +python benchmark_gemm.py --csv-samples candidate_samples.csv --min-samples 12 + +python compare_results.py baseline_samples.csv candidate_samples.csv --stats +``` + +`--stats` uses the [benchstats](https://github.com/Arech/benchstats) package to +apply a Brunner-Munzel test (override with `--method`) at significance level +`--alpha` (default `0.001`) to each `(config, label)` pair. It prints a table +marking each benchmark as faster (`<`), slower (`>`), or not significantly +different (`~`), and exits `1` when a significant difference in the timing +metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`, +`.txt`) to save the report, and `--always-show-pvalues` to show p-values for +non-significant rows. + +Time (the main metric, which drives the exit code) is shown alongside the +calculated throughput/bandwidth (`TFLOPS` or `GB/s`), reported as a secondary +metric. Because the table requires a uniform metric set, samples-only composite +rows that carry no throughput (e.g. `Forward+Backward`) are omitted from the +comparison; their raw samples remain in the CSV. diff --git a/benchmarks/microbenchmarks/compare_results.py b/benchmarks/microbenchmarks/compare_results.py index 4a7e1dab8..1c231af86 100755 --- a/benchmarks/microbenchmarks/compare_results.py +++ b/benchmarks/microbenchmarks/compare_results.py @@ -7,12 +7,22 @@ """ Compare two CSVs from the same benchmark suite. -Auto-detects metric columns (containing "TFLOPS" or "GB/s") and key columns. -Outputs a markdown
block to stdout with per-config results, -and optionally appends a summary table row to --summary-file. +Two modes: + +1. Default (ratio) mode: compares two *aggregate* CSVs. Auto-detects metric + columns (containing "TFLOPS" or "GB/s") and key columns, computes throughput + speedups, and emits a markdown
block (optionally appending a + summary row to --summary-file). + +2. --stats mode: compares two *samples* CSVs (written with --csv-samples) using + a statistical test (Brunner-Munzel by default) via the benchstats package. + Reports whether per-config timing differences are significant and exits 1 + when a significant timing difference is found (for CI gating). Requires + ``pip install -r requirements.txt``. Usage: python compare_results.py baseline.csv candidate.csv --bench-name NAME --summary-file FILE + python compare_results.py base_samples.csv cand_samples.csv --stats """ import argparse @@ -49,6 +59,75 @@ def print_key_table(title, rows_df, key_cols): print() +def run_stats(args): + """Compare two samples CSVs with a statistical test via benchstats. + + Timing (``time_ms``) is the main metric and drives the exit code; throughput + / bandwidth (``TFLOPS`` / ``GB/s``) is reported as a secondary metric when + present in the CSV. + + Returns a process exit code: 1 if a significant difference is found in the + main (timing) metric, else 0. + """ + import os + + import rich.table # noqa: F401 benchstats 3.4.0 render uses rich.table.Table without importing it + from parser_TEsamples import parser_TEsamples + from benchstats.compare import compareStats + from benchstats.render import renderComparisonResults + from benchstats.common import LoggingConsole, detectExportFormat + + main_metrics = ["time_s"] + + export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None + if export_fmt is not None and os.path.isfile(args.export_to): + os.remove(args.export_to) + + console = LoggingConsole( + record=export_fmt is not None, + log_level=LoggingConsole.LogLevel.Warning, + ) + + # metrics=None exposes every metric the CSV carries (time + throughput). 
+ s1 = parser_TEsamples(args.baseline_csv, None, None, debug_log=console).getStats() + s2 = parser_TEsamples(args.candidate_csv, None, None, debug_log=console).getStats() + + cr = compareStats( + s1, s2, + method=args.method, + alpha=args.alpha, + main_metrics=main_metrics, + debug_log=console, + ) + + # Throughput metrics (e.g. TFLOPS / GB/s) are not times; blank benchstats' + # default per-value "s" suffix for them (the column header names the unit). + secondary = [m for m in cr.getMetrics() if m not in main_metrics] + style_overrides = {f"metric_{m}_unit": "" for m in secondary} + + renderComparisonResults( + cr, console, + main_metrics=main_metrics, + style_overrides=style_overrides or None, + always_show_pvalues=args.always_show_pvalues, + ) + + if export_fmt is not None: + if export_fmt == "txt": + console.save_text(args.export_to) + elif export_fmt == "svg": + console.save_svg(args.export_to, title="") + elif export_fmt == "html": + console.save_html(args.export_to) + + if cr.at_least_one_differs: + console.warning( + "At least one significant timing difference was detected (exit 1)." + ) + return 1 + return 0 + + def main(): parser = argparse.ArgumentParser(description="Compare benchmark CSVs") parser.add_argument("baseline_csv", help="Baseline CSV") @@ -67,8 +146,35 @@ def main(): "Set to 0 to disable the filter." 
), ) + + stats_group = parser.add_argument_group( + "statistical comparison (--stats mode; operates on --csv-samples CSVs)" + ) + stats_group.add_argument( + "--stats", action="store_true", + help="Compare per-sample CSVs with a statistical test via benchstats.", + ) + stats_group.add_argument( + "--alpha", type=float, default=0.001, + help="Significance level for the test (default: 0.001).", + ) + stats_group.add_argument( + "--method", default="brunnermunzel", + help="Statistical test to use (default: brunnermunzel).", + ) + stats_group.add_argument( + "--always-show-pvalues", action="store_true", + help="Always show p-values, including for non-significant results.", + ) + stats_group.add_argument( + "--export-to", default=None, metavar="FILE", + help="Export the report to a .txt/.svg/.html file (format from extension).", + ) args = parser.parse_args() + if args.stats: + return run_stats(args) + baseline_df = pd.read_csv(args.baseline_csv) candidate_df = pd.read_csv(args.candidate_csv) diff --git a/benchmarks/microbenchmarks/parser_TEsamples.py b/benchmarks/microbenchmarks/parser_TEsamples.py new file mode 100644 index 000000000..44d472158 --- /dev/null +++ b/benchmarks/microbenchmarks/parser_TEsamples.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""benchstats parser for Transformer Engine microbenchmark samples CSVs. + +Reads the per-sample CSV produced by ``run_benchmarks(...)`` with the +``--csv-samples`` flag and turns it into the ``{benchmark_name: {metric: ndarray}}`` +structure consumed by ``benchstats.compare.compareStats``. + +Columns of the samples CSV: the benchmark parameter columns, plus ``label``, +``sample_idx``, ``time_ms``, ``throughput`` and ``unit``. 
A benchmark name is +built by joining every parameter column and ``label``, so each unique +(parameters, label) combination becomes one benchmark. + +Two metrics are exposed: + +- ``time_s`` (seconds, lower is better) -- always present; intended as the + *main* metric. Exposed in seconds because benchstats' renderer auto-scales + time values (to ms/us/ns) assuming a seconds base unit. +- the throughput metric, keyed by its unit (e.g. ``TFLOPS`` or ``GB/s``; higher + is better) -- present when the CSV carries throughput values. + +``benchstats``' renderer requires every benchmark to expose the same metric set. +Records without throughput (the samples-only ``Forward+Backward`` composites) are +therefore dropped from the comparison when throughput is available for the other +benchmarks; their raw samples remain in the CSV for other downstream analysis. + +The class name matches the file name (``parser_TEsamples``) so it can also be +loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``. 
+""" + +import re + +import numpy as np +import pandas as pd + +from benchstats.common import ParserBase, LoggingConsole + +_TIME_COL = "time_ms" # column name in the samples CSV (milliseconds) +_TIME_KEY = "time_s" # metric key exposed to benchstats (seconds) +_THR_COL = "throughput" +_UNIT_COL = "unit" +_GENERIC_THR = "throughput" +_NON_NAME_COLS = ("sample_idx", _TIME_COL, _THR_COL, _UNIT_COL) +_NAME_DELIM = " | " + + +class parser_TEsamples(ParserBase): + def __init__(self, csv_file_path, filter, metrics=None, debug_log=True) -> None: + assert isinstance(csv_file_path, str) + assert filter is None or isinstance(filter, (str, re.Pattern)) + assert metrics is None or ( + isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics) + ) + + if debug_log is None or (isinstance(debug_log, bool) and not debug_log): + self.debug_log = False + elif isinstance(debug_log, bool) and debug_log: + self.debug_log = True + self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug) + else: + self.debug_log = True + self.logger = debug_log + + self.file = csv_file_path + self.filter = ( + filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter) + ) + self._requested_metrics = list(metrics) if metrics is not None else None + self._stats = self._build() + + def getStats(self) -> dict[str, dict[str, np.ndarray]]: + return self._stats + + def _log(self, level, msg): + if self.debug_log: + getattr(self.logger, level)(f"parser_TEsamples: {msg}") + + def _build(self) -> dict[str, dict[str, np.ndarray]]: + df = pd.read_csv(self.file) + + if _TIME_COL not in df.columns or "sample_idx" not in df.columns: + raise ValueError( + f"'{self.file}' is missing 'time_ms'/'sample_idx' columns. " + "Was it written with --csv-samples?" 
+ ) + + name_cols = [c for c in df.columns if c not in _NON_NAME_COLS] + if not name_cols: + raise ValueError(f"No benchmark-name columns found in '{self.file}'.") + + df[_TIME_COL] = pd.to_numeric(df[_TIME_COL], errors="coerce") + has_thr_col = _THR_COL in df.columns + if has_thr_col: + df[_THR_COL] = pd.to_numeric(df[_THR_COL], errors="coerce") + + # First pass: collect per-benchmark time samples, throughput samples and unit. + per_bm = {} # name -> {"time": ndarray, "thr": ndarray|None, "unit": str|None} + for key_vals, group in df.groupby(name_cols, sort=False): + if not isinstance(key_vals, tuple): + key_vals = (key_vals,) + bm_name = _NAME_DELIM.join(str(v) for v in key_vals) + + if self.filter is not None and self.filter.search(bm_name) is None: + continue + + time_ms = group[_TIME_COL].to_numpy(dtype=np.float64) + time_ms = time_ms[np.isfinite(time_ms)] + if time_ms.size == 0: + self._log("warning", f"benchmark '{bm_name}' has no finite time samples; skipping.") + continue + # benchstats' renderer auto-scales assuming seconds, so expose seconds. + time_s = time_ms / 1e3 + + thr_s, unit = None, None + if has_thr_col: + thr_s = group[_THR_COL].to_numpy(dtype=np.float64) + thr_s = thr_s[np.isfinite(thr_s) & (thr_s > 0)] + if thr_s.size == 0: + thr_s = None + else: + units = [u for u in group[_UNIT_COL].astype(str).unique() + if u and u.lower() != "nan"] if _UNIT_COL in df.columns else [] + unit = units[0] if len(units) == 1 else (_GENERIC_THR if units else None) + + if self.debug_log and time_s.size < 10: + self._log( + "warning", + f"benchmark '{bm_name}' has only {time_s.size} samples (>= 10 recommended); " + "re-run with a larger --min-samples.", + ) + per_bm[bm_name] = {"time": time_s, "thr": thr_s, "unit": unit} + + if not per_bm: + self._log("warning", f"no benchmarks read from '{self.file}'.") + return {} + + # Decide on a uniform metric set across all benchmarks. 
+ with_thr = {n: d for n, d in per_bm.items() if d["thr"] is not None} + thr_key = self._resolve_throughput_key(with_thr) + + if thr_key is not None and 0 < len(with_thr) < len(per_bm): + dropped = sorted(set(per_bm) - set(with_thr)) + self._log( + "warning", + f"excluding {len(dropped)} benchmark(s) without throughput from the comparison so " + f"throughput can be shown uniformly: {', '.join(dropped)}", + ) + per_bm = with_thr + + # Build the result, honoring an explicit metric request if given. + stats = {} + for bm_name, d in per_bm.items(): + entry = {} + if self._metric_requested(_TIME_KEY, thr_key): + entry[_TIME_KEY] = d["time"] + if thr_key is not None and d["thr"] is not None and self._metric_requested(thr_key, thr_key): + entry[thr_key] = d["thr"] + if entry: + stats[bm_name] = entry + return stats + + def _resolve_throughput_key(self, with_thr): + """Return a single throughput metric key shared by all throughput-bearing benchmarks.""" + if not with_thr: + return None + units = {d["unit"] for d in with_thr.values()} + if len(units) == 1: + return next(iter(units)) or _GENERIC_THR + return _GENERIC_THR # mixed units in one file (atypical) -> generic header + + def _metric_requested(self, key, thr_key): + """Honor an explicit metrics= request (benchstats CLI), else expose everything.""" + if self._requested_metrics is None: + return True + req = self._requested_metrics + if key == _TIME_KEY: + return any(t in req for t in (_TIME_KEY, _TIME_COL, "time")) + # throughput: match the unit key, the generic name, or literal 'throughput' + return key in req or _GENERIC_THR in req or _THR_COL in req diff --git a/benchmarks/microbenchmarks/requirements.txt b/benchmarks/microbenchmarks/requirements.txt new file mode 100644 index 000000000..32aa6a635 --- /dev/null +++ b/benchmarks/microbenchmarks/requirements.txt @@ -0,0 +1,3 @@ +# Extra dependencies for statistical benchmark comparison +# (compare_results.py --stats). benchstats pulls in rich, scipy and numpy. 
+benchstats>=3.4 diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 99ae6a244..929019ee8 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -21,6 +21,12 @@ DEFAULT_MIN_RUN_TIME_SECONDS = 0.2 +# Minimum number of raw timing samples (blocks) ``time_func`` ensures when a +# caller passes ``min_samples=None``. ``run_benchmarks`` sets this from the +# ``--min-samples`` CLI flag so every benchmark script inherits the knob without +# per-script edits. ``None`` leaves torch's autorange result untouched. +_ACTIVE_MIN_SAMPLES = None + # --------------------------------------------------------------------------- # Model configurations # --------------------------------------------------------------------------- @@ -88,22 +94,60 @@ def generate_gemm_test_cases(configs=None, m_sizes=None, dtypes=None): # Timing helpers # --------------------------------------------------------------------------- -def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS): +class _RawSamples: + """Minimal ``torch.utils.benchmark.Measurement`` stand-in holding raw block times. + + Exposes the ``times`` (per-run seconds, one entry per recorded timing block), + ``number_per_run`` and ``mean`` attributes that ``time_func`` callers and the + samples-CSV writer rely on. + """ + + def __init__(self, times, number_per_run, mean): + self.times = times + self.number_per_run = number_per_run + self.mean = mean + + +def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS, + min_samples=None): """Time *fn* and return ``(mean_ms, measurement)``. - The ``Measurement`` object carries per-sample times accessible via - ``measurement.times`` (total wall time per run) and - ``measurement.number_per_run``. 
+ The returned measurement exposes per-run sample times via + ``measurement.times`` -- one entry per recorded timing block (each block is + an average over ``measurement.number_per_run`` executions, as chosen by + torch to amortize timer overhead). method: "adaptive" uses adaptive_autorange (good for compute-bound), "blocked" uses blocked_autorange (good for memory-bound). + + min_samples: ensure at least this many raw timing blocks are recorded, so the + per-sample data is large enough for statistical comparison + (compare_results.py --stats). torch's autorange usually records only a + few blocks; any shortfall is topped up with additional equal-sized blocks + rather than re-running and re-averaging the whole measurement. ``None`` + falls back to the module-level ``_ACTIVE_MIN_SAMPLES`` (set from + ``--min-samples``); ``None`` there too leaves the autorange result as-is. """ + if min_samples is None: + min_samples = _ACTIVE_MIN_SAMPLES + timer = benchmark.Timer(stmt="fn()", globals={"fn": fn}) if method == "blocked": m = timer.blocked_autorange(min_run_time=min_run_time) else: m = timer.adaptive_autorange(min_run_time=min_run_time) - return m.mean * 1e3, m + + if min_samples is None or len(m.times) >= min_samples: + return m.mean * 1e3, m + + # Top up with additional equal-sized blocks (each timeit() records one block + # averaged over number_per_run runs) until enough raw samples are collected. + times = list(m.times) # per-run seconds + number = m.number_per_run + while len(times) < min_samples: + times.append(timer.timeit(number).mean) + mean_s = sum(times) / len(times) + return mean_s * 1e3, _RawSamples(times=times, number_per_run=number, mean=mean_s) # --------------------------------------------------------------------------- @@ -266,6 +310,16 @@ def make_parser(**kwargs): "Optional filename; default derived from script name." 
), ) + parser.add_argument( + "--min-samples", type=int, default=12, metavar="N", + help=( + "Ensure at least N raw timing samples (blocks) are recorded per " + "metric for statistical comparison (compare_results.py --stats). " + "torch's autorange records only a few; any shortfall is topped up " + "with additional equal-sized blocks. Use a small value (e.g. 2) to " + "effectively disable top-up. Default: 12." + ), + ) return parser @@ -301,6 +355,11 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None, if args is None: args = make_parser().parse_args() + # Let time_func (called by bench_fns without an explicit min_samples arg) + # inherit the CLI value without editing every benchmark script. + global _ACTIVE_MIN_SAMPLES + _ACTIVE_MIN_SAMPLES = getattr(args, "min_samples", None) + rows = [] all_case_metrics = [] resolved_metric_columns = None @@ -355,16 +414,36 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None, if measurement is None: continue lbl = metric["label"] + unit = metric.get("unit") + thr_mean = metric.get("throughput") or 0.0 + ms_mean = metric.get("ms") or 0.0 + # Throughput is a deterministic function of time for a given + # config (throughput = C / time), so a per-sample throughput is + # recovered from the aggregate as thr_mean * ms_mean / sample_ms. + # samples_only records (e.g. Forward+Backward) carry no + # throughput and are left blank. + has_thr = ( + not metric.get("samples_only") and thr_mean > 0 and ms_mean > 0 + ) for i, t in enumerate(measurement.times): + # measurement.times entries are already per-run (seconds). 
+ sample_ms = t * 1e3 sr = dict(case_params) sr["label"] = lbl sr["sample_idx"] = i - sr["time_ms"] = t / measurement.number_per_run * 1e3 + sr["time_ms"] = sample_ms + sr["throughput"] = ( + thr_mean * ms_mean / sample_ms + if has_thr and sample_ms > 0 + else "" + ) + sr["unit"] = unit if has_thr else "" sample_rows.append(sr) if sample_rows: df = pd.DataFrame( sample_rows, - columns=param_columns + ["label", "sample_idx", "time_ms"], + columns=param_columns + + ["label", "sample_idx", "time_ms", "throughput", "unit"], ) df.to_csv(samples_csv, index=False) print(f"Samples saved to {samples_csv}")