From 02df3c75980a23263b1bc3089db63b55bac31041 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 5 Jun 2026 16:31:07 +0000 Subject: [PATCH 1/2] Add statistical benchmark comparison via benchstats Adds a statistically-rigorous comparison path to the microbenchmark suite, on top of the existing throughput-ratio comparison. - utils.py: time_func gains a repetitions param; --repetitions flag (default 15) threaded through run_benchmarks so all bench scripts collect enough per-sample data with no per-script edits. repetitions<=1 preserves the original single-measurement behavior. - parser_TEsamples.py: benchstats ParserBase reading the --csv-samples CSV into {bench_name: {time_ms: ndarray}}. - compare_results.py: new --stats mode running a Brunner-Munzel test (via benchstats) on two samples CSVs; exits 1 on a significant regression for CI gating. Default ratio path unchanged; benchstats lazy-imported. - requirements.txt: benchstats>=3.4. - README: document the repetitions knob and --stats workflow. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/microbenchmarks/README.md | 33 ++++++ benchmarks/microbenchmarks/compare_results.py | 101 ++++++++++++++++- .../microbenchmarks/parser_TEsamples.py | 107 ++++++++++++++++++ benchmarks/microbenchmarks/requirements.txt | 3 + benchmarks/microbenchmarks/utils.py | 70 ++++++++++-- 5 files changed, 302 insertions(+), 12 deletions(-) create mode 100644 benchmarks/microbenchmarks/parser_TEsamples.py create mode 100644 benchmarks/microbenchmarks/requirements.txt diff --git a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md index ba868b0f0..67e5ccc2a 100644 --- a/benchmarks/microbenchmarks/README.md +++ b/benchmarks/microbenchmarks/README.md @@ -31,6 +31,12 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv The samples CSV contains one row per timing sample with columns for all benchmark parameters plus `label`, `sample_idx`, and `time_ms`. 
+By default each metric is measured over 15 repetitions (one sample per +repetition). Use `--repetitions N` to change this; `--repetitions 1` reproduces +the original single-measurement behavior. Statistical comparison (see below) +needs at least ~10 samples, so keep the default (or higher) when producing +samples CSVs for `--stats`. + ## Shared configuration Common benchmark settings live in `utils.py`. @@ -75,3 +81,30 @@ python compare_results.py baseline.csv candidate.csv --bench-name GEMM The script auto-detects metric columns, computes speedups for overlapping rows, and reports rows that exist only in the baseline or only in the candidate. + +### Statistical comparison (`--stats`) + +The ratio comparison above uses point estimates and cannot tell a real +regression from measurement noise. To test whether timing differences are +statistically significant, run the benchmark with `--csv-samples` on both +sides and compare the samples CSVs with `--stats`: + +```bash +pip install -r requirements.txt # benchstats (pulls rich, scipy, numpy) + +# baseline checkout +python benchmark_gemm.py --csv-samples baseline_samples.csv --repetitions 15 +# candidate checkout +python benchmark_gemm.py --csv-samples candidate_samples.csv --repetitions 15 + +python compare_results.py baseline_samples.csv candidate_samples.csv --stats +``` + +`--stats` uses the [benchstats](https://github.com/Arech/benchstats) package to +apply a Brunner-Munzel test (override with `--method`) at significance level +`--alpha` (default `0.001`) to each `(config, label)` pair. It prints a table +marking each benchmark as faster (`<`), slower (`>`), or not significantly +different (`~`), and exits `1` when a significant difference in the timing +metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`, +`.txt`) to save the report, and `--always-show-pvalues` to show p-values for +non-significant rows. 
diff --git a/benchmarks/microbenchmarks/compare_results.py b/benchmarks/microbenchmarks/compare_results.py index 4a7e1dab8..540a9e433 100755 --- a/benchmarks/microbenchmarks/compare_results.py +++ b/benchmarks/microbenchmarks/compare_results.py @@ -7,12 +7,22 @@ """ Compare two CSVs from the same benchmark suite. -Auto-detects metric columns (containing "TFLOPS" or "GB/s") and key columns. -Outputs a markdown
<details> block to stdout with per-config results, -and optionally appends a summary table row to --summary-file. +Two modes: + +1. Default (ratio) mode: compares two *aggregate* CSVs. Auto-detects metric + columns (containing "TFLOPS" or "GB/s") and key columns, computes throughput + speedups, and emits a markdown <details>
block (optionally appending a + summary row to --summary-file). + +2. --stats mode: compares two *samples* CSVs (written with --csv-samples) using + a statistical test (Brunner-Munzel by default) via the benchstats package. + Reports whether per-config timing differences are significant and exits 1 + when a significant difference is found (for CI gating). Requires + ``pip install -r requirements.txt``. Usage: python compare_results.py baseline.csv candidate.csv --bench-name NAME --summary-file FILE + python compare_results.py base_samples.csv cand_samples.csv --stats """ import argparse @@ -49,6 +59,64 @@ def print_key_table(title, rows_df, key_cols): print() +def run_stats(args): + """Compare two samples CSVs with a statistical test via benchstats. + + Returns a process exit code: 1 if a significant difference is found in the + main metric (timing), else 0. + """ + import os + + import rich.table # noqa: F401 benchstats 3.4.0 render uses rich.table.Table without importing it + from parser_TEsamples import parser_TEsamples + from benchstats.compare import compareStats + from benchstats.render import renderComparisonResults + from benchstats.common import LoggingConsole, detectExportFormat + + metrics = ["time_ms"] + + export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None + if export_fmt is not None and os.path.isfile(args.export_to): + os.remove(args.export_to) + + console = LoggingConsole( + record=export_fmt is not None, + log_level=LoggingConsole.LogLevel.Warning, + ) + + s1 = parser_TEsamples(args.baseline_csv, None, metrics, debug_log=console).getStats() + s2 = parser_TEsamples(args.candidate_csv, None, metrics, debug_log=console).getStats() + + cr = compareStats( + s1, s2, + method=args.method, + alpha=args.alpha, + main_metrics=metrics, + debug_log=console, + ) + + renderComparisonResults( + cr, console, + main_metrics=metrics, + always_show_pvalues=args.always_show_pvalues, + ) + + if export_fmt is not None: + if export_fmt == 
"txt": + console.save_text(args.export_to) + elif export_fmt == "svg": + console.save_svg(args.export_to, title="") + elif export_fmt == "html": + console.save_html(args.export_to) + + if cr.at_least_one_differs: + console.warning( + "At least one significant timing difference was detected (exit 1)." + ) + return 1 + return 0 + + def main(): parser = argparse.ArgumentParser(description="Compare benchmark CSVs") parser.add_argument("baseline_csv", help="Baseline CSV") @@ -67,8 +135,35 @@ def main(): "Set to 0 to disable the filter." ), ) + + stats_group = parser.add_argument_group( + "statistical comparison (--stats mode; operates on --csv-samples CSVs)" + ) + stats_group.add_argument( + "--stats", action="store_true", + help="Compare per-sample CSVs with a statistical test via benchstats.", + ) + stats_group.add_argument( + "--alpha", type=float, default=0.001, + help="Significance level for the test (default: 0.001).", + ) + stats_group.add_argument( + "--method", default="brunnermunzel", + help="Statistical test to use (default: brunnermunzel).", + ) + stats_group.add_argument( + "--always-show-pvalues", action="store_true", + help="Always show p-values, including for non-significant results.", + ) + stats_group.add_argument( + "--export-to", default=None, metavar="FILE", + help="Export the report to a .txt/.svg/.html file (format from extension).", + ) args = parser.parse_args() + if args.stats: + return run_stats(args) + baseline_df = pd.read_csv(args.baseline_csv) candidate_df = pd.read_csv(args.candidate_csv) diff --git a/benchmarks/microbenchmarks/parser_TEsamples.py b/benchmarks/microbenchmarks/parser_TEsamples.py new file mode 100644 index 000000000..dc6a07c0e --- /dev/null +++ b/benchmarks/microbenchmarks/parser_TEsamples.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. 
+# +# See LICENSE for license information. +############################################################################### +"""benchstats parser for Transformer Engine microbenchmark samples CSVs. + +Reads the per-sample CSV produced by ``run_benchmarks(... )`` with the +``--csv-samples`` flag (columns: the benchmark parameter columns plus +``label``, ``sample_idx``, ``time_ms``) and turns it into the +``{benchmark_name: {metric: ndarray}}`` structure consumed by +``benchstats.compare.compareStats``. + +A benchmark name is built by joining every column except ``sample_idx`` and the +metric column, so each unique (parameters, label) combination becomes one +benchmark. The single available metric is ``time_ms`` (lower is better). + +The class name matches the file name (``parser_TEsamples``) so it can also be +loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``. +""" + +import re + +import numpy as np +import pandas as pd + +from benchstats.common import ParserBase, LoggingConsole + +_METRIC = "time_ms" +_NON_NAME_COLS = ("sample_idx", _METRIC) +_NAME_DELIM = " | " + + +class parser_TEsamples(ParserBase): + def __init__(self, csv_file_path, filter, metrics, debug_log=True) -> None: + assert isinstance(csv_file_path, str) + assert filter is None or isinstance(filter, (str, re.Pattern)) + assert isinstance(metrics, (list, tuple)) and len(metrics) > 0 + assert all(isinstance(m, str) for m in metrics) + + if debug_log is None or (isinstance(debug_log, bool) and not debug_log): + self.debug_log = False + elif isinstance(debug_log, bool) and debug_log: + self.debug_log = True + self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug) + else: + self.debug_log = True + self.logger = debug_log + + unsupported = [m for m in metrics if m != _METRIC] + if unsupported: + raise ValueError( + f"parser_TEsamples only supports the '{_METRIC}' metric, got: {unsupported}. " + "The samples CSV produced by --csv-samples carries per-run times only." 
+ ) + + self.file = csv_file_path + self.filter = filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter) + self._stats = self._build() + + def getStats(self) -> dict[str, dict[str, np.ndarray]]: + return self._stats + + def _build(self) -> dict[str, dict[str, np.ndarray]]: + df = pd.read_csv(self.file) + + for col in _NON_NAME_COLS: + if col not in df.columns: + raise ValueError( + f"'{col}' column not found in '{self.file}'. Was the CSV written with " + "--csv-samples?" + ) + + name_cols = [c for c in df.columns if c not in _NON_NAME_COLS] + if not name_cols: + raise ValueError(f"No benchmark-name columns found in '{self.file}'.") + + df[_METRIC] = pd.to_numeric(df[_METRIC], errors="coerce") + + stats: dict[str, dict[str, np.ndarray]] = {} + for key_vals, group in df.groupby(name_cols, sort=False): + if not isinstance(key_vals, tuple): + key_vals = (key_vals,) + bm_name = _NAME_DELIM.join(str(v) for v in key_vals) + + if self.filter is not None and self.filter.search(bm_name) is None: + continue + + samples = group[_METRIC].to_numpy(dtype=np.float64) + samples = samples[np.isfinite(samples)] + if samples.size == 0: + if self.debug_log: + self.logger.warning( + f"parser_TEsamples: benchmark '{bm_name}' has no finite samples; skipping." + ) + continue + if self.debug_log and samples.size < 10: + self.logger.warning( + f"parser_TEsamples: benchmark '{bm_name}' has only {samples.size} samples " + "(>= 10 recommended). Re-run the benchmark with a larger --repetitions." 
+ ) + stats[bm_name] = {_METRIC: samples} + + if not stats and self.debug_log: + self.logger.warning(f"parser_TEsamples: no benchmarks read from '{self.file}'.") + return stats diff --git a/benchmarks/microbenchmarks/requirements.txt b/benchmarks/microbenchmarks/requirements.txt new file mode 100644 index 000000000..32aa6a635 --- /dev/null +++ b/benchmarks/microbenchmarks/requirements.txt @@ -0,0 +1,3 @@ +# Extra dependencies for statistical benchmark comparison +# (compare_results.py --stats). benchstats pulls in rich, scipy and numpy. +benchstats>=3.4 diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 99ae6a244..825a08924 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -21,6 +21,12 @@ DEFAULT_MIN_RUN_TIME_SECONDS = 0.2 +# Number of repetitions used by ``time_func`` when a caller passes +# ``repetitions=None``. ``run_benchmarks`` sets this from the ``--repetitions`` +# CLI flag so every benchmark script inherits the knob without per-script edits. +# ``None`` (the default) preserves the original single-measurement behavior. +_ACTIVE_REPETITIONS = None + # --------------------------------------------------------------------------- # Model configurations # --------------------------------------------------------------------------- @@ -88,22 +94,54 @@ def generate_gemm_test_cases(configs=None, m_sizes=None, dtypes=None): # Timing helpers # --------------------------------------------------------------------------- -def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS): +class _RepeatedMeasurement: + """Minimal ``torch...benchmark.Measurement`` stand-in for repeated runs. + + Exposes the ``times`` / ``number_per_run`` / ``mean`` attributes the + samples-CSV writer relies on. ``times`` holds one per-run mean (in seconds) + for each repetition, so ``--csv-samples`` emits one row per repetition. 
+ """ + + def __init__(self, times, mean): + self.times = times + self.number_per_run = 1 + self.mean = mean + + +def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS, + repetitions=None): """Time *fn* and return ``(mean_ms, measurement)``. - The ``Measurement`` object carries per-sample times accessible via - ``measurement.times`` (total wall time per run) and - ``measurement.number_per_run``. + The returned measurement carries per-sample times accessible via + ``measurement.times`` and ``measurement.number_per_run``. method: "adaptive" uses adaptive_autorange (good for compute-bound), "blocked" uses blocked_autorange (good for memory-bound). + + repetitions: number of independent autorange measurements to collect. Each + repetition contributes one per-run mean to ``measurement.times`` (the + form expected by statistical comparison via ``compare_results.py + --stats``). ``None`` falls back to the module-level ``_ACTIVE_REPETITIONS`` + (set from ``--repetitions``); a value <= 1 reproduces the original + single-measurement behavior and returns the native ``Measurement``. 
""" + if repetitions is None: + repetitions = _ACTIVE_REPETITIONS + timer = benchmark.Timer(stmt="fn()", globals={"fn": fn}) - if method == "blocked": - m = timer.blocked_autorange(min_run_time=min_run_time) - else: - m = timer.adaptive_autorange(min_run_time=min_run_time) - return m.mean * 1e3, m + + def _measure(): + if method == "blocked": + return timer.blocked_autorange(min_run_time=min_run_time) + return timer.adaptive_autorange(min_run_time=min_run_time) + + if repetitions is None or repetitions <= 1: + m = _measure() + return m.mean * 1e3, m + + means = [_measure().mean for _ in range(repetitions)] # per-run seconds + mean_s = sum(means) / len(means) + return mean_s * 1e3, _RepeatedMeasurement(times=means, mean=mean_s) # --------------------------------------------------------------------------- @@ -266,6 +304,15 @@ def make_parser(**kwargs): "Optional filename; default derived from script name." ), ) + parser.add_argument( + "--repetitions", type=int, default=15, metavar="N", + help=( + "Number of independent timing repetitions per metric. Each " + "repetition yields one sample for statistical comparison " + "(compare_results.py --stats). Use 1 for the original " + "single-measurement behavior. Default: 15." + ), + ) return parser @@ -301,6 +348,11 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None, if args is None: args = make_parser().parse_args() + # Let time_func (called by bench_fns without an explicit repetitions arg) + # inherit the CLI value without editing every benchmark script. 
+ global _ACTIVE_REPETITIONS + _ACTIVE_REPETITIONS = getattr(args, "repetitions", None) + rows = [] all_case_metrics = [] resolved_metric_columns = None From e592a1cbb4f222c833f5e955fbf74bd2210a9be5 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 5 Jun 2026 19:36:52 +0000 Subject: [PATCH 2/2] Show throughput in --stats and ensure >10 raw samples Refines the benchstats integration: - compare_results.py --stats now reports calculated throughput/bandwidth (TFLOPS / GB/s) alongside timing. Time (seconds) is the main metric and drives the exit code; throughput is a secondary metric. The throughput unit suffix is blanked (the column header names the unit) and time is fed in seconds so benchstats' auto-scaling (ms/us/ns) is correct. - utils.py: --csv-samples now also writes per-sample throughput + unit columns, derived from the sample time (blank for samples-only records). - parser_TEsamples.py exposes time_s (main) and a unit-keyed throughput metric, dropping throughput-less composite rows so the metric set stays uniform for benchstats' renderer. - utils.py: replace --repetitions with --min-samples (default 12). Instead of re-running autorange and averaging, time_func keeps the raw per-block samples and tops up any shortfall with additional equal-sized blocks. Also fix the samples writer to not re-divide already-per-run times. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/microbenchmarks/README.md | 25 ++- benchmarks/microbenchmarks/compare_results.py | 23 ++- .../microbenchmarks/parser_TEsamples.py | 164 +++++++++++++----- benchmarks/microbenchmarks/utils.py | 113 +++++++----- 4 files changed, 224 insertions(+), 101 deletions(-) diff --git a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md index 67e5ccc2a..40f7a1eed 100644 --- a/benchmarks/microbenchmarks/README.md +++ b/benchmarks/microbenchmarks/README.md @@ -29,13 +29,16 @@ python benchmark_gemm.py --csv --csv-samples gemm_samples.csv ``` The samples CSV contains one row per timing sample with columns for all -benchmark parameters plus `label`, `sample_idx`, and `time_ms`. +benchmark parameters plus `label`, `sample_idx`, `time_ms`, `throughput`, and +`unit`. The per-sample `throughput` is derived from the sample time (it is blank +for samples-only records such as `Forward+Backward`, which carry no throughput). -By default each metric is measured over 15 repetitions (one sample per -repetition). Use `--repetitions N` to change this; `--repetitions 1` reproduces -the original single-measurement behavior. Statistical comparison (see below) -needs at least ~10 samples, so keep the default (or higher) when producing -samples CSVs for `--stats`. +torch's autorange records only a few raw timing blocks per metric. To get +enough samples for statistical comparison, `--min-samples N` (default 12) +ensures at least `N` raw blocks are recorded, topping up any shortfall with +additional equal-sized blocks rather than re-averaging the whole measurement. +Statistical comparison (see below) needs at least ~10 samples, so keep the +default (or higher) when producing samples CSVs for `--stats`. 
## Shared configuration @@ -93,9 +96,9 @@ sides and compare the samples CSVs with `--stats`: pip install -r requirements.txt # benchstats (pulls rich, scipy, numpy) # baseline checkout -python benchmark_gemm.py --csv-samples baseline_samples.csv --repetitions 15 +python benchmark_gemm.py --csv-samples baseline_samples.csv --min-samples 12 # candidate checkout -python benchmark_gemm.py --csv-samples candidate_samples.csv --repetitions 15 +python benchmark_gemm.py --csv-samples candidate_samples.csv --min-samples 12 python compare_results.py baseline_samples.csv candidate_samples.csv --stats ``` @@ -108,3 +111,9 @@ different (`~`), and exits `1` when a significant difference in the timing metric is found, so it can gate CI. Use `--export-to report.svg` (or `.html`, `.txt`) to save the report, and `--always-show-pvalues` to show p-values for non-significant rows. + +Time (the main metric, which drives the exit code) is shown alongside the +calculated throughput/bandwidth (`TFLOPS` or `GB/s`), reported as a secondary +metric. Because the table requires a uniform metric set, samples-only composite +rows that carry no throughput (e.g. `Forward+Backward`) are omitted from the +comparison; their raw samples remain in the CSV. diff --git a/benchmarks/microbenchmarks/compare_results.py b/benchmarks/microbenchmarks/compare_results.py index 540a9e433..1c231af86 100755 --- a/benchmarks/microbenchmarks/compare_results.py +++ b/benchmarks/microbenchmarks/compare_results.py @@ -62,8 +62,12 @@ def print_key_table(title, rows_df, key_cols): def run_stats(args): """Compare two samples CSVs with a statistical test via benchstats. + Timing (``time_s``, derived from the CSV's ``time_ms`` column) is the main + metric and drives the exit code; throughput / bandwidth (``TFLOPS`` / + ``GB/s``) is reported as a secondary metric when present in the CSV. + Returns a process exit code: 1 if a significant difference is found in the - main metric (timing), else 0. + main (timing) metric, else 0.
""" import os @@ -73,7 +77,7 @@ def run_stats(args): from benchstats.render import renderComparisonResults from benchstats.common import LoggingConsole, detectExportFormat - metrics = ["time_ms"] + main_metrics = ["time_s"] export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None if export_fmt is not None and os.path.isfile(args.export_to): @@ -84,20 +88,27 @@ def run_stats(args): log_level=LoggingConsole.LogLevel.Warning, ) - s1 = parser_TEsamples(args.baseline_csv, None, metrics, debug_log=console).getStats() - s2 = parser_TEsamples(args.candidate_csv, None, metrics, debug_log=console).getStats() + # metrics=None exposes every metric the CSV carries (time + throughput). + s1 = parser_TEsamples(args.baseline_csv, None, None, debug_log=console).getStats() + s2 = parser_TEsamples(args.candidate_csv, None, None, debug_log=console).getStats() cr = compareStats( s1, s2, method=args.method, alpha=args.alpha, - main_metrics=metrics, + main_metrics=main_metrics, debug_log=console, ) + # Throughput metrics (e.g. TFLOPS / GB/s) are not times; blank benchstats' + # default per-value "s" suffix for them (the column header names the unit). + secondary = [m for m in cr.getMetrics() if m not in main_metrics] + style_overrides = {f"metric_{m}_unit": "" for m in secondary} + renderComparisonResults( cr, console, - main_metrics=metrics, + main_metrics=main_metrics, + style_overrides=style_overrides or None, always_show_pvalues=args.always_show_pvalues, ) diff --git a/benchmarks/microbenchmarks/parser_TEsamples.py b/benchmarks/microbenchmarks/parser_TEsamples.py index dc6a07c0e..44d472158 100644 --- a/benchmarks/microbenchmarks/parser_TEsamples.py +++ b/benchmarks/microbenchmarks/parser_TEsamples.py @@ -6,15 +6,27 @@ ############################################################################### """benchstats parser for Transformer Engine microbenchmark samples CSVs. -Reads the per-sample CSV produced by ``run_benchmarks(... 
)`` with the -``--csv-samples`` flag (columns: the benchmark parameter columns plus -``label``, ``sample_idx``, ``time_ms``) and turns it into the -``{benchmark_name: {metric: ndarray}}`` structure consumed by -``benchstats.compare.compareStats``. +Reads the per-sample CSV produced by ``run_benchmarks(...)`` with the +``--csv-samples`` flag and turns it into the ``{benchmark_name: {metric: ndarray}}`` +structure consumed by ``benchstats.compare.compareStats``. -A benchmark name is built by joining every column except ``sample_idx`` and the -metric column, so each unique (parameters, label) combination becomes one -benchmark. The single available metric is ``time_ms`` (lower is better). +Columns of the samples CSV: the benchmark parameter columns, plus ``label``, +``sample_idx``, ``time_ms``, ``throughput`` and ``unit``. A benchmark name is +built by joining every parameter column and ``label``, so each unique +(parameters, label) combination becomes one benchmark. + +Two metrics are exposed: + +- ``time_s`` (seconds, lower is better) -- always present; intended as the + *main* metric. Exposed in seconds because benchstats' renderer auto-scales + time values (to ms/us/ns) assuming a seconds base unit. +- the throughput metric, keyed by its unit (e.g. ``TFLOPS`` or ``GB/s``; higher + is better) -- present when the CSV carries throughput values. + +``benchstats``' renderer requires every benchmark to expose the same metric set. +Records without throughput (the samples-only ``Forward+Backward`` composites) are +therefore dropped from the comparison when throughput is available for the other +benchmarks; their raw samples remain in the CSV for other downstream analysis. The class name matches the file name (``parser_TEsamples``) so it can also be loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``. 
@@ -27,17 +39,22 @@ from benchstats.common import ParserBase, LoggingConsole -_METRIC = "time_ms" -_NON_NAME_COLS = ("sample_idx", _METRIC) +_TIME_COL = "time_ms" # column name in the samples CSV (milliseconds) +_TIME_KEY = "time_s" # metric key exposed to benchstats (seconds) +_THR_COL = "throughput" +_UNIT_COL = "unit" +_GENERIC_THR = "throughput" +_NON_NAME_COLS = ("sample_idx", _TIME_COL, _THR_COL, _UNIT_COL) _NAME_DELIM = " | " class parser_TEsamples(ParserBase): - def __init__(self, csv_file_path, filter, metrics, debug_log=True) -> None: + def __init__(self, csv_file_path, filter, metrics=None, debug_log=True) -> None: assert isinstance(csv_file_path, str) assert filter is None or isinstance(filter, (str, re.Pattern)) - assert isinstance(metrics, (list, tuple)) and len(metrics) > 0 - assert all(isinstance(m, str) for m in metrics) + assert metrics is None or ( + isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics) + ) if debug_log is None or (isinstance(debug_log, bool) and not debug_log): self.debug_log = False @@ -48,37 +65,40 @@ def __init__(self, csv_file_path, filter, metrics, debug_log=True) -> None: self.debug_log = True self.logger = debug_log - unsupported = [m for m in metrics if m != _METRIC] - if unsupported: - raise ValueError( - f"parser_TEsamples only supports the '{_METRIC}' metric, got: {unsupported}. " - "The samples CSV produced by --csv-samples carries per-run times only." 
- ) - self.file = csv_file_path - self.filter = filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter) + self.filter = ( + filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter) + ) + self._requested_metrics = list(metrics) if metrics is not None else None self._stats = self._build() def getStats(self) -> dict[str, dict[str, np.ndarray]]: return self._stats + def _log(self, level, msg): + if self.debug_log: + getattr(self.logger, level)(f"parser_TEsamples: {msg}") + def _build(self) -> dict[str, dict[str, np.ndarray]]: df = pd.read_csv(self.file) - for col in _NON_NAME_COLS: - if col not in df.columns: - raise ValueError( - f"'{col}' column not found in '{self.file}'. Was the CSV written with " - "--csv-samples?" - ) + if _TIME_COL not in df.columns or "sample_idx" not in df.columns: + raise ValueError( + f"'{self.file}' is missing 'time_ms'/'sample_idx' columns. " + "Was it written with --csv-samples?" + ) name_cols = [c for c in df.columns if c not in _NON_NAME_COLS] if not name_cols: raise ValueError(f"No benchmark-name columns found in '{self.file}'.") - df[_METRIC] = pd.to_numeric(df[_METRIC], errors="coerce") + df[_TIME_COL] = pd.to_numeric(df[_TIME_COL], errors="coerce") + has_thr_col = _THR_COL in df.columns + if has_thr_col: + df[_THR_COL] = pd.to_numeric(df[_THR_COL], errors="coerce") - stats: dict[str, dict[str, np.ndarray]] = {} + # First pass: collect per-benchmark time samples, throughput samples and unit. 
+ per_bm = {} # name -> {"time": ndarray, "thr": ndarray|None, "unit": str|None} for key_vals, group in df.groupby(name_cols, sort=False): if not isinstance(key_vals, tuple): key_vals = (key_vals,) @@ -87,21 +107,77 @@ def _build(self) -> dict[str, dict[str, np.ndarray]]: if self.filter is not None and self.filter.search(bm_name) is None: continue - samples = group[_METRIC].to_numpy(dtype=np.float64) - samples = samples[np.isfinite(samples)] - if samples.size == 0: - if self.debug_log: - self.logger.warning( - f"parser_TEsamples: benchmark '{bm_name}' has no finite samples; skipping." - ) + time_ms = group[_TIME_COL].to_numpy(dtype=np.float64) + time_ms = time_ms[np.isfinite(time_ms)] + if time_ms.size == 0: + self._log("warning", f"benchmark '{bm_name}' has no finite time samples; skipping.") continue - if self.debug_log and samples.size < 10: - self.logger.warning( - f"parser_TEsamples: benchmark '{bm_name}' has only {samples.size} samples " - "(>= 10 recommended). Re-run the benchmark with a larger --repetitions." + # benchstats' renderer auto-scales assuming seconds, so expose seconds. 
+ time_s = time_ms / 1e3 + + thr_s, unit = None, None + if has_thr_col: + thr_s = group[_THR_COL].to_numpy(dtype=np.float64) + thr_s = thr_s[np.isfinite(thr_s) & (thr_s > 0)] + if thr_s.size == 0: + thr_s = None + else: + units = [u for u in group[_UNIT_COL].astype(str).unique() + if u and u.lower() != "nan"] if _UNIT_COL in df.columns else [] + unit = units[0] if len(units) == 1 else (_GENERIC_THR if units else None) + + if self.debug_log and time_s.size < 10: + self._log( + "warning", + f"benchmark '{bm_name}' has only {time_s.size} samples (>= 10 recommended); " + "re-run with a larger --min-samples.", ) - stats[bm_name] = {_METRIC: samples} - - if not stats and self.debug_log: - self.logger.warning(f"parser_TEsamples: no benchmarks read from '{self.file}'.") + per_bm[bm_name] = {"time": time_s, "thr": thr_s, "unit": unit} + + if not per_bm: + self._log("warning", f"no benchmarks read from '{self.file}'.") + return {} + + # Decide on a uniform metric set across all benchmarks. + with_thr = {n: d for n, d in per_bm.items() if d["thr"] is not None} + thr_key = self._resolve_throughput_key(with_thr) + + if thr_key is not None and 0 < len(with_thr) < len(per_bm): + dropped = sorted(set(per_bm) - set(with_thr)) + self._log( + "warning", + f"excluding {len(dropped)} benchmark(s) without throughput from the comparison so " + f"throughput can be shown uniformly: {', '.join(dropped)}", + ) + per_bm = with_thr + + # Build the result, honoring an explicit metric request if given. 
+ stats = {} + for bm_name, d in per_bm.items(): + entry = {} + if self._metric_requested(_TIME_KEY, thr_key): + entry[_TIME_KEY] = d["time"] + if thr_key is not None and d["thr"] is not None and self._metric_requested(thr_key, thr_key): + entry[thr_key] = d["thr"] + if entry: + stats[bm_name] = entry return stats + + def _resolve_throughput_key(self, with_thr): + """Return a single throughput metric key shared by all throughput-bearing benchmarks.""" + if not with_thr: + return None + units = {d["unit"] for d in with_thr.values()} + if len(units) == 1: + return next(iter(units)) or _GENERIC_THR + return _GENERIC_THR # mixed units in one file (atypical) -> generic header + + def _metric_requested(self, key, thr_key): + """Honor an explicit metrics= request (benchstats CLI), else expose everything.""" + if self._requested_metrics is None: + return True + req = self._requested_metrics + if key == _TIME_KEY: + return any(t in req for t in (_TIME_KEY, _TIME_COL, "time")) + # throughput: match the unit key, the generic name, or literal 'throughput' + return key in req or _GENERIC_THR in req or _THR_COL in req diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 825a08924..929019ee8 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -21,11 +21,11 @@ DEFAULT_MIN_RUN_TIME_SECONDS = 0.2 -# Number of repetitions used by ``time_func`` when a caller passes -# ``repetitions=None``. ``run_benchmarks`` sets this from the ``--repetitions`` -# CLI flag so every benchmark script inherits the knob without per-script edits. -# ``None`` (the default) preserves the original single-measurement behavior. -_ACTIVE_REPETITIONS = None +# Minimum number of raw timing samples (blocks) ``time_func`` ensures when a +# caller passes ``min_samples=None``. ``run_benchmarks`` sets this from the +# ``--min-samples`` CLI flag so every benchmark script inherits the knob without +# per-script edits. 
``None`` leaves torch's autorange result untouched. +_ACTIVE_MIN_SAMPLES = None # --------------------------------------------------------------------------- # Model configurations @@ -94,54 +94,60 @@ def generate_gemm_test_cases(configs=None, m_sizes=None, dtypes=None): # Timing helpers # --------------------------------------------------------------------------- -class _RepeatedMeasurement: - """Minimal ``torch...benchmark.Measurement`` stand-in for repeated runs. +class _RawSamples: + """Minimal ``torch...benchmark.Measurement`` stand-in holding raw block times. - Exposes the ``times`` / ``number_per_run`` / ``mean`` attributes the - samples-CSV writer relies on. ``times`` holds one per-run mean (in seconds) - for each repetition, so ``--csv-samples`` emits one row per repetition. + Exposes the ``times`` (per-run seconds, one entry per recorded timing block), + ``number_per_run`` and ``mean`` attributes that ``time_func`` callers and the + samples-CSV writer rely on. """ - def __init__(self, times, mean): + def __init__(self, times, number_per_run, mean): self.times = times - self.number_per_run = 1 + self.number_per_run = number_per_run self.mean = mean def time_func(fn, method="adaptive", min_run_time=DEFAULT_MIN_RUN_TIME_SECONDS, - repetitions=None): + min_samples=None): """Time *fn* and return ``(mean_ms, measurement)``. - The returned measurement carries per-sample times accessible via - ``measurement.times`` and ``measurement.number_per_run``. + The returned measurement exposes per-run sample times via + ``measurement.times`` -- one entry per recorded timing block (each block is + an average over ``measurement.number_per_run`` executions, as chosen by + torch to amortize timer overhead). method: "adaptive" uses adaptive_autorange (good for compute-bound), "blocked" uses blocked_autorange (good for memory-bound). - repetitions: number of independent autorange measurements to collect. 
Each - repetition contributes one per-run mean to ``measurement.times`` (the - form expected by statistical comparison via ``compare_results.py - --stats``). ``None`` falls back to the module-level ``_ACTIVE_REPETITIONS`` - (set from ``--repetitions``); a value <= 1 reproduces the original - single-measurement behavior and returns the native ``Measurement``. + min_samples: ensure at least this many raw timing blocks are recorded, so the + per-sample data is large enough for statistical comparison + (compare_results.py --stats). torch's autorange usually records only a + few blocks; any shortfall is topped up with additional equal-sized blocks + rather than re-running and re-averaging the whole measurement. ``None`` + falls back to the module-level ``_ACTIVE_MIN_SAMPLES`` (set from + ``--min-samples``); ``None`` there too leaves the autorange result as-is. """ - if repetitions is None: - repetitions = _ACTIVE_REPETITIONS + if min_samples is None: + min_samples = _ACTIVE_MIN_SAMPLES timer = benchmark.Timer(stmt="fn()", globals={"fn": fn}) + if method == "blocked": + m = timer.blocked_autorange(min_run_time=min_run_time) + else: + m = timer.adaptive_autorange(min_run_time=min_run_time) - def _measure(): - if method == "blocked": - return timer.blocked_autorange(min_run_time=min_run_time) - return timer.adaptive_autorange(min_run_time=min_run_time) - - if repetitions is None or repetitions <= 1: - m = _measure() + if min_samples is None or len(m.times) >= min_samples: return m.mean * 1e3, m - means = [_measure().mean for _ in range(repetitions)] # per-run seconds - mean_s = sum(means) / len(means) - return mean_s * 1e3, _RepeatedMeasurement(times=means, mean=mean_s) + # Top up with additional equal-sized blocks (each timeit() records one block + # averaged over number_per_run runs) until enough raw samples are collected. 
+ times = list(m.times) # per-run seconds + number = m.number_per_run + while len(times) < min_samples: + times.append(timer.timeit(number).mean) + mean_s = sum(times) / len(times) + return mean_s * 1e3, _RawSamples(times=times, number_per_run=number, mean=mean_s) # --------------------------------------------------------------------------- @@ -305,12 +311,13 @@ def make_parser(**kwargs): ), ) parser.add_argument( - "--repetitions", type=int, default=15, metavar="N", + "--min-samples", type=int, default=12, metavar="N", help=( - "Number of independent timing repetitions per metric. Each " - "repetition yields one sample for statistical comparison " - "(compare_results.py --stats). Use 1 for the original " - "single-measurement behavior. Default: 15." + "Ensure at least N raw timing samples (blocks) are recorded per " + "metric for statistical comparison (compare_results.py --stats). " + "torch's autorange records only a few; any shortfall is topped up " + "with additional equal-sized blocks. Use a small value (e.g. 2) to " + "effectively disable top-up. Default: 12." ), ) return parser @@ -348,10 +355,10 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None, if args is None: args = make_parser().parse_args() - # Let time_func (called by bench_fns without an explicit repetitions arg) + # Let time_func (called by bench_fns without an explicit min_samples arg) # inherit the CLI value without editing every benchmark script. 
- global _ACTIVE_REPETITIONS - _ACTIVE_REPETITIONS = getattr(args, "repetitions", None) + global _ACTIVE_MIN_SAMPLES + _ACTIVE_MIN_SAMPLES = getattr(args, "min_samples", None) rows = [] all_case_metrics = [] @@ -407,16 +414,36 @@ def run_benchmarks(test_cases, bench_fn, param_columns, default_csv=None, if measurement is None: continue lbl = metric["label"] + unit = metric.get("unit") + thr_mean = metric.get("throughput") or 0.0 + ms_mean = metric.get("ms") or 0.0 + # Throughput is a deterministic function of time for a given + # config (throughput = C / time), so a per-sample throughput is + # recovered from the aggregate as thr_mean * ms_mean / sample_ms. + # samples_only records (e.g. Forward+Backward) carry no + # throughput and are left blank. + has_thr = ( + not metric.get("samples_only") and thr_mean > 0 and ms_mean > 0 + ) for i, t in enumerate(measurement.times): + # measurement.times entries are already per-run (seconds). + sample_ms = t * 1e3 sr = dict(case_params) sr["label"] = lbl sr["sample_idx"] = i - sr["time_ms"] = t / measurement.number_per_run * 1e3 + sr["time_ms"] = sample_ms + sr["throughput"] = ( + thr_mean * ms_mean / sample_ms + if has_thr and sample_ms > 0 + else "" + ) + sr["unit"] = unit if has_thr else "" sample_rows.append(sr) if sample_rows: df = pd.DataFrame( sample_rows, - columns=param_columns + ["label", "sample_idx", "time_ms"], + columns=param_columns + + ["label", "sample_idx", "time_ms", "throughput", "unit"], ) df.to_csv(samples_csv, index=False) print(f"Samples saved to {samples_csv}")