Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions docs/user-tutorial/benchmarks/model-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,33 @@ For inference, supported percentiles include

**New: Support fp8_hybrid and fp8_e4m3 precision for BERT models.**

**New: Deterministic Training Support**
SuperBench now supports deterministic training to ensure reproducibility across runs. This includes fixed seeds and deterministic algorithms. To enable deterministic training, use the following flags:

- **Flags:**
- `--enable_determinism`: Enables deterministic computation for reproducible results.
- `--deterministic_seed <seed>`: Sets the seed for reproducibility (default: 42).
- `--check_frequency <steps>`: How often to record deterministic metrics (default: 100).

- **Environment Variables (set automatically by SuperBench when `--enable_determinism` is used):**
- `CUBLAS_WORKSPACE_CONFIG=:4096:8`: Ensures deterministic behavior in cuBLAS. This can be overridden by setting it manually before running SuperBench.

**Comparing Deterministic Results**

To compare deterministic results between runs, use the standard result analysis workflow:

1. Run benchmark with `--enable_determinism` flag
2. Generate baseline: `sb result generate-baseline --data-file results.jsonl --summary-rule-file rules.yaml`
3. Compare future runs: `sb result diagnosis --data-file new-results.jsonl --rule-file rules.yaml --baseline-file baseline.json`

This allows configurable tolerance for floating-point differences via YAML rules.

**Configuration Parameter Validation**

When determinism is enabled, benchmark configuration parameters (batch_size, num_steps, deterministic_seed, etc.) are automatically recorded in the results file as `deterministic_config_*` metrics. The diagnosis rules enforce exact matching of these parameters between runs to ensure valid comparisons.

If any configuration parameter differs between runs, the diagnosis will flag it as a failure, ensuring you only compare runs with identical configurations.

#### Metrics

| Name | Unit | Description |
Expand Down
150 changes: 150 additions & 0 deletions examples/benchmarks/pytorch_deterministic_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Unified PyTorch deterministic training example for all supported models.

Deterministic metrics (loss, activation mean) are automatically stored in results
when --enable_determinism flag is enabled.

To compare deterministic results between runs, use the `sb result diagnosis` command
with a baseline file and comparison rules. See the SuperBench documentation for details.

Example workflow:
1. Run first benchmark (creates outputs/<timestamp>/results-summary.jsonl):
python3 examples/benchmarks/pytorch_deterministic_example.py \
--model resnet101 --enable_determinism --deterministic_seed 42

2. Generate baseline from results:
sb result generate-baseline --data-file outputs/<timestamp>/results-summary.jsonl \
--summary-rule-file summary-rules.yaml --output-dir outputs/<timestamp>

3. Run second benchmark:
python3 examples/benchmarks/pytorch_deterministic_example.py \
--model resnet101 --enable_determinism --deterministic_seed 42

4. Compare runs with diagnosis:
sb result diagnosis --data-file outputs/<run2-timestamp>/results-summary.jsonl \
--rule-file rules.yaml --baseline-file outputs/<run1-timestamp>/baseline.json

Note: CUBLAS_WORKSPACE_CONFIG is now automatically set by the code when determinism is enabled.
"""

import argparse
import json
import socket
from datetime import datetime
from pathlib import Path
from superbench.benchmarks import BenchmarkRegistry, Framework
from superbench.common.utils import logger

# Model names accepted by the --model argument; each must correspond to a
# benchmark registered with BenchmarkRegistry under the PyTorch framework.
MODEL_CHOICES = [
'bert-large',
'gpt2-small',
'llama2-7b',
'mixtral-8x7b',
'resnet101',
'lstm',
]

# Per-model default benchmark parameters, passed through to the benchmark's
# own argument parser. --check_frequency controls how often deterministic
# metrics (fingerprints) are recorded during training.
DEFAULT_PARAMS = {
'bert-large':
'--batch_size 1 --seq_len 64 --num_warmup 1 --num_steps 200 --precision float32 '
'--model_action train --check_frequency 20',
'gpt2-small':
'--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 128 --precision float32 '
'--model_action train --check_frequency 20',
'llama2-7b':
'--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 512 --precision float32 --model_action train '
'--check_frequency 20',
'mixtral-8x7b':
'--hidden_size 4096 --num_hidden_layers 32 --num_attention_heads 32 --intermediate_size 14336 '
'--num_key_value_heads 8 --max_position_embeddings 32768 --router_aux_loss_coef 0.02 '
'--check_frequency 20',
'resnet101':
'--batch_size 1 --precision float32 --num_warmup 1 --num_steps 120 --sample_count 8192 '
'--pin_memory --model_action train --check_frequency 20',
'lstm':
'--batch_size 1 --num_steps 100 --num_warmup 2 --seq_len 64 --precision float32 '
'--model_action train --check_frequency 30',
}


def main():
    """Run a deterministic-training example benchmark and export its results.

    Parses the CLI arguments, launches the selected PyTorch model benchmark
    through the SuperBench registry, then writes the results into a
    timestamped ``outputs/<timestamp>/`` directory in two forms:
    a flattened ``results-summary.jsonl`` (consumable by ``sb result``
    commands) and the full JSON result for reference.
    """
    parser = argparse.ArgumentParser(description='Unified PyTorch deterministic training example.')
    parser.add_argument('--model', type=str, choices=MODEL_CHOICES, required=True, help='Model to run.')
    parser.add_argument(
        '--enable_determinism',
        action='store_true',
        help='Enable deterministic mode for reproducible results.',
    )
    parser.add_argument(
        '--deterministic_seed',
        type=int,
        default=None,
        help='Seed for deterministic training.',
    )
    args = parser.parse_args()

    # Start from the per-model defaults and append the determinism flags.
    parameters = DEFAULT_PARAMS[args.model]
    if args.enable_determinism:
        parameters += ' --enable_determinism'
    if args.deterministic_seed is not None:
        parameters += f' --deterministic_seed {args.deterministic_seed}'

    context = BenchmarkRegistry.create_benchmark_context(args.model, parameters=parameters, framework=Framework.PYTORCH)
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    # launch_benchmark may return None for an invalid context; fail early with
    # a clear message instead of raising AttributeError below.
    if benchmark is None:
        logger.error(f'Failed to launch benchmark for model: {args.model}')
        return
    logger.info(f'Benchmark finished. Return code: {benchmark.return_code}')

    # Create timestamped output directory.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_dir = Path('outputs') / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse benchmark results.
    benchmark_results = json.loads(benchmark.serialized_result)
    benchmark_name = benchmark_results.get('name', f'pytorch-{args.model}')

    # Convert to results-summary.jsonl format (flattened keys).
    # Use format compatible with sb result commands: model-benchmarks:<category>/<benchmark>/<metric>
    summary = {}
    prefix = f'model-benchmarks:example:determinism/{benchmark_name}'
    if 'result' in benchmark_results:
        for metric, values in benchmark_results['result'].items():
            # Use first value if it's a list.
            val = values[0] if isinstance(values, list) else values
            # Add _rank0 suffix to deterministic metrics for compatibility with rules.
            if metric.startswith('deterministic_'):
                metric_key = f'{prefix}/{metric}_rank0'
            else:
                metric_key = f'{prefix}/{metric}'
            summary[metric_key] = val

    # Add node identifier.
    summary['node'] = socket.gethostname()

    # Write results-summary.jsonl — JSON Lines requires each record to be
    # newline-terminated so downstream line-oriented readers parse it.
    summary_file = output_dir / 'results-summary.jsonl'
    with open(summary_file, 'w') as f:
        f.write(json.dumps(summary) + '\n')
    logger.info(f'Results saved to {summary_file}')

    # Also save full results for reference.
    full_results_file = output_dir / 'results-full.json'
    with open(full_results_file, 'w') as f:
        json.dump(benchmark_results, f, indent=2)

    if 'raw_data' in benchmark_results and 'deterministic_loss' in benchmark_results['raw_data']:
        num_checkpoints = len(benchmark_results['raw_data']['deterministic_loss'][0])
        logger.info(f'Periodic fingerprints collected at {num_checkpoints} checkpoints')

    logger.info(
        f'To generate baseline: sb result generate-baseline '
        f'--data-file {summary_file} --summary-rule-file summary-rules.yaml '
        f'--output-dir {output_dir}'
    )
    logger.info('To compare results between runs, use `sb result diagnosis` command.')


if __name__ == '__main__':
    main()
41 changes: 29 additions & 12 deletions superbench/analyzer/baseline_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,33 @@ def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
aggregated_df[metrics[index]] = out[1]
return baseline

def _format_metric_value(self, metric, val, digit):
"""Format a single baseline metric value based on its type.

Args:
metric (str): the metric name.
val: the metric value.
digit (int): the number of digits after the decimal point.

Returns:
The formatted metric value.
"""
if metric not in self._raw_data_df:
return val
sample = self._raw_data_df[metric].iloc[0]
if isinstance(sample, float):
# Keep full precision for deterministic metrics to avoid false positives in diagnosis
if 'deterministic' in metric:
return float(val)
return f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
if isinstance(sample, int):
return int(val)
try:
return float(val)
except Exception as e:
logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
return val

def run(
self, raw_data_file, summary_rule_file, diagnosis_rule_file, pre_baseline_file, algorithm, output_dir, digit=2
):
Expand All @@ -174,19 +201,9 @@ def run(
# generate baseline according to rules in diagnosis and fix threshold outlier detection method
baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline)
for metric in baseline:
val = baseline[metric]
if metric in self._raw_data_df:
if isinstance(self._raw_data_df[metric].iloc[0], float):
baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
elif isinstance(self._raw_data_df[metric].iloc[0], int):
baseline[metric] = int(val)
else:
try:
baseline[metric] = float(val)
except Exception as e:
logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
baseline[metric] = self._format_metric_value(metric, baseline[metric], digit)
baseline = json.dumps(baseline, indent=2, sort_keys=True)
baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline)
baseline = re.sub(r': \"(-?\d+\.?\d*)\"', r': \1', baseline)
with (Path(output_dir) / 'baseline.json').open('w') as f:
f.write(baseline)

Expand Down
5 changes: 4 additions & 1 deletion superbench/analyzer/data_diagnosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,10 @@ def output_all_nodes_results(self, raw_data_df, data_not_accept_df):
'Category','Defective Details']
"""
append_columns = ['Accept', 'Number Of Issues', 'Category', 'Defective Details']
all_data_df = (raw_data_df).astype('float64')
# Preserve all columns, but only convert numeric columns to float64
all_data_df = raw_data_df.copy()
numeric_cols = all_data_df.select_dtypes(include=['number']).columns
all_data_df[numeric_cols] = all_data_df[numeric_cols].astype('float64')

if data_not_accept_df.shape[0] == 0:
all_data_df['Accept'] = [True for i in range(len(all_data_df))]
Expand Down
19 changes: 15 additions & 4 deletions superbench/benchmarks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,25 @@ def parse_args(self, ignore_invalid=False):
logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
return False, None, []

ret = True
ret = self._check_unknown_args(unknown)

return ret, args, unknown

def _check_unknown_args(self, unknown):
"""Check for unknown arguments and log an error if any are found.

Args:
unknown (list): List of unknown arguments.

Returns:
bool: False if unknown arguments are found, True otherwise.
"""
if len(unknown) > 0:
logger.error(
'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
)
ret = False

return ret, args, unknown
return False
return True

def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Expand Down
17 changes: 17 additions & 0 deletions superbench/benchmarks/model_benchmarks/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,17 @@ def _generate_dataset(self):
"""
pass

def set_deterministic_seed(self):
    """Hook invoked before dataset generation to seed deterministic RNG state.

    The base implementation is a no-op. Framework-specific subclasses may
    override it to apply their deterministic RNG configuration (for example,
    PyTorch benchmarks perform their deterministic setup here when requested).
    Called from _preprocess() before _generate_dataset().
    """
    return None

@abstractmethod
def _init_dataloader(self):
"""Initialize the dataloader.
Expand Down Expand Up @@ -221,6 +232,12 @@ def _preprocess(self):
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
return False

# Invoke model-specific deterministic seeding hook before dataset generation
try:
self.set_deterministic_seed()
except Exception:
logger.info('set_deterministic_seed() hook failed or not implemented for model: %s', self._name)

# Set sample_count aligned with batch_size.
self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size

Expand Down
Loading
Loading