Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions docs/user-tutorial/benchmarks/model-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,33 @@ For inference, supported percentiles include

**New: Support fp8_hybrid and fp8_e4m3 precision for BERT models.**

**New: Deterministic Training Support**
SuperBench now supports deterministic training to ensure reproducibility across runs. This includes fixed seeds and deterministic algorithms. To enable deterministic training, use the following flags:

- **Flags:**
- `--enable_determinism`: Enables deterministic computation for reproducible results.
- `--deterministic_seed <seed>`: Sets the seed for reproducibility (default: 42).
- `--check_frequency <steps>`: How often to record deterministic metrics (default: 100).

- **Environment Variables (set automatically by SuperBench when `--enable_determinism` is used):**
- `CUBLAS_WORKSPACE_CONFIG=:4096:8`: Ensures deterministic behavior in cuBLAS. This can be overridden by setting it manually before running SuperBench.

**Comparing Deterministic Results**

To compare deterministic results between runs, use the standard result analysis workflow:

1. Run benchmark with `--enable_determinism` flag
2. Generate baseline: `sb result generate-baseline --data-file results.jsonl --summary-rule-file rules.yaml`
3. Compare future runs: `sb result diagnosis --data-file new-results.jsonl --rule-file rules.yaml --baseline-file baseline.json`

This allows configurable tolerance for floating-point differences via YAML rules.

**Configuration Parameter Validation**

When determinism is enabled, benchmark configuration parameters (batch_size, num_steps, deterministic_seed, etc.) are automatically recorded in the results file as `deterministic_config_*` metrics. The diagnosis rules enforce exact matching of these parameters between runs to ensure valid comparisons.

If any configuration parameter differs between runs, the diagnosis will flag it as a failure, ensuring you only compare runs with identical configurations.

#### Metrics

| Name | Unit | Description |
Expand Down
150 changes: 150 additions & 0 deletions examples/benchmarks/pytorch_deterministic_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Unified PyTorch deterministic training example for all supported models.

Deterministic metrics (loss, activation mean) are automatically stored in results
when --enable_determinism flag is enabled.

To compare deterministic results between runs, use the `sb result diagnosis` command
with a baseline file and comparison rules. See the SuperBench documentation for details.

Example workflow:
1. Run first benchmark (creates outputs/<timestamp>/results-summary.jsonl):
python3 examples/benchmarks/pytorch_deterministic_example.py \
--model resnet101 --enable_determinism --deterministic_seed 42

2. Generate baseline from results:
sb result generate-baseline --data-file outputs/<timestamp>/results-summary.jsonl \
--summary-rule-file summary-rules.yaml --output-dir outputs/<timestamp>

3. Run second benchmark:
python3 examples/benchmarks/pytorch_deterministic_example.py \
--model resnet101 --enable_determinism --deterministic_seed 42

4. Compare runs with diagnosis:
sb result diagnosis --data-file outputs/<run2-timestamp>/results-summary.jsonl \
--rule-file rules.yaml --baseline-file outputs/<run1-timestamp>/baseline.json

Note: CUBLAS_WORKSPACE_CONFIG is now automatically set by the code when determinism is enabled.
"""

import argparse
import json
import socket
from datetime import datetime
from pathlib import Path
from superbench.benchmarks import BenchmarkRegistry, Framework
from superbench.common.utils import logger

# Model names accepted by the --model argument; each must correspond to a
# benchmark registered with BenchmarkRegistry under the PyTorch framework.
MODEL_CHOICES = [
'bert-large',
'gpt2-small',
'llama2-7b',
'mixtral-8x7b',
'resnet101',
'lstm',
]

# Per-model default benchmark parameters, passed through to the benchmark's
# own argument parser. --check_frequency controls how often deterministic
# metrics (fingerprints) are recorded during training.
DEFAULT_PARAMS = {
'bert-large':
'--batch_size 1 --seq_len 64 --num_warmup 1 --num_steps 200 --precision float32 '
'--model_action train --check_frequency 20',
'gpt2-small':
'--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 128 --precision float32 '
'--model_action train --check_frequency 20',
'llama2-7b':
'--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 512 --precision float32 --model_action train '
'--check_frequency 20',
'mixtral-8x7b':
'--hidden_size 4096 --num_hidden_layers 32 --num_attention_heads 32 --intermediate_size 14336 '
'--num_key_value_heads 8 --max_position_embeddings 32768 --router_aux_loss_coef 0.02 '
'--check_frequency 20',
'resnet101':
'--batch_size 1 --precision float32 --num_warmup 1 --num_steps 120 --sample_count 8192 '
'--pin_memory --model_action train --check_frequency 20',
'lstm':
'--batch_size 1 --num_steps 100 --num_warmup 2 --seq_len 64 --precision float32 '
'--model_action train --check_frequency 30',
}


def main():
    """Run a deterministic-training example benchmark and export its results.

    Parses the CLI arguments, launches the selected PyTorch model benchmark
    through the SuperBench registry, then writes the results into a
    timestamped ``outputs/<timestamp>/`` directory in two forms:
    a flattened ``results-summary.jsonl`` (consumable by ``sb result``
    commands) and the full JSON result for reference.
    """
    parser = argparse.ArgumentParser(description='Unified PyTorch deterministic training example.')
    parser.add_argument('--model', type=str, choices=MODEL_CHOICES, required=True, help='Model to run.')
    parser.add_argument(
        '--enable_determinism',
        action='store_true',
        help='Enable deterministic mode for reproducible results.',
    )
    parser.add_argument(
        '--deterministic_seed',
        type=int,
        default=None,
        help='Seed for deterministic training.',
    )
    args = parser.parse_args()

    # Start from the per-model defaults and append the determinism flags.
    parameters = DEFAULT_PARAMS[args.model]
    if args.enable_determinism:
        parameters += ' --enable_determinism'
    if args.deterministic_seed is not None:
        parameters += f' --deterministic_seed {args.deterministic_seed}'

    context = BenchmarkRegistry.create_benchmark_context(args.model, parameters=parameters, framework=Framework.PYTORCH)
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    # launch_benchmark may return None for an invalid context; fail early with
    # a clear message instead of raising AttributeError below.
    if benchmark is None:
        logger.error(f'Failed to launch benchmark for model: {args.model}')
        return
    logger.info(f'Benchmark finished. Return code: {benchmark.return_code}')

    # Create timestamped output directory.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_dir = Path('outputs') / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse benchmark results.
    benchmark_results = json.loads(benchmark.serialized_result)
    benchmark_name = benchmark_results.get('name', f'pytorch-{args.model}')

    # Convert to results-summary.jsonl format (flattened keys).
    # Use format compatible with sb result commands: model-benchmarks:<category>/<benchmark>/<metric>
    summary = {}
    prefix = f'model-benchmarks:example:determinism/{benchmark_name}'
    if 'result' in benchmark_results:
        for metric, values in benchmark_results['result'].items():
            # Use first value if it's a list.
            val = values[0] if isinstance(values, list) else values
            # Add _rank0 suffix to deterministic metrics for compatibility with rules.
            if metric.startswith('deterministic_'):
                metric_key = f'{prefix}/{metric}_rank0'
            else:
                metric_key = f'{prefix}/{metric}'
            summary[metric_key] = val

    # Add node identifier.
    summary['node'] = socket.gethostname()

    # Write results-summary.jsonl — JSON Lines requires each record to be
    # newline-terminated so downstream line-oriented readers parse it.
    summary_file = output_dir / 'results-summary.jsonl'
    with open(summary_file, 'w') as f:
        f.write(json.dumps(summary) + '\n')
    logger.info(f'Results saved to {summary_file}')

    # Also save full results for reference.
    full_results_file = output_dir / 'results-full.json'
    with open(full_results_file, 'w') as f:
        json.dump(benchmark_results, f, indent=2)

    if 'raw_data' in benchmark_results and 'deterministic_loss' in benchmark_results['raw_data']:
        num_checkpoints = len(benchmark_results['raw_data']['deterministic_loss'][0])
        logger.info(f'Periodic fingerprints collected at {num_checkpoints} checkpoints')

    logger.info(
        f'To generate baseline: sb result generate-baseline '
        f'--data-file {summary_file} --summary-rule-file summary-rules.yaml '
        f'--output-dir {output_dir}'
    )
    logger.info('To compare results between runs, use `sb result diagnosis` command.')


if __name__ == '__main__':
    main()
41 changes: 29 additions & 12 deletions superbench/analyzer/baseline_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,33 @@ def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
aggregated_df[metrics[index]] = out[1]
return baseline

def _format_metric_value(self, metric, val, digit):
"""Format a single baseline metric value based on its type.

Args:
metric (str): the metric name.
val: the metric value.
digit (int): the number of digits after the decimal point.

Returns:
The formatted metric value.
"""
if metric not in self._raw_data_df:
return val
sample = self._raw_data_df[metric].iloc[0]
if isinstance(sample, float):
# Keep full precision for deterministic metrics to avoid false positives in diagnosis
if 'deterministic' in metric:
return float(val)
return f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
if isinstance(sample, int):
return int(val)
try:
return float(val)
except Exception as e:
logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
return val

def run(
self, raw_data_file, summary_rule_file, diagnosis_rule_file, pre_baseline_file, algorithm, output_dir, digit=2
):
Expand All @@ -174,19 +201,9 @@ def run(
# generate baseline according to rules in diagnosis and fix threshold outlier detection method
baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline)
for metric in baseline:
val = baseline[metric]
if metric in self._raw_data_df:
if isinstance(self._raw_data_df[metric].iloc[0], float):
baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
elif isinstance(self._raw_data_df[metric].iloc[0], int):
baseline[metric] = int(val)
else:
try:
baseline[metric] = float(val)
except Exception as e:
logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
baseline[metric] = self._format_metric_value(metric, baseline[metric], digit)
baseline = json.dumps(baseline, indent=2, sort_keys=True)
baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline)
baseline = re.sub(r': \"(-?\d+\.?\d*)\"', r': \1', baseline)
with (Path(output_dir) / 'baseline.json').open('w') as f:
f.write(baseline)

Expand Down
5 changes: 4 additions & 1 deletion superbench/analyzer/data_diagnosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,10 @@ def output_all_nodes_results(self, raw_data_df, data_not_accept_df):
'Category','Defective Details']
"""
append_columns = ['Accept', 'Number Of Issues', 'Category', 'Defective Details']
all_data_df = (raw_data_df).astype('float64')
# Preserve all columns, but only convert numeric columns to float64
all_data_df = raw_data_df.copy()
numeric_cols = all_data_df.select_dtypes(include=['number']).columns
all_data_df[numeric_cols] = all_data_df[numeric_cols].astype('float64')

if data_not_accept_df.shape[0] == 0:
all_data_df['Accept'] = [True for i in range(len(all_data_df))]
Expand Down
19 changes: 15 additions & 4 deletions superbench/benchmarks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,25 @@ def parse_args(self, ignore_invalid=False):
logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
return False, None, []

ret = True
ret = self._check_unknown_args(unknown)

return ret, args, unknown

def _check_unknown_args(self, unknown):
"""Check for unknown arguments and log an error if any are found.

Args:
unknown (list): List of unknown arguments.

Returns:
bool: False if unknown arguments are found, True otherwise.
"""
if len(unknown) > 0:
logger.error(
'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
)
ret = False

return ret, args, unknown
return False
return True

def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Expand Down
17 changes: 17 additions & 0 deletions superbench/benchmarks/model_benchmarks/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,17 @@ def _generate_dataset(self):
"""
pass

def set_deterministic_seed(self):
    """Hook invoked before dataset generation to seed deterministic RNG state.

    The base implementation is a no-op. Framework-specific subclasses may
    override it to apply their deterministic RNG configuration (for example,
    PyTorch benchmarks perform their deterministic setup here when requested).
    Called from _preprocess() before _generate_dataset().
    """
    return None

@abstractmethod
def _init_dataloader(self):
"""Initialize the dataloader.
Expand Down Expand Up @@ -221,6 +232,12 @@ def _preprocess(self):
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
return False

# Invoke model-specific deterministic seeding hook before dataset generation
try:
self.set_deterministic_seed()
except Exception:
logger.info('set_deterministic_seed() hook failed or not implemented for model: %s', self._name)

# Set sample_count aligned with batch_size.
self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size

Expand Down
Loading
Loading