From 4753e1a050677077ca2e80761a12d21bcc1c253d Mon Sep 17 00:00:00 2001 From: Patrick Miles Date: Fri, 5 Jun 2026 13:14:57 -0700 Subject: [PATCH 1/2] fix fractalgen bug: computing acceptance criteria variance across wrong axis --- ScaFFold/datagen/category_search.py | 65 ++++++++++++++++++----------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/ScaFFold/datagen/category_search.py b/ScaFFold/datagen/category_search.py index a7dbc7a..8b819e7 100644 --- a/ScaFFold/datagen/category_search.py +++ b/ScaFFold/datagen/category_search.py @@ -50,8 +50,8 @@ def generate_single_category(config: Config) -> tuple[bool, np.array, bool, bool A bool for whether a valid category was found on this attempt. params : np.array A numpy array containing IFS parameters for this category attempt, if attempt was valid. - (not nan_check_pass) : bool - A bool for whether this attempt passed the NaN check. + (not value_check_pass) : bool + A bool for whether this attempt passed the NaN/non-finite check. (not variance_check_pass) : bool A bool for whether this attempt passed the variance check. (not runaway_check_pass) : bool @@ -80,31 +80,40 @@ def generate_single_category(config: Config) -> tuple[bool, np.array, bool, bool ), ) - # Sum number of NaNs + # Sum number of NaNs and reject infinities before normalization. nan_count = np.isnan(points).sum() - nan_check_pass = nan_count == 0 + value_check_pass = nan_count == 0 and np.isfinite(points).all() variance_check_pass = False - if nan_check_pass: + if value_check_pass: # Normalize + center mins = points.min(axis=0) maxs = points.max(axis=0) means = points.mean(axis=0) - scales = (2 * config.normalize) / (maxs - mins) - points = (points - means) * scales - - # Calc dimension-wise variance and compare to threshold - points_variance = np.var(points, axis=1) - variance_check_pass = np.all(points_variance > config.variance_threshold) - if variance_check_pass and nan_check_pass and runaway_check_pass: + with np.errstate(over="ignore", invalid="ignore"): + ranges = maxs - mins + value_check_pass = np.all(np.isfinite(ranges)) and np.all(ranges > 0) + if value_check_pass: + scales = (2 * config.normalize) / ranges + with np.errstate(over="ignore", invalid="ignore"): + points = (points - means) * scales + + value_check_pass = np.isfinite(points).all() + if value_check_pass: + # Calc dimension-wise variance and compare to threshold + points_variance = np.var(points, axis=0) + variance_check_pass = np.all( + points_variance > config.variance_threshold + ) + if variance_check_pass and value_check_pass and runaway_check_pass: valid = True # Return result return ( valid, params, - not nan_check_pass, - not variance_check_pass, + bool(not value_check_pass), + bool(value_check_pass and not variance_check_pass), not runaway_check_pass, ) @@ -129,7 +138,7 @@ def generate_categories_batch( params : np.array A numpy array containing IFS parameters for this category attempt, if attempt was valid. failed_nan_check_count : int - The number of attempts in this batch which failed the nan check. + The number of attempts in this batch which failed the NaN/non-finite check. failed_var_check_count : int The number of attempts in this batch which failed the var check. runaway_failure_count : int @@ -186,7 +195,11 @@ def main(config: Config) -> None: rank = comm.Get_rank() size = comm.Get_size() - datagen_batch_size = 10000 + datagen_batch_size = int(getattr(config, "datagen_batch_size", 10000)) + if datagen_batch_size < 1: + raise ValueError( + f"datagen_batch_size must be positive, got {datagen_batch_size}" + ) # FIXME anything else to ensure determinism? np.random.seed(config.seed + rank) @@ -224,7 +237,7 @@ def main(config: Config) -> None: var_fail_count = 0 runaway_fail_count = 0 while categories_remaining > 0: - attempts += size + attempts += datagen_batch_size * size # Each rank attempts to generate datagen_batch_size categories ( @@ -245,12 +258,15 @@ def main(config: Config) -> None: # Process IFS params one at a time, writing each to a CSV if rank == 0: params_valid = [item for sublist in gathered_params for item in sublist] - if attempts % 10000 * size / datagen_batch_size == 0: + print( + f"cat_remaining = {categories_remaining} | total attempts = {attempts} | stats for rank 0: invalid_value_fail_count = {nan_fail_count}, var_fail_count = {var_fail_count}, runaway_fail_count = {runaway_fail_count}", + flush=True, + ) + if len(params_valid) > 0: print( - f"cat_remaining = {categories_remaining} | total attempts = {attempts} | stats for rank 0: nan_fail_count = {nan_fail_count}, var_fail_count = {var_fail_count}, runaway_fail_count = {runaway_fail_count}" + f"Processing {len(params_valid)} valid param sets from this batch", + flush=True, ) - if len(params_valid) > 0: - print(f"Processing {len(params_valid)} param sets from this attempt") for p in params_valid: # Ensure we don't save more categories than needed if categories_remaining > 0: @@ -284,14 +300,15 @@ def main(config: Config) -> None: global_runaway_fail_count = comm.reduce(runaway_fail_count, op=MPI.SUM, root=0) if rank == 0 and attempts > 0: + generated_categories = config.n_categories - existing_categories print( - f"Generated {config.n_categories - existing_categories} new categories in {attempts * datagen_batch_size} total attempts | {attempts * datagen_batch_size / (config.n_categories - existing_categories)} Attempts per category | Total categories is now {config.n_categories}" + f"Generated {generated_categories} new categories in {attempts} total attempts | {attempts / generated_categories} Attempts per category | Total categories is now {config.n_categories}" ) print( - f"Failures experienced: {global_nan_fail_count} nan attempts, {100 * global_nan_fail_count / (attempts * datagen_batch_size):.4f}% of all attempts, {global_var_fail_count} var fail attempts, {100 * global_var_fail_count / (attempts * datagen_batch_size):.4f}% of all attempts, {global_runaway_fail_count} runaway attempts, {100 * global_runaway_fail_count / (attempts * datagen_batch_size):.4f}% of all attempts" + f"Failures experienced: {global_nan_fail_count} invalid-value attempts, {100 * global_nan_fail_count / attempts:.4f}% of all attempts, {global_var_fail_count} var fail attempts, {100 * global_var_fail_count / attempts:.4f}% of all attempts, {global_runaway_fail_count} runaway attempts, {100 * global_runaway_fail_count / attempts:.4f}% of all attempts" ) print( - f"Rank 0 wall time = {rank_total_time:.2f} | Total CPU time = {global_sum_time:.2f} | Avg wall time per rank {global_sum_time / size:.2f} | {attempts * datagen_batch_size / rank_total_time:.2f} total attempts per wall second | {attempts * datagen_batch_size / rank_total_time / size:.2f} attempts per wall second per rank" + f"Rank 0 wall time = {rank_total_time:.2f} | Total CPU time = {global_sum_time:.2f} | Avg wall time per rank {global_sum_time / size:.2f} | {attempts / rank_total_time:.2f} total attempts per wall second | {attempts / rank_total_time / size:.2f} attempts per wall second per rank" ) return 0 From a407644af786ea530501faa56cde6980456a994d Mon Sep 17 00:00:00 2001 From: Patrick Miles Date: Fri, 5 Jun 2026 13:23:03 -0700 Subject: [PATCH 2/2] pass absolute fractal dir path to scaffold calls to ensure existing fractals found --- scripts/scaffold-matrix.job | 13 ++++++++++--- scripts/scaffold-tuolumne-torchpypi.job | 13 ++++++++++--- scripts/scaffold-tuolumne.job | 13 ++++++++++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/scripts/scaffold-matrix.job b/scripts/scaffold-matrix.job index d194aa8..f143fcd 100644 --- a/scripts/scaffold-matrix.job +++ b/scripts/scaffold-matrix.job @@ -12,10 +12,17 @@ export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH -scaffold generate_fractals -c ScaFFold/configs/benchmark_default.yml +CONFIG_PATH="$(pwd)/ScaFFold/configs/benchmark_default.yml" +FRACT_BASE_DIR="${FRACT_BASE_DIR:-$(pwd)/ScaFFold/fractals}" + +scaffold generate_fractals \ + -c "$CONFIG_PATH" \ + --fract-base-dir "$FRACT_BASE_DIR" # Uncomment if you want torch profiling #export PROFILE_TORCH=ON -torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml -#torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml +torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark \ + -c "$CONFIG_PATH" \ + --fract-base-dir "$FRACT_BASE_DIR" +#torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c "$CONFIG_PATH" --fract-base-dir "$FRACT_BASE_DIR" diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job index 0387e5e..c78af96 100644 --- a/scripts/scaffold-tuolumne-torchpypi.job +++ b/scripts/scaffold-tuolumne-torchpypi.job @@ -22,10 +22,17 @@ export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0 # Disable naive_conv_ab_nonpacked_wrw_ndhwc_half_double_half.kd export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0 -torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml +CONFIG_PATH="$(pwd)/ScaFFold/configs/benchmark_default.yml" +FRACT_BASE_DIR="${FRACT_BASE_DIR:-$(pwd)/ScaFFold/fractals}" + +torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals \ + -c "$CONFIG_PATH" \ + --fract-base-dir "$FRACT_BASE_DIR" # Uncomment if you want torch profiling #export PROFILE_TORCH=ON -torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml -# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml +torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark \ + -c "$CONFIG_PATH" \ + --fract-base-dir "$FRACT_BASE_DIR" +# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c "$CONFIG_PATH" --fract-base-dir "$FRACT_BASE_DIR" diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job index a22d8c6..1ae88b0 100644 --- a/scripts/scaffold-tuolumne.job +++ b/scripts/scaffold-tuolumne.job @@ -25,10 +25,17 @@ export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0 # Disable naive_conv_ab_nonpacked_wrw_ndhwc_half_double_half.kd export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0 -torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml +CONFIG_PATH="$(pwd)/ScaFFold/configs/benchmark_default.yml" +FRACT_BASE_DIR="${FRACT_BASE_DIR:-$(pwd)/ScaFFold/fractals}" + +torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals \ + -c "$CONFIG_PATH" \ + --fract-base-dir "$FRACT_BASE_DIR" # Uncomment if you want torch profiling #export PROFILE_TORCH=ON -torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml -# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml +torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark \ + -c "$CONFIG_PATH" \ + --fract-base-dir "$FRACT_BASE_DIR" +# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c "$CONFIG_PATH" --fract-base-dir "$FRACT_BASE_DIR"