Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 41 additions & 24 deletions ScaFFold/datagen/category_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def generate_single_category(config: Config) -> tuple[bool, np.array, bool, bool
A bool for whether a valid category was found on this attempt.
params : np.array
A numpy array containing IFS parameters for this category attempt, if attempt was valid.
(not nan_check_pass) : bool
A bool for whether this attempt passed the NaN check.
(not value_check_pass) : bool
A bool that is True when this attempt failed the NaN/non-finite check.
(not variance_check_pass) : bool
A bool that is True when this attempt passed the value check but failed the variance check.
(not runaway_check_pass) : bool
Expand Down Expand Up @@ -80,31 +80,40 @@ def generate_single_category(config: Config) -> tuple[bool, np.array, bool, bool
),
)

# Sum number of NaNs
# Sum number of NaNs and reject infinities before normalization.
nan_count = np.isnan(points).sum()
nan_check_pass = nan_count == 0
value_check_pass = nan_count == 0 and np.isfinite(points).all()
variance_check_pass = False

if nan_check_pass:
if value_check_pass:
# Normalize + center
mins = points.min(axis=0)
maxs = points.max(axis=0)
means = points.mean(axis=0)
scales = (2 * config.normalize) / (maxs - mins)
points = (points - means) * scales

# Calc dimension-wise variance and compare to threshold
points_variance = np.var(points, axis=1)
variance_check_pass = np.all(points_variance > config.variance_threshold)
if variance_check_pass and nan_check_pass and runaway_check_pass:
with np.errstate(over="ignore", invalid="ignore"):
ranges = maxs - mins
value_check_pass = np.all(np.isfinite(ranges)) and np.all(ranges > 0)
if value_check_pass:
scales = (2 * config.normalize) / ranges
with np.errstate(over="ignore", invalid="ignore"):
points = (points - means) * scales

value_check_pass = np.isfinite(points).all()
if value_check_pass:
# Calc dimension-wise variance and compare to threshold
points_variance = np.var(points, axis=0)
variance_check_pass = np.all(
points_variance > config.variance_threshold
)
if variance_check_pass and value_check_pass and runaway_check_pass:
valid = True

# Return result
return (
valid,
params,
not nan_check_pass,
not variance_check_pass,
bool(not value_check_pass),
bool(value_check_pass and not variance_check_pass),
not runaway_check_pass,
)

Expand All @@ -129,7 +138,7 @@ def generate_categories_batch(
params : np.array
A numpy array containing IFS parameters for this category attempt, if attempt was valid.
failed_nan_check_count : int
The number of attempts in this batch which failed the nan check.
The number of attempts in this batch which failed the NaN/non-finite check.
failed_var_check_count : int
The number of attempts in this batch which failed the var check.
runaway_failure_count : int
Expand Down Expand Up @@ -186,7 +195,11 @@ def main(config: Config) -> None:
rank = comm.Get_rank()
size = comm.Get_size()

datagen_batch_size = 10000
datagen_batch_size = int(getattr(config, "datagen_batch_size", 10000))
if datagen_batch_size < 1:
raise ValueError(
f"datagen_batch_size must be positive, got {datagen_batch_size}"
)

# FIXME anything else to ensure determinism?
np.random.seed(config.seed + rank)
Expand Down Expand Up @@ -224,7 +237,7 @@ def main(config: Config) -> None:
var_fail_count = 0
runaway_fail_count = 0
while categories_remaining > 0:
attempts += size
attempts += datagen_batch_size * size

# Each rank attempts to generate datagen_batch_size categories
(
Expand All @@ -245,12 +258,15 @@ def main(config: Config) -> None:
# Process IFS params one at a time, writing each to a CSV
if rank == 0:
params_valid = [item for sublist in gathered_params for item in sublist]
if attempts % 10000 * size / datagen_batch_size == 0:
print(
f"cat_remaining = {categories_remaining} | total attempts = {attempts} | stats for rank 0: invalid_value_fail_count = {nan_fail_count}, var_fail_count = {var_fail_count}, runaway_fail_count = {runaway_fail_count}",
flush=True,
)
if len(params_valid) > 0:
print(
f"cat_remaining = {categories_remaining} | total attempts = {attempts} | stats for rank 0: nan_fail_count = {nan_fail_count}, var_fail_count = {var_fail_count}, runaway_fail_count = {runaway_fail_count}"
f"Processing {len(params_valid)} valid param sets from this batch",
flush=True,
)
if len(params_valid) > 0:
print(f"Processing {len(params_valid)} param sets from this attempt")
for p in params_valid:
# Ensure we don't save more categories than needed
if categories_remaining > 0:
Expand Down Expand Up @@ -284,14 +300,15 @@ def main(config: Config) -> None:
global_runaway_fail_count = comm.reduce(runaway_fail_count, op=MPI.SUM, root=0)

if rank == 0 and attempts > 0:
generated_categories = config.n_categories - existing_categories
print(
f"Generated {config.n_categories - existing_categories} new categories in {attempts * datagen_batch_size} total attempts | {attempts * datagen_batch_size / (config.n_categories - existing_categories)} Attempts per category | Total categories is now {config.n_categories}"
f"Generated {generated_categories} new categories in {attempts} total attempts | {attempts / generated_categories} Attempts per category | Total categories is now {config.n_categories}"
)
print(
f"Failures experienced: {global_nan_fail_count} nan attempts, {100 * global_nan_fail_count / (attempts * datagen_batch_size):.4f}% of all attempts, {global_var_fail_count} var fail attempts, {100 * global_var_fail_count / (attempts * datagen_batch_size):.4f}% of all attempts, {global_runaway_fail_count} runaway attempts, {100 * global_runaway_fail_count / (attempts * datagen_batch_size):.4f}% of all attempts"
f"Failures experienced: {global_nan_fail_count} invalid-value attempts, {100 * global_nan_fail_count / attempts:.4f}% of all attempts, {global_var_fail_count} var fail attempts, {100 * global_var_fail_count / attempts:.4f}% of all attempts, {global_runaway_fail_count} runaway attempts, {100 * global_runaway_fail_count / attempts:.4f}% of all attempts"
)
print(
f"Rank 0 wall time = {rank_total_time:.2f} | Total CPU time = {global_sum_time:.2f} | Avg wall time per rank {global_sum_time / size:.2f} | {attempts * datagen_batch_size / rank_total_time:.2f} total attempts per wall second | {attempts * datagen_batch_size / rank_total_time / size:.2f} attempts per wall second per rank"
f"Rank 0 wall time = {rank_total_time:.2f} | Total CPU time = {global_sum_time:.2f} | Avg wall time per rank {global_sum_time / size:.2f} | {attempts / rank_total_time:.2f} total attempts per wall second | {attempts / rank_total_time / size:.2f} attempts per wall second per rank"
)

return 0
13 changes: 10 additions & 3 deletions scripts/scaffold-matrix.job
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,17 @@

export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH

scaffold generate_fractals -c ScaFFold/configs/benchmark_default.yml
CONFIG_PATH="$(pwd)/ScaFFold/configs/benchmark_default.yml"
FRACT_BASE_DIR="${FRACT_BASE_DIR:-$(pwd)/ScaFFold/fractals}"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we are always overriding whatever fract_base_dir is in the config?


scaffold generate_fractals \
-c "$CONFIG_PATH" \
--fract-base-dir "$FRACT_BASE_DIR"

# Uncomment if you want torch profiling
#export PROFILE_TORCH=ON

torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
#torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark \
-c "$CONFIG_PATH" \
--fract-base-dir "$FRACT_BASE_DIR"
#torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c "$CONFIG_PATH" --fract-base-dir "$FRACT_BASE_DIR"
13 changes: 10 additions & 3 deletions scripts/scaffold-tuolumne-torchpypi.job
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,17 @@ export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0
# Disable naive_conv_ab_nonpacked_wrw_ndhwc_half_double_half.kd
export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0

torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml
CONFIG_PATH="$(pwd)/ScaFFold/configs/benchmark_default.yml"
FRACT_BASE_DIR="${FRACT_BASE_DIR:-$(pwd)/ScaFFold/fractals}"

torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals \
-c "$CONFIG_PATH" \
--fract-base-dir "$FRACT_BASE_DIR"

# Uncomment if you want torch profiling
#export PROFILE_TORCH=ON

torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark \
-c "$CONFIG_PATH" \
--fract-base-dir "$FRACT_BASE_DIR"
# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c "$CONFIG_PATH" --fract-base-dir "$FRACT_BASE_DIR"
13 changes: 10 additions & 3 deletions scripts/scaffold-tuolumne.job
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,17 @@ export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0
# Disable naive_conv_ab_nonpacked_wrw_ndhwc_half_double_half.kd
export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0

torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml
CONFIG_PATH="$(pwd)/ScaFFold/configs/benchmark_default.yml"
FRACT_BASE_DIR="${FRACT_BASE_DIR:-$(pwd)/ScaFFold/fractals}"

torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals \
-c "$CONFIG_PATH" \
--fract-base-dir "$FRACT_BASE_DIR"

# Uncomment if you want torch profiling
#export PROFILE_TORCH=ON

torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c $(pwd)/ScaFFold/configs/benchmark_default.yml
torchrun-hpc -N 1 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark \
-c "$CONFIG_PATH" \
--fract-base-dir "$FRACT_BASE_DIR"
# torchrun-hpc -N 2 -n 4 --gpus-per-proc 1 $(which scaffold) benchmark -c "$CONFIG_PATH" --fract-base-dir "$FRACT_BASE_DIR"
Loading