Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ cuda = [
"mpi4py==4.1.1",
]
rocm = [
"torch==2.12.0+rocm7.2",
"torch==2.12.0+rocm7.1",
"torchaudio==2.11.0+rocm7.1",
"torchvision==0.27.0+rocm7.1",
Comment thread
PatrickRMiles marked this conversation as resolved.
"mpi4py==4.1.1",
]
rocmwci = [
Expand Down
8 changes: 4 additions & 4 deletions scripts/install-tuolumne-torchpypi.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ml load python/3.11.5 && python3 -m venv .venvs/scaffoldvenv-tuo-pypi && source .venvs/scaffoldvenv-tuo-pypi/bin/activate && pip install --upgrade pip
ml cce/21.0.1 cray-mpich/9.1.0 rocm/7.2.1 rccl/fast-env-slows-mpi
pip install -e .[rocm] --find-links https://download.pytorch.org/whl/torch/ --find-links https://download.pytorch.org/whl/triton-rocm/ 2>&1 | tee install.log
ml load python/3.13.2 && python3 -m venv .venvs/scaffoldvenv-tuo-pypi && source .venvs/scaffoldvenv-tuo-pypi/bin/activate && pip install --upgrade pip
ml cce/21.0.1 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi
pip install -e .[rocm] --find-links https://download.pytorch.org/whl/torch/ --find-links https://download.pytorch.org/whl/torchaudio/ --find-links https://download.pytorch.org/whl/torchvision/ --find-links https://download.pytorch.org/whl/triton-rocm/ 2>&1 | tee install.log
# libmpi.so.12 does not exist here (check with: ls /opt/cray/pe/lib64/ | grep libmpi), so repoint mpi4py at libmpi_gnu.so.12
patchelf --replace-needed libmpi.so.12 libmpi_gnu.so.12 .venvs/scaffoldvenv-tuo-pypi/lib/python3.11/site-packages/mpi4py/MPI.mpich.cpython-311-x86_64-linux-gnu.so
patchelf --replace-needed libmpi.so.12 libmpi_gnu.so.12 .venvs/scaffoldvenv-tuo-pypi/lib/python3.13/site-packages/mpi4py/MPI.mpich.cpython-313-x86_64-linux-gnu.so
12 changes: 6 additions & 6 deletions scripts/scaffold-tuolumne-torchpypi.job
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,20 @@
# flux: -qpdebug
# flux: -B flask

ml cce/21.0.1 cray-mpich/9.1.0 rocm/7.2.1 rccl/fast-env-slows-mpi
ml cce/21.0.1 cray-mpich/9.1.0 rocm/7.1.1 rccl/fast-env-slows-mpi

. .venvs/scaffoldvenv-tuo-pypi/bin/activate

export NCCL_NET_PLUGIN=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-7.2.0/install/lib/librccl-net.so
export NCCL_NET_PLUGIN=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-7.1.1/install/lib/librccl-net.so

# Disable direct convolution benchmarking (should speed up warmup significantly; has the combined effect of the three options below)
# export MIOPEN_DEBUG_CONV_DIRECT=0
export MIOPEN_DEBUG_CONV_DIRECT=0
# Disable direct naive convolution benchmarking (naive_conv_ab_nonpacked_fwd_ndhwc_half_double_half.kd)
export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD=0
# export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD=0
# Disable naive_conv_ab_nonpacked_bwd_ndhwc_half_double_half.kd
export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0
# export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD=0
# Disable naive_conv_ab_nonpacked_wrw_ndhwc_half_double_half.kd
export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0
# export MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW=0

CONFIG_PATH="$(pwd)/ScaFFold/configs/benchmark_default.yml"
FRACT_BASE_DIR="${FRACT_BASE_DIR:-$(pwd)/ScaFFold/fractals}"
Expand Down
Loading