From 41382bbb6b995ccc4b63f7bba23ef378f7907d7f Mon Sep 17 00:00:00 2001 From: katja Date: Sun, 7 Sep 2025 21:08:40 +0200 Subject: [PATCH 1/3] Added the parallelized rsync script --- Unix/copy_directory_tree_with_pattern.sh | 163 +++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 Unix/copy_directory_tree_with_pattern.sh diff --git a/Unix/copy_directory_tree_with_pattern.sh b/Unix/copy_directory_tree_with_pattern.sh new file mode 100644 index 0000000..614e467 --- /dev/null +++ b/Unix/copy_directory_tree_with_pattern.sh @@ -0,0 +1,163 @@ +#!/usr/bin/env bash + +""" +THIS EXAMPLE SCRIPT COPIES THE FOLLOWING FOLDERS: +preprocessing_output, preprocessing_output_evaluation, combined_graph_weights, NHmap +FROM ONE ROOT DIRECTORY INTO ANOTHER (i.e. ~/RESULTS/RUN_1 into ~/RESULTS/RUN_2) + + + +SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t0 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t1 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t2 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_BB/1VDY_t0 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_BB/1VDY_t1 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + 
├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_BB/1VDY_t2 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation + +===================== +In cases when you work with a huge directory tree, each branch of which follows the same structure: +...it becomes prohibitively difficult to use the regular ways of copying files. +- 'cp' command would not be robust to interruptions, because it does not check if the files already exist. +---> 'rsync' is a better alternative. +- With some tricks, 'rsync' can preserve the directory structure while called from an arbitary root - avoids the need to 'cd' every cycle! +- 'find' command fails against particularly large file trees. +--> piping it into 'parallel' not only alleviates the burden, it also speeds up the file syncing by the factor of the number of threads. + +========= + +Requires GNU parallel +$ sudo apt install parallel + +========= + + + +""" + + +#!/usr/bin/env bash +set -euo pipefail + +SOURCE_DIR=${1:-} +DEST_DIR=${2:-} +JOBS=${JOBS:-8} # number of parallel rsync jobs; override via env, e.g. JOBS=16 ./script.sh src dst + +if [[ -z "${SOURCE_DIR}" || -z "${DEST_DIR}" ]]; then + echo "Usage: $0 SOURCE_DIR DEST_DIR" + echo "Hint: set parallelism with JOBS env var (default: 8), e.g. JOBS=8 $0 src dst" + exit 1 +fi + +# Normalize: drop trailing slashes +SOURCE_DIR="${SOURCE_DIR%/}" +DEST_DIR="${DEST_DIR%/}" +mkdir -p "$DEST_DIR" + +# Ensure GNU parallel is available +if ! command -v parallel >/dev/null 2>&1; then + echo "Error: GNU parallel is required (apt install parallel / brew install parallel)."
+ exit 1 +fi + +# DEFINING THE JOB, which would be called every iteration: +# One rsync job per trial_exec_dir +sync_one() { + local nhmap_dir="$1" + + # trial_exec_dir is NHmap's parent + local trial_exec_dir + trial_exec_dir="$(dirname "$nhmap_dir")" + + local rel="${trial_exec_dir#"$SOURCE_DIR"/}" + local parent_dir + parent_dir="$(dirname "$trial_exec_dir")" + local rel_parent="${parent_dir#"$SOURCE_DIR"/}" + + echo "Copying files for $(basename "$parent_dir")" + + # Batch most subpaths into a single rsync; preserves relative paths via ./ anchor + rsync -rlR \ + "$SOURCE_DIR"/./"$rel"/{preprocessing_output,preprocessing_output_evaluation,HSQC_peak_features,combined_graph_weights,NHmap} \ + "$DEST_DIR"/ + + # Copy *.yml from the parent dir if present. Use nullglob to avoid literal *.yml. + shopt -s nullglob + local ymls=( "$SOURCE_DIR"/./"$rel_parent"/*.yml ) + if ((${#ymls[@]})); then + rsync -rlR "${ymls[@]}" "$DEST_DIR"/ + fi + shopt -u nullglob +} + +export -f sync_one +export SOURCE_DIR DEST_DIR + +# Find every NHmap that matches your original pattern and run sync_one in parallel. +# -print0/-0 ensures paths with spaces/newlines are handled correctly. +find "$SOURCE_DIR" \ + -path "$SOURCE_DIR/????/????_NOESY/*_t*/????/NHmap" \ + -type d -print0 \ +| parallel -0 -j"${JOBS}" --halt now,fail=1 --will-cite sync_one {} + +echo "All done." 
+ From d281cac552e2fb5324a845435cf44d32a0248051 Mon Sep 17 00:00:00 2001 From: katja Date: Thu, 11 Sep 2025 09:56:44 +0200 Subject: [PATCH 2/3] Added the parallelized rsync script --- Unix/copy_directory_tree_with_pattern.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Unix/copy_directory_tree_with_pattern.sh b/Unix/copy_directory_tree_with_pattern.sh index 614e467..6d39e10 100644 --- a/Unix/copy_directory_tree_with_pattern.sh +++ b/Unix/copy_directory_tree_with_pattern.sh @@ -81,7 +81,8 @@ In cases when you work with a huge directory tree, each branch of which follows ---> 'rsync' is a better alternative. - With some tricks, 'rsync' can preserve the directory structure while called from an arbitary root - avoids the need to 'cd' every cycle! - 'find' command fails against particularly large file trees. ---> piping it into 'parallel' not only alleviates the burden, it also speeds up the file syncing by the factor of the number of threads. +--> piping it into 'parallel' not only speeds up the file syncing by the factor of the number of threads, it also prevent hitting shell expansion limits, +because the 'find' outputs are read one-by-one, not all at once. ========= @@ -91,7 +92,6 @@ $ sudo apt install parallel ========= - """ @@ -155,7 +155,7 @@ export SOURCE_DIR DEST_DIR # Find every NHmap that matches your original pattern and run sync_one in parallel. # -print0/-0 ensures paths with spaces/newlines are handled correctly. 
find "$SOURCE_DIR" \ - -path "$SOURCE_DIR/????/????_NOESY/*_t*/????/NHmap" \ + -path "$SOURCE_DIR/????/????_*/*_t*/????/NHmap" \ -type d -print0 \ | parallel -0 -j"${JOBS}" --halt now,fail=1 --will-cite sync_one {} From 92589e3682f6005ae315a24cee0d9f168d82b928 Mon Sep 17 00:00:00 2001 From: katja Date: Thu, 11 Sep 2025 11:32:04 +0200 Subject: [PATCH 3/3] Added proper description of the script --- Unix/Copy_directory_tree_in_parallel.md | 91 ++++++++++++++++++++ Unix/copy_directory_tree_with_pattern.sh | 101 ++--------------------- 2 files changed, 99 insertions(+), 93 deletions(-) create mode 100644 Unix/Copy_directory_tree_in_parallel.md diff --git a/Unix/Copy_directory_tree_in_parallel.md b/Unix/Copy_directory_tree_in_parallel.md new file mode 100644 index 0000000..b553394 --- /dev/null +++ b/Unix/Copy_directory_tree_in_parallel.md @@ -0,0 +1,91 @@ +# Intro +[This](./copy_directory_tree_with_pattern.sh) +example script copies several folders +from one root directory into another (e.g. `~/RESULTS/RUN_1` into `~/RESULTS/RUN_2`) +on the condition that a specific folder exists. + +When you work with a huge directory tree in which every branch follows the same structure, +it becomes prohibitively difficult to use the regular ways of copying files. +- The `cp` command is not robust to interruptions, because it does not check whether the files already exist. +---> `rsync` is a better alternative. +- With some tricks, `rsync` can preserve the directory structure while called from an arbitrary root - this avoids the need to `cd` every cycle! +- Expanding the output of `find` on the command line fails for particularly large file trees. +--> piping it into `parallel` not only speeds up the file syncing by up to a factor of the number of parallel jobs, it also prevents hitting shell expansion limits, +because the `find` outputs are read one-by-one, not all at once.
+ + + +# Setup +> **Adjust all the corresponding lines in the script accordingly!** +> - the list of folders (line 53): preprocessing_output, preprocessing_output_evaluation, HSQC_peak_features, combined_graph_weights, NHmap +> - the folder that needs to be present (line 73): NHmap +> - the path, according to the directory structure (same line 73): `$SOURCE_DIR/????/????_*/*_t*/????/NHmap` + +## The script is currently set up for the following directory structure: +``` +SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t0 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t1 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t2 +└── 1VDY + ├── HSQC_peak_features + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_BB/1VDY_t0 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_BB/1VDY_t1 +└── 1VDY + ├── HSQC_peak_features + ├── NHmap + ├── NHmap_eval + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +SOURCE_DIR/1VDY/1VDY_BB/1VDY_t2 +└── 1VDY + ├── HSQC_peak_features + ├── combined_graph_weights + ├── graph + ├── graph_weights_from_backbone + ├── graph_weights_from_noesy + ├── preprocessing_output + └── preprocessing_output_evaluation +```
+**Requirements** + +Requires GNU parallel +`sudo apt install parallel` diff --git a/Unix/copy_directory_tree_with_pattern.sh b/Unix/copy_directory_tree_with_pattern.sh index 6d39e10..7c749a4 100644 --- a/Unix/copy_directory_tree_with_pattern.sh +++ b/Unix/copy_directory_tree_with_pattern.sh @@ -1,98 +1,10 @@ #!/usr/bin/env bash -""" -THIS EXAMPLE SCRIPT COPIES THE FOLLOWING FOLDERS: -preprocessing_output, preprocessing_output_evaluation, combined_graph_weights, NHmap -FROM ONE ROOT DIRECTORY INTO ANOTHER (i.e. ~/RESULTS/RUN_1 into ~/RESULTS/RUN_2) - - - -SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t0 -└── 1VDY - ├── HSQC_peak_features - ├── NHmap - ├── NHmap_eval - ├── combined_graph_weights - ├── graph - ├── graph_weights_from_backbone - ├── graph_weights_from_noesy - ├── preprocessing_output - └── preprocessing_output_evaluation -SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t1 -└── 1VDY - ├── HSQC_peak_features - ├── NHmap - ├── NHmap_eval - ├── combined_graph_weights - ├── graph - ├── graph_weights_from_backbone - ├── graph_weights_from_noesy - ├── preprocessing_output - └── preprocessing_output_evaluation -SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t2 -└── 1VDY - ├── HSQC_peak_features - ├── NHmap - ├── NHmap_eval - ├── combined_graph_weights - ├── graph - ├── graph_weights_from_backbone - ├── graph_weights_from_noesy - ├── preprocessing_output - └── preprocessing_output_evaluation -SOURCE_DIR/1VDY/1VDY_BB/1VDY_t0 -└── 1VDY - ├── HSQC_peak_features - ├── NHmap - ├── NHmap_eval - ├── combined_graph_weights - ├── graph - ├── graph_weights_from_backbone - ├── graph_weights_from_noesy - ├── preprocessing_output - └── preprocessing_output_evaluation -SOURCE_DIR/1VDY/1VDY_BB/1VDY_t1 -└── 1VDY - ├── HSQC_peak_features - ├── NHmap - ├── NHmap_eval - ├── combined_graph_weights - ├── graph - ├── graph_weights_from_backbone - ├── graph_weights_from_noesy - ├── preprocessing_output - └── preprocessing_output_evaluation -SOURCE_DIR/1VDY/1VDY_BB/1VDY_t2 -└── 1VDY - ├── HSQC_peak_features - ├── NHmap - ├── 
NHmap_eval - ├── combined_graph_weights - ├── graph - ├── graph_weights_from_backbone - ├── graph_weights_from_noesy - ├── preprocessing_output - └── preprocessing_output_evaluation - -===================== -In cases when you work with a huge directory tree, each branch of which follows the same structure: -...it becomes prohibitively difficult to use the regular ways of copying files. -- 'cp' command would not be robust to interruptions, because it does not check if the files already exist. ----> 'rsync' is a better alternative. -- With some tricks, 'rsync' can preserve the directory structure while called from an arbitary root - avoids the need to 'cd' every cycle! -- 'find' command fails against particularly large file trees. ---> piping it into 'parallel' not only speeds up the file syncing by the factor of the number of threads, it also prevent hitting shell expansion limits, -because the 'find' outputs are read one-by-one, not all at once. - -========= - -Requires GNU parallel -$ sudo apt install parallel - -========= - - -""" +### THIS EXAMPLE SCRIPT COPIES SEVERAL FOLDERS +### FROM ONE ROOT DIRECTORY INTO ANOTHER (i.e. ~/RESULTS/RUN_1 into ~/RESULTS/RUN_2) +### ON THE CONDITION THAT A SPECIFIC FOLDER EXISTS. +## See explanation in the ./Copy_directory_tree_in_parallel.md +# Adjust names and paths inside! #!/usr/bin/env bash @@ -136,6 +48,7 @@ sync_one() { echo "Copying files for $(basename "$parent_dir")" # Batch most subpaths into a single rsync; preserves relative paths via ./ anchor + ### ----->>> CHANGE THIS!!! ### rsync -rlR \ "$SOURCE_DIR"/./"$rel"/{preprocessing_output,preprocessing_output_evaluation,HSQC_peak_features,combined_graph_weights,NHmap} \ "$DEST_DIR"/ @@ -154,6 +67,8 @@ export SOURCE_DIR DEST_DIR # Find every NHmap that matches your original pattern and run sync_one in parallel. # -print0/-0 ensures paths with spaces/newlines are handled correctly. + +### ----->>> CHANGE THIS!!! 
### find "$SOURCE_DIR" \ -path "$SOURCE_DIR/????/????_*/*_t*/????/NHmap" \ -type d -print0 \