diff --git a/Unix/Copy_directory_tree_in_parallel.md b/Unix/Copy_directory_tree_in_parallel.md
new file mode 100644
index 0000000..b553394
--- /dev/null
+++ b/Unix/Copy_directory_tree_in_parallel.md
@@ -0,0 +1,91 @@
+# Intro
+[This](./copy_directory_tree_with_pattern.sh)
+example script copies several folders
+from one root directory into another (e.g. `~/RESULTS/RUN_1` into `~/RESULTS/RUN_2`)
+on the condition that a specific folder exists.
+
+When you work with a huge directory tree, each branch of which follows the same structure,
+it becomes prohibitively difficult to use the regular ways of copying files:
+- The `cp` command is not robust to interruptions, because it does not check whether the files already exist.
+  → `rsync` is a better alternative.
+- With some tricks, `rsync` can preserve the directory structure while being called from an arbitrary root, which avoids the need to `cd` on every cycle.
+- Expanding the output of `find` in the shell fails on particularly large file trees.
+  → Piping it into `parallel` not only speeds up the file syncing by up to a factor of the number of parallel jobs, it also prevents hitting shell expansion limits,
+  because the `find` results are read one by one, not all at once.
+
+
+
+# Setup
+> **Adjust the corresponding lines in the script before running it!** Each of them is marked with `### ----->>> CHANGE THIS!!! ###`.
+> - the list of folders to copy (inside `sync_one`): `preprocessing_output`, `preprocessing_output_evaluation`, `HSQC_peak_features`, `combined_graph_weights`, `NHmap`
+> - the folder that needs to be present (the `-path` argument of `find`): `NHmap`
+> - the path pattern, according to the directory structure (same `-path` argument): `$SOURCE_DIR/????/????_*/*_t*/????/NHmap`
+
+## The script is currently set up for the following directory structure:
+```
+SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t0
+└── 1VDY
+    ├── HSQC_peak_features
+    ├── NHmap
+    ├── NHmap_eval
+    ├── combined_graph_weights
+    ├── graph
+    ├── graph_weights_from_backbone
+    ├── graph_weights_from_noesy
+    ├── preprocessing_output
+    └── preprocessing_output_evaluation
+SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t1
+└── 1VDY
+    ├── HSQC_peak_features
+    ├── NHmap
+    ├── NHmap_eval
+    ├── combined_graph_weights
+    ├── graph
+    ├── graph_weights_from_backbone
+    ├── graph_weights_from_noesy
+    ├── preprocessing_output
+    └── preprocessing_output_evaluation
+SOURCE_DIR/1VDY/1VDY_NOESY/1VDY_t2
+└── 1VDY
+    ├── HSQC_peak_features
+    ├── graph
+    ├── graph_weights_from_backbone
+    ├── graph_weights_from_noesy
+    ├── preprocessing_output
+    └── preprocessing_output_evaluation
+SOURCE_DIR/1VDY/1VDY_BB/1VDY_t0
+└── 1VDY
+    ├── HSQC_peak_features
+    ├── NHmap
+    ├── NHmap_eval
+    ├── combined_graph_weights
+    ├── graph
+    ├── graph_weights_from_backbone
+    ├── graph_weights_from_noesy
+    ├── preprocessing_output
+    └── preprocessing_output_evaluation
+SOURCE_DIR/1VDY/1VDY_BB/1VDY_t1
+└── 1VDY
+    ├── HSQC_peak_features
+    ├── NHmap
+    ├── NHmap_eval
+    ├── combined_graph_weights
+    ├── graph
+    ├── graph_weights_from_backbone
+    ├── graph_weights_from_noesy
+    ├── preprocessing_output
+    └── preprocessing_output_evaluation
+SOURCE_DIR/1VDY/1VDY_BB/1VDY_t2
+└── 1VDY
+    ├── HSQC_peak_features
+    ├── combined_graph_weights
+    ├── graph
+    ├── graph_weights_from_backbone
+    ├── graph_weights_from_noesy
+    ├── preprocessing_output
+    └── preprocessing_output_evaluation
+```
#!/usr/bin/env bash
#
# copy_directory_tree_with_pattern.sh
#
# Example script: copies several folders from one root directory into another
# (e.g. ~/RESULTS/RUN_1 into ~/RESULTS/RUN_2) on the condition that a specific
# folder (here: NHmap) exists. Paths relative to the source root are recreated
# below the destination root.
#
# Usage:
#   [JOBS=N] copy_directory_tree_with_pattern.sh <source_dir> <dest_dir>
#
# Environment:
#   JOBS  number of sync_one jobs GNU parallel runs at once (default: 8)
#
# Requirements:
#   GNU parallel  (sudo apt install parallel / brew install parallel)
#   rsync
#
# See the explanation in ./Copy_directory_tree_in_parallel.md.
# Adjust the names and paths marked "CHANGE THIS" below!

set -euo pipefail

SOURCE_DIR=${1:-}
DEST_DIR=${2:-}
JOBS=${JOBS:-8} # set via env: JOBS=8 ./script.sh src dst

if [[ -z "${SOURCE_DIR}" || -z "${DEST_DIR}" ]]; then
  {
    echo "Usage: $0 <source_dir> <dest_dir>"
    echo "Hint: set parallelism with JOBS env var (default: 8), e.g. JOBS=8 $0 src dst"
  } >&2
  exit 1
fi

# Normalize: drop one trailing slash, so that "$SOURCE_DIR"/ can be stripped
# as a prefix from the paths printed by find.
SOURCE_DIR="${SOURCE_DIR%/}"
DEST_DIR="${DEST_DIR%/}"

if [[ ! -d "$SOURCE_DIR" ]]; then
  echo "Error: source directory '$SOURCE_DIR' does not exist or is not a directory." >&2
  exit 1
fi
mkdir -p -- "$DEST_DIR"

# Ensure the external tools are available.
if ! command -v parallel >/dev/null 2>&1; then
  echo "Error: GNU parallel is required (apt install parallel / brew install parallel)." >&2
  exit 1
fi
if ! command -v rsync >/dev/null 2>&1; then
  echo "Error: rsync is required." >&2
  exit 1
fi

#######################################
# Copy the selected sub-folders of one trial directory, plus any *.yml files
# of the directory above it, into DEST_DIR, keeping their paths relative to
# SOURCE_DIR. Called by GNU parallel once per NHmap directory that was found.
#
# The function runs in a new shell started by GNU parallel, where this
# script's `set -euo pipefail` is not in effect, so every failure that
# matters is propagated explicitly with `|| return`.
#
# Globals:
#   SOURCE_DIR (read), DEST_DIR (read); both must be exported.
# Arguments:
#   $1 - path of an NHmap directory located below SOURCE_DIR
# Outputs:
#   Progress line on stdout; warnings about missing sub-folders on stderr.
# Returns:
#   0 on success, otherwise the exit status of the failing command, which
#   lets `parallel --halt now,fail=1` stop the whole run.
#######################################
sync_one() {
  local nhmap_dir=$1

  # trial_exec_dir is NHmap's parent; parent_dir is one level further up.
  local trial_exec_dir parent_dir
  trial_exec_dir=$(dirname -- "$nhmap_dir") || return
  parent_dir=$(dirname -- "$trial_exec_dir") || return

  # Paths relative to SOURCE_DIR; these are what gets recreated in DEST_DIR.
  local rel=${trial_exec_dir#"$SOURCE_DIR"/}
  local rel_parent=${parent_dir#"$SOURCE_DIR"/}

  echo "Copying files for $(basename -- "$parent_dir")"

  ### ----->>> CHANGE THIS!!! ###
  # Sub-folders of the trial directory that should be copied.
  local -a subdirs=(
    preprocessing_output
    preprocessing_output_evaluation
    HSQC_peak_features
    combined_graph_weights
    NHmap
  )

  # Keep only the sub-folders that exist, so that one missing optional
  # folder does not turn the whole batch into an rsync error.
  local -a sources=()
  local name
  for name in "${subdirs[@]}"; do
    if [[ -e "$SOURCE_DIR/$rel/$name" || -L "$SOURCE_DIR/$rel/$name" ]]; then
      # With -R, only the part after the "/./" anchor is recreated in DEST_DIR.
      sources+=("$SOURCE_DIR/./$rel/$name")
    else
      echo "Warning: '$SOURCE_DIR/$rel/$name' does not exist, skipping." >&2
    fi
  done

  # Batch all sub-folders into a single rsync call.
  # -t preserves modification times; without it rsync cannot recognise
  # already-copied files on a re-run and transfers everything again.
  if ((${#sources[@]})); then
    rsync -rltR -- "${sources[@]}" "$DEST_DIR"/ || return
  fi

  # Copy *.yml from the parent dir if present. An unmatched glob stays a
  # literal "*.yml" path, which the existence test filters out.
  local -a ymls=()
  local yml
  for yml in "$SOURCE_DIR"/./"$rel_parent"/*.yml; do
    if [[ -e "$yml" || -L "$yml" ]]; then
      ymls+=("$yml")
    fi
  done
  if ((${#ymls[@]})); then
    rsync -rltR -- "${ymls[@]}" "$DEST_DIR"/ || return
  fi

  return 0
}

# Make the function and the two roots visible to the shells parallel starts.
export -f sync_one
export SOURCE_DIR DEST_DIR

# Find every NHmap directory that matches the pattern and run sync_one on
# each of them in parallel.
# -print0/-0 ensures paths with spaces/newlines are handled correctly.
# --halt now,fail=1 stops all jobs as soon as one sync_one fails.
# Note that in -path patterns, '*' and '?' also match '/'.

### ----->>> CHANGE THIS!!! ###
find "$SOURCE_DIR" \
  -path "$SOURCE_DIR/????/????_*/*_t*/????/NHmap" \
  -type d -print0 \
  | parallel -0 -j"${JOBS}" --halt now,fail=1 --will-cite sync_one {}

echo "All done."