From 326ecfb3ee1b27f134fdec4d99da05dd32281ab7 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 24 Jan 2024 21:20:28 -0500
Subject: [PATCH 1/3] Create cluster.config

---
 cluster.config | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 cluster.config

diff --git a/cluster.config b/cluster.config
new file mode 100644
index 0000000..6ec18fc
--- /dev/null
+++ b/cluster.config
@@ -0,0 +1,23 @@
+// Configuration settings for NYU Greene HPC Cluster
+
+params {
+  max_memory = 3000.GB
+  max_cpus = 96
+  max_time = 7.d
+}
+
+singularity.enabled = true
+
+process {
+  executor = 'slurm'
+  clusterOptions = '--export=NONE'
+  scratch = true
+  maxRetries = 3
+  errorStrategy = { task.attempt <=3 ? 'retry' : 'finish' }
+  cache = 'lenient'
+}
+
+executor {
+  queueSize = 1900
+  submitRateLimit = '20 sec'
+}
From 8d3a377be3e4c21e69138a16ca4d1ffb3c83af95 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 24 Jan 2024 21:23:24 -0500
Subject: [PATCH 2/3] Add cluster.config

---
 launch.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/launch.sh b/launch.sh
index 0d171ea..9f39f18 100644
--- a/launch.sh
+++ b/launch.sh
@@ -19,6 +19,7 @@ log_dir="/scratch/gencore/GENEFLOW/alpha/logs/${fcid}/pipeline"
 
 nextflow_command="nextflow \
     -log ${log_dir}/nextflow.log run /home/gencore/SCRIPTS/GENEFLOW/main.nf \
+    -c /home/gencore/SCRIPTS/GENEFLOW/cluster.config \
     -c /home/gencore/SCRIPTS/GENEFLOW/nextflow.config \
     --run_dir_path $run_dir_path \
     --trace_file_path ${log_dir}/trace.txt \
From 296d4726aa6884ea554eccece0b9301a18f19f03 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 3 Jun 2026 12:36:33 -0400
Subject: [PATCH 3/3] Update cluster configuration for resource limits and executor

---
Note: clusterOptions was hand-edited after export so that --account is
omitted when SLURM_JOB_ACCOUNT is unset (it previously rendered as
"--account=null"). The diffstat and hunk header were adjusted to match;
the post-image blob id on the index line is stale.

 cluster.config | 66 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/cluster.config b/cluster.config
index 6ec18fc..49ee58c 100644
--- a/cluster.config
+++ b/cluster.config
@@ -1,23 +1,55 @@
-// Configuration settings for NYU Greene HPC Cluster
-
 params {
-  max_memory = 3000.GB
-  max_cpus = 96
-  max_time = 7.d
+    // Resource limits
+    max_memory = 3000.GB
+    max_cpus = 128
+    max_time = 7.d
 }
-
-singularity.enabled = true
-
+
+apptainer {
+    enabled = true
+    runOptions = "-B ${System.getenv('APPTAINER_BINDPATH') ?: '/scratch'}"
+}
+
 process {
-  executor = 'slurm'
-  clusterOptions = '--export=NONE'
-  scratch = true
-  maxRetries = 3
-  errorStrategy = { task.attempt <=3 ? 'retry' : 'finish' }
-  cache = 'lenient'
+    resourceLimits = [
+        memory: params.max_memory,
+        cpus: params.max_cpus,
+        time: params.max_time
+    ]
+    executor = 'slurm'
+    clusterOptions = {
+        // Pass --account only when SLURM_JOB_ACCOUNT is set; a null value would be interpolated as "--account=null"
+        def account = System.getenv('SLURM_JOB_ACCOUNT')
+        account ? "--account=${account} --export=NONE" : '--export=NONE'
+    }
+
+    // Only retry for specific SLURM exit codes (e.g., OOM or Timeouts)
+    errorStrategy = { task.exitStatus in [143, 137, 104, 134, 139] ? 'retry' : 'finish' }
+    maxRetries = 3
+
+    cache = 'lenient' // Tolerates minor timestamp differences on shared filesystems
 }
-
+
 executor {
-  queueSize = 1900
-  submitRateLimit = '20 sec'
+    queueSize = 500
+
+    // Job submission throttling - prevents overwhelming scheduler
+    submitRateLimit = '3 sec' // Max 3 jobs/sec (180 jobs/min)
+    pollInterval = '10 sec' // Check running jobs every 10s (reduces I/O on shared FS)
+    queueStatInterval = '2 min' // Check queue status every 2min (reduces squeue load)
+
+    // SLURM optimization flags (version-dependent, gracefully ignored if unsupported)
+    queueGlobalStatus = true // Query jobs globally, not per-partition (23.01.0+)
+    onlyJobState = true // Use --only-job-state for faster queries (25.12.0+, requires SLURM 24.05+)
+
+    // Prevents false failures when file sync is delayed across storage nodes
+    exitReadTimeout = '10 min'
+
+    // Job submission retry with exponential backoff
+    retry {
+        maxAttempts = 5 // Try up to 5 times before giving up
+        delay = '5 sec' // Delay when retrying failed job submissions
+        maxDelay = '1 min' // Cap exponential backoff at 60sec
+        jitter = 0.25 // Add ±25% randomness to avoid retry storms
+    }
 }