diff --git a/cluster.config b/cluster.config
new file mode 100644
index 0000000..49ee58c
--- /dev/null
+++ b/cluster.config
@@ -0,0 +1,61 @@
+params {
+    // Resource limits
+    max_memory = 3000.GB
+    max_cpus = 128
+    max_time = 7.d
+}
+
+apptainer {
+    enabled = true
+    // Bind-mount the paths from APPTAINER_BINDPATH into containers (default: /scratch)
+    runOptions = "-B ${System.getenv('APPTAINER_BINDPATH') ?: '/scratch'}"
+}
+
+process {
+    // Ceiling for per-task resource requests, taken from the params block above
+    resourceLimits = [
+        memory: params.max_memory,
+        cpus: params.max_cpus,
+        time: params.max_time
+    ]
+    executor = 'slurm'
+
+    // Add --account only when SLURM_JOB_ACCOUNT is set: interpolating an unset
+    // variable would render the literal option "--account=null".
+    clusterOptions = {
+        def account = System.getenv('SLURM_JOB_ACCOUNT')
+        account ? "--account=${account} --export=NONE" : '--export=NONE'
+    }
+
+    // Only retry for specific SLURM exit codes (e.g., OOM or Timeouts):
+    // 143 = SIGTERM, 137 = SIGKILL, 134 = SIGABRT, 139 = SIGSEGV (128 + signal number).
+    // NOTE(review): 104 is not a signal-derived code - confirm why it is retried.
+    errorStrategy = { task.exitStatus in [143, 137, 104, 134, 139] ? 'retry' : 'finish' }
+    maxRetries = 3
+
+    cache = 'lenient' // Tolerates minor timestamp differences on shared filesystems
+}
+
+executor {
+    queueSize = 500
+
+    // Job submission throttling - prevents overwhelming scheduler
+    submitRateLimit = '3 sec'   // Max 3 jobs/sec (180 jobs/min)
+    pollInterval = '10 sec'     // Check running jobs every 10s (reduces I/O on shared FS)
+    queueStatInterval = '2 min' // Check queue status every 2min (reduces squeue load)
+
+    // SLURM optimization flags (version-dependent, gracefully ignored if unsupported)
+    queueGlobalStatus = true // Query jobs globally, not per-partition (23.01.0+)
+    onlyJobState = true      // Use --only-job-state for faster queries (25.12.0+, requires SLURM 24.05+)
+
+    // Prevents false failures when file sync is delayed across storage nodes
+    exitReadTimeout = '10 min'
+
+    // Job submission retry with exponential backoff
+    retry {
+        maxAttempts = 5    // Try up to 5 times before giving up
+        delay = '5 sec'    // Delay when retrying failed job submissions
+        maxDelay = '1 min' // Cap exponential backoff at 60sec
+        jitter = 0.25      // Add ±25% randomness to avoid retry storms
+    }
+}
diff --git a/launch.sh b/launch.sh
index 0d171ea..9f39f18 100644
--- a/launch.sh
+++ b/launch.sh
@@ -19,6 +19,7 @@
log_dir="/scratch/gencore/GENEFLOW/alpha/logs/${fcid}/pipeline" nextflow_command="nextflow \ -log ${log_dir}/nextflow.log run /home/gencore/SCRIPTS/GENEFLOW/main.nf \ + -c /home/gencore/SCRIPTS/GENEFLOW/cluster.config \ -c /home/gencore/SCRIPTS/GENEFLOW/nextflow.config \ --run_dir_path $run_dir_path \ --trace_file_path ${log_dir}/trace.txt \