From a116bda03b85ff45652f90ca678c5d0d4b31d985 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 18 Jun 2026 17:25:43 -0400 Subject: [PATCH] ci: enable --stepmgr for Frontier jobs to relieve slurmctld pressure Each CI test/bench job is a single-node allocation that runs the full regression suite in-process via ./mfc.sh test, launching one srun per target (pre_process/simulation/post_process) for each of ~560 cases, i.e. 1700+ srun step creations per job, with up to 32 running concurrently. This congests the Frontier Slurm controller. --stepmgr delegates step management to each job's slurmstepd instead of routing every srun through slurmctld, which is the appropriate mechanism for many-step single-allocation workloads. Added via the existing extra_sbatch hook for both frontier and frontier_amd; Phoenix is unaffected. --- .github/scripts/submit-slurm-job.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index 3d6bac6293..b8404716b4 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -52,7 +52,10 @@ case "$cluster" in account="CFD154" job_prefix="MFC" qos="hackathon" - extra_sbatch="" + # Let each job's slurmstepd broker its own steps instead of routing + # every srun through slurmctld. The in-job test suite launches ~1700+ + # srun steps per allocation, which congests the Frontier controller. + extra_sbatch="#SBATCH --stepmgr" test_time="01:59:00" bench_time="01:59:00" gpu_partition_dynamic=false @@ -62,7 +65,7 @@ case "$cluster" in account="CFD154" job_prefix="MFC" qos="hackathon" - extra_sbatch="" + extra_sbatch="#SBATCH --stepmgr" test_time="01:59:00" bench_time="01:59:00" gpu_partition_dynamic=false