From a116bda03b85ff45652f90ca678c5d0d4b31d985 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@login10.frontier.olcf.ornl.gov>
Date: Thu, 18 Jun 2026 17:25:43 -0400
Subject: [PATCH] ci: enable --stepmgr for Frontier jobs to relieve slurmctld
 pressure

Each CI test/bench job is a single-node allocation that runs the full
regression suite in-process via ./mfc.sh test, launching one srun per
target (pre_process/simulation/post_process) for each of ~560 cases.
That amounts to ~1700+ srun step creations per job, with up to 32
running concurrently, which congests the Frontier Slurm controller.

--stepmgr delegates step management to each job's slurmstepd instead of
routing every srun through slurmctld; this delegation is the appropriate
mechanism for many-step single-allocation workloads. The flag is added
via the existing extra_sbatch hook for both frontier and frontier_amd;
Phoenix is unaffected.
---
 .github/scripts/submit-slurm-job.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh
index 3d6bac6293..b8404716b4 100755
--- a/.github/scripts/submit-slurm-job.sh
+++ b/.github/scripts/submit-slurm-job.sh
@@ -52,7 +52,10 @@ case "$cluster" in
         account="CFD154"
         job_prefix="MFC"
         qos="hackathon"
-        extra_sbatch=""
+        # Let each job's slurmstepd broker its own steps instead of routing
+        # every srun through slurmctld. The in-job test suite launches ~1700+
+        # srun steps per allocation, which congests the Frontier controller.
+        extra_sbatch="#SBATCH --stepmgr"
         test_time="01:59:00"
         bench_time="01:59:00"
         gpu_partition_dynamic=false
@@ -62,7 +65,7 @@ case "$cluster" in
         account="CFD154"
         job_prefix="MFC"
         qos="hackathon"
-        extra_sbatch=""
+        extra_sbatch="#SBATCH --stepmgr"
         test_time="01:59:00"
         bench_time="01:59:00"
         gpu_partition_dynamic=false