Skip to content
Open
3 changes: 3 additions & 0 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ concurrency:
jobs:
file-changes:
name: Detect File Changes
# Skip this job when the triggering event is a review authored by a bot
# (only non-bot pull_request_review events, or any other event, proceed).
if: >
github.event_name != 'pull_request_review' ||
github.event.review.user.type != 'Bot'
runs-on: 'ubuntu-latest'
outputs:
checkall: ${{ steps.changes.outputs.checkall }}
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/frontier/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,13 @@ output_file="$job_slug.out"
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -t 01:59:00 # Duration of the job
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -p batch # Batch partition (concurrent jobs)
#SBATCH --qos=hackathon # Hackathon QOS for batch access

set -e
set -x
Expand All @@ -50,6 +51,7 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"

. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

# Translate the optional shard spec (e.g. "1/2") into a --shard flag for
# the test harness; left empty when no sharding was requested.
case "$job_shard" in
  "") shard_opts="" ;;
  *)  shard_opts="--shard $job_shard" ;;
esac

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
fi
8 changes: 5 additions & 3 deletions .github/workflows/frontier_amd/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,13 @@ output_file="$job_slug.out"
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -t 01:59:00 # Duration of the job
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -p batch # Batch partition (concurrent jobs)
#SBATCH --qos=hackathon # Hackathon QOS for batch access

set -e
set -x
Expand All @@ -50,6 +51,7 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"

. ./mfc.sh load -c famd -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/frontier_amd/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

# Translate the optional shard spec (e.g. "1/2") into a --shard flag for
# the test harness; left empty when no sharding was requested.
case "$job_shard" in
  "") shard_opts="" ;;
  *)  shard_opts="--shard $job_shard" ;;
esac

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier_amd
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier_amd
fi
1 change: 1 addition & 0 deletions .github/workflows/phoenix/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ submit_output=$(sbatch <<EOT
$sbatch_device_opts
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
#SBATCH -q embers # QOS Name
#SBATCH --requeue # Auto-requeue on preemption
#SBATCH -o$output_file # Combined output and error messages file

set -e
Expand Down
71 changes: 60 additions & 11 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:

- name: Check Formatting
run: |
./mfc.sh format -j $(nproc)
./mfc.sh format -j "$(nproc)"
git diff --exit-code || (echo "::error::Code is not formatted. Run './mfc.sh format' locally." && exit 1)

- name: Spell Check
Expand Down Expand Up @@ -138,19 +138,38 @@ jobs:

- name: Build
run: |
/bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
/bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} $TEST_ALL
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}

- name: Test
run: |
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
run: |
rm -f tests/failed_uuids.txt
TEST_EXIT=0
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$?

# Retry only if a small number of tests failed (sporadic failures)
if [ -s tests/failed_uuids.txt ]; then
NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then
FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
echo ""
echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
echo ""
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $TEST_ALL || exit $?
else
echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
exit 1
fi
elif [ "$TEST_EXIT" -ne 0 ]; then
exit $TEST_EXIT
fi
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}

self:
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
needs: [lint-gate, file-changes]
continue-on-error: false
Expand All @@ -164,50 +183,74 @@ jobs:
cluster_name: 'Georgia Tech | Phoenix'
device: 'gpu'
interface: 'acc'
shard: ''
- runner: 'gt'
cluster: 'phoenix'
cluster_name: 'Georgia Tech | Phoenix'
device: 'gpu'
interface: 'omp'
shard: ''
- runner: 'gt'
cluster: 'phoenix'
cluster_name: 'Georgia Tech | Phoenix'
device: 'cpu'
interface: 'none'
# Frontier (ORNL) — build on login node, test via SLURM
shard: ''
# Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'acc'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'acc'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'omp'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'omp'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'cpu'
interface: 'none'
# Frontier AMD — build on login node, test via SLURM
shard: ''
# Frontier AMD — build on login node, GPU tests sharded for batch partition
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'gpu'
interface: 'omp'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'gpu'
interface: 'omp'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'cpu'
interface: 'none'
shard: ''
runs-on:
group: phoenix
labels: ${{ matrix.runner }}
env:
NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
- name: Clone
uses: actions/checkout@v4
Expand All @@ -216,10 +259,16 @@ jobs:

- name: Build
if: matrix.cluster != 'phoenix'
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
uses: nick-fields/retry@v3
with:
max_attempts: 3
retry_wait_seconds: 60
timeout_minutes: 480
command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
on_retry_command: ./mfc.sh clean

- name: Test
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}

- name: Print Logs
if: always()
Expand Down
6 changes: 6 additions & 0 deletions toolchain/mfc/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,12 @@
default=False,
dest="dry_run",
),
# CLI flag backing test-suite sharding: parsed as 'i/n' downstream
# (validated in test.py), so the type here is a plain string.
Argument(
name="shard",
help="Run only a subset of tests (e.g., '1/2' for first half, '2/2' for second half).",
type=str,
default=None,
),
],
mutually_exclusive=[
MutuallyExclusiveGroup(arguments=[
Expand Down
25 changes: 25 additions & 0 deletions toolchain/mfc/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ def __filter(cases_) -> typing.List[TestCase]:
skipped_cases += example_cases
cases = [case for case in cases if case not in example_cases]

# --shard i/n: deterministically split the remaining cases across n jobs,
# this invocation keeping every case whose index is congruent to i-1 mod n.
if ARG("shard") is not None:
parts = ARG("shard").split("/")
# Validate the 'i/n' form: exactly two decimal fields with 1 <= i <= n.
if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]):
raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').")
shard_idx, shard_count = int(parts[0]), int(parts[1])
# Round-robin by index, so shards stay balanced regardless of case order;
# cases dropped here are still reported via skipped_cases.
skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]

if ARG("percent") == 100:
return cases, skipped_cases

Expand Down Expand Up @@ -182,6 +190,14 @@ def test():

# Check if we aborted due to high failure rate
if abort_tests.is_set():
# On a high-failure-rate abort, remove any failed_uuids.txt left behind so
# CI's retry step does not re-run a stale (wrong) list of tests.
failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
try:
if os.path.exists(failed_uuids_path):
os.remove(failed_uuids_path)
# Best-effort removal: a permission/race error here must not mask the
# abort path's own reporting and exit handling below.
except OSError:
pass

total_completed = nFAIL + nPASS
cons.print()
cons.unindent()
Expand All @@ -206,6 +222,15 @@ def test():
# Build the summary report
_print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)

# Persist the UUIDs of failed tests (one per line) for CI's selective
# retry logic; the file's presence/absence is itself the signal.
failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
if failed_tests:
with open(failed_uuids_path, "w") as f:
for test_info in failed_tests:
f.write(test_info['uuid'] + "\n")
# No failures this run: delete any leftover file from a previous run so
# CI never retries tests that did not fail this time.
elif os.path.exists(failed_uuids_path):
os.remove(failed_uuids_path)

exit(nFAIL)


Expand Down
Loading