# Workflow run: "Fix int32 overflow deadlock and non-power-of-2 crash in Triton AlltoAllv (#2133)" — run #8617
# Workflow file for this run:
name: Build and test torchcomms

# Run on pushes to main and on every pull request.
on:
  push:
    branches:
      - main
  pull_request:

# Cancel superseded runs of the same workflow: group by PR number when
# triggered from a pull request, otherwise by commit SHA (push to main).
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true
jobs:
  # Build the torchcomms pip wheel for each torch/CUDA combination and
  # upload it as a run artifact (consumed by run_py_tests below).
  build_pip_wheel:
    strategy:
      fail-fast: false
      matrix:
        include:
          - runs-on: "linux.g5.12xlarge.nvidia.gpu"
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.8"
            torch-version: "stable"
          - runs-on: "linux.g5.12xlarge.nvidia.gpu"
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.8"
            torch-version: "nightly"
          - runs-on: "linux.g5.12xlarge.nvidia.gpu"
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.8"
            torch-version: "nightly"
            build-flags: "USE_NCCLX=0 USE_TRANSPORT=0"
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      # Artifact name encodes the full matrix cell so downstream jobs can
      # download the matching wheel.
      upload-artifact: build-artifacts-${{ matrix.torch-version }}-${{ matrix.gpu-arch-type }}-${{ matrix.gpu-arch-version }}-${{ matrix.build-flags }}
      script: |
        set -ex
        source .github/scripts/setup_env.sh --with-cmake --cuda-version "${{ matrix.gpu-arch-version }}" "${{ matrix.torch-version }}"
        export ${{ matrix.build-flags }}
        # Build wheel
        pip install build
        python -m build --wheel --no-isolation
        mkdir -p "${RUNNER_ARTIFACT_DIR}"
        cp dist/*.whl "${RUNNER_ARTIFACT_DIR}/"
        # Record the exact torch version used for the build so the test job
        # can install the same one.
        python -c "import torch; print(torch.__version__)" > "${RUNNER_ARTIFACT_DIR}/torch_version.txt"
build_and_run_cpp_tests:
strategy:
fail-fast: false
matrix:
include:
- runs-on: "linux.g5.12xlarge.nvidia.gpu"
gpu-arch-type: "cuda"
gpu-arch-version: "12.8"
torch-version: "stable"
- runs-on: "linux.g5.12xlarge.nvidia.gpu"
gpu-arch-type: "cuda"
gpu-arch-version: "12.8"
torch-version: "nightly"
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
timeout: 60
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
script: |
set -ex
source .github/scripts/setup_env.sh --with-cmake --cuda-version "${{ matrix.gpu-arch-version }}" "${{ matrix.torch-version }}"
# Build and run C++ tests
cmake -B build_tests -G Ninja -DBUILD_TESTS=ON
cmake --build build_tests
cd build_tests && ctest --output-on-failure
run_py_tests:
needs: build_pip_wheel
strategy:
fail-fast: false
matrix:
include:
- runs-on: "linux.g5.12xlarge.nvidia.gpu"
gpu-arch-type: "cuda"
gpu-arch-version: "12.8"
torch-version: "stable"
- runs-on: "linux.g5.12xlarge.nvidia.gpu"
gpu-arch-type: "cuda"
gpu-arch-version: "12.8"
torch-version: "nightly"
- runs-on: "linux.g5.12xlarge.nvidia.gpu"
gpu-arch-type: "cuda"
gpu-arch-version: "12.8"
torch-version: "nightly"
build-flags: "USE_NCCLX=0 USE_TRANSPORT=0"
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
timeout: 120
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
download-artifact: build-artifacts-${{ matrix.torch-version }}-${{ matrix.gpu-arch-type }}-${{ matrix.gpu-arch-version }}-${{ matrix.build-flags }}
script: |
set -ex
TORCH_VERSION=$(cat "${RUNNER_ARTIFACT_DIR}/torch_version.txt")
source .github/scripts/setup_env.sh --cuda-version "${{ matrix.gpu-arch-version }}" --torch-version "$TORCH_VERSION" "${{ matrix.torch-version }}"
export ${{ matrix.build-flags }}
# Install from pre-built wheel (skip build step)
pip install "${RUNNER_ARTIFACT_DIR}"/*.whl pytest numpy psutil parameterized pydot requests urllib3 tabulate
python -c "import torchcomms"
# Run Python tests
comms/torchcomms/scripts/run_tests_unit_py.sh
comms/torchcomms/scripts/run_tests_integration_py.sh