121 changes: 47 additions & 74 deletions .github/workflows/build.yml
@@ -12,14 +12,15 @@ jobs:
name: 'Core'
runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04
image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
options: --user root
steps:
- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja
git config --global --add safe.directory '*'
- name: 'Checkout'
uses: actions/checkout@v3
with:
@@ -32,125 +33,97 @@ jobs:
NVTE_FRAMEWORK: none
MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
NVTE_CUDA_ARCHS: "100"
- name: 'Sanity check'
run: python3 -c "import transformer_engine"
working-directory: /
pytorch:
name: 'PyTorch'
runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
options: --user root
steps:
- name: Move /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

- name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'

- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja pydantic importlib-metadata>=1.0 packaging numpy einops onnxscript
pip install torch --index-url https://download.pytorch.org/whl/cu130
git config --global --add safe.directory '*'
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive

- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity

- name: 'Dependencies'
run: |
docker exec builder bash -c '\
apt-get update && \
apt-get install -y git python3.9 pip cudnn9-cuda-12 && \
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \
apt-get clean \
'

- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- name: 'Build'
run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps'
run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v --no-deps
env:
NVTE_FRAMEWORK: pytorch
MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
NVTE_CUDA_ARCHS: "100"
- name: 'Sanity check'
run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py'
run: python3 tests/pytorch/test_sanity_import.py
jax:
name: 'JAX'
runs-on: ubuntu-latest
container:
image: ghcr.io/nvidia/jax:jax
image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
options: --user root
Comment on lines 72 to 74 (Contributor):

Switched from ghcr.io/nvidia/jax:jax to base CUDA container - verify JAX[cuda12] install is compatible with CUDA 12.1 and includes all necessary dependencies
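
A minimal post-install probe along these lines (a sketch, assuming the CUDA-enabled jax wheel variant is what actually got installed in this container) would surface an incompatible install directly in the job log:

# Probe which backend jax actually initialised. On a GPU-less CI runner the CUDA
# plugin falls back to CPU with a warning, which is still visible in the log.
python3 -c "import jax; print('jax', jax.__version__); print('devices:', jax.devices())"
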
steps:
- name: 'Dependencies'
run: pip install cmake==3.21.0 pybind11[global]
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja packaging
pip install jax[cuda13] flax[cuda13]
Contributor comment:

Verify jax[cuda13] is a valid extra. JAX typically uses extras like jax[cuda12_local] or jax[cuda12_pip] (see build_tools/wheel_utils/build_wheels.sh:66). Also note that transformer_engine/jax/pyproject.toml:6 specifies jax[cuda12]. Check JAX documentation to confirm cuda13 is the correct syntax for CUDA 13.0.

Contributor comment:

jax[cuda13] syntax is likely invalid. JAX typically uses extras like jax[cuda12_local] or jax[cuda12_pip] (see build_tools/wheel_utils/build_wheels.sh:66). Also, transformer_engine/jax/pyproject.toml:6 specifies jax[cuda12]. This will fail to install the CUDA-enabled version.

Suggested change:
- pip install jax[cuda13] flax[cuda13]
+ pip install "jax[cuda12_pip]" "flax[cuda12_pip]"
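
One way to check the extra up front (a sketch, assuming pip >= 22.2 for --dry-run and network access to PyPI; the jax-cuda* plugin package names are also an assumption):

# Resolve the extra without installing anything. pip only warns on an unknown
# extra, so also check that some jax CUDA plugin package shows up in the
# resolved set rather than plain CPU-only jax/jaxlib.
pip install --dry-run "jax[cuda13]" 2>&1 | tee /tmp/jax-resolve.log
grep -i "jax-cuda" /tmp/jax-resolve.log || echo "no CUDA plugin resolved; extra not recognized"
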
git config --global --add safe.directory '*'
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- name: 'Build'
run: |
NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v
env:
NVTE_FRAMEWORK: jax
MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
NVTE_CUDA_ARCHS: "100"
- name: 'Sanity check'
run: python3 tests/jax/test_sanity_import.py
all:
name: 'All'
runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
options: --user root
steps:
- name: Move /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

- name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'

- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja pydantic importlib-metadata>=1.0 packaging numpy einops onnxscript
pip install torch --index-url https://download.pytorch.org/whl/cu130
pip install jax[cuda13] flax[cuda13]
Contributor comment:

Same concern as JAX job: verify jax[cuda13] is the correct syntax for CUDA 13.0 installation

Contributor comment:

Same issue as JAX job: jax[cuda13] and flax[cuda13] are invalid extras. Use jax[cuda12_pip] and flax[cuda12_pip] instead.

Suggested change:
- pip install jax[cuda13] flax[cuda13]
+ pip install jax[cuda12_pip] flax[cuda12_pip]
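
If the extras are kept as written, a fail-fast check after the install (a sketch, under the assumption that JAX ships its CUDA support as separate jax-cuda* plugin distributions) would catch a CPU-only resolution before the sanity imports run:

# Fail the job if pip resolved plain jax with no CUDA plugin alongside it.
python3 - <<'PY'
import importlib.metadata as md
names = sorted(str(d.metadata["Name"]) for d in md.distributions())
assert any(n.lower().startswith("jax-cuda") for n in names), f"no jax CUDA plugin found in: {names}"
PY
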
git config --global --add safe.directory '*'
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive

- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity

- name: 'Dependencies'
run: |
docker exec builder bash -c '\
pip install cmake==3.21.0 pybind11[global] einops onnxscript && \
pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
'
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- name: 'Build'
run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps'
run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v --no-deps
env:
NVTE_FRAMEWORK: all
MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
NVTE_CUDA_ARCHS: "100"
- name: 'Sanity check'
run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py'
run: |
python3 tests/pytorch/test_sanity_import.py
python3 tests/jax/test_sanity_import.py
7 changes: 4 additions & 3 deletions .github/workflows/deploy_nightly_docs.yml
@@ -7,6 +7,7 @@ name: Deploy nightly docs
on:
push:
branches: [ "main" ]
workflow_dispatch:
jobs:
build:
uses: ./.github/workflows/docs.yml
@@ -21,9 +22,8 @@ jobs:
name: "te_docs"
path: "html"
- name: Prepare for pages
uses: actions/upload-pages-artifact@v1.0.7
uses: actions/upload-pages-artifact@v3
with:
name: github-pages
path: "html"
deploy:
needs: prepare
@@ -36,4 +36,5 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Deploy
uses: actions/deploy-pages@v2.0.0
id: deployment
uses: actions/deploy-pages@v4