Update to use CUDA 13.0 #1529
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI workflow: installs this package on top of a nightly PyTorch build, clones
# TorchTitan, and runs a TorchTitan training job with the AutoParallel llama3
# module.
name: Test TorchTitan Integration

on:
  pull_request:
  push:
    branches:
      - main
      - release/*

# On refs/heads/main the group is keyed by run number, so each run gets its own
# group. On any other ref the group is keyed by the ref itself, so a newer run
# cancels the one still in progress for that ref.
concurrency:
  group: test-torchtitan-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test-torchtitan:
    # NOTE: the CUDA version appears in four places that must be kept in sync:
    # this display name, matrix.torch-spec, matrix.gpu-arch-version, and the
    # hard-coded index URL in the script below.
    name: Test TorchTitan Integration (cuda12.6-py3.12)
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    strategy:
      fail-fast: true
      matrix:
        include:
          - name: 12xlargegpu
            runs-on: linux.g5.12xlarge.nvidia.gpu
            # Declared but not referenced anywhere in this workflow; the script
            # installs torch from a hard-coded copy of this spec instead.
            torch-spec: "--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126"
            gpu-arch-type: cuda
            # Quoted so the version stays a string rather than a float.
            gpu-arch-version: '12.6'
    with:
      # Unit is defined by linux_job_v2.yml (presumably minutes; confirm there).
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        # Create and activate a dedicated Python 3.12 conda environment.
        conda create --yes --quiet --name py312 python=3.12
        source $(conda info --base)/etc/profile.d/conda.sh
        conda activate py312
        pip install --quiet -r requirements-test.txt
        # The matrix torch-spec above isn't working for some reason, so remove
        # whatever torch is present and install the nightly build explicitly.
        pip uninstall -y torch
        pip install --no-input --quiet --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
        # Install this repository's package into the environment.
        pip install --quiet .
        # Fetch TorchTitan and install its requirements.
        git clone https://github.com/pytorch/torchtitan.git
        cd torchtitan
        pip install --quiet -r requirements.txt
        # Run TorchTitan training with AutoParallel (llama3 debug model,
        # NGPU=4 with tensor parallel degree 4).
        NGPU=4 ./run_train.sh \
          --module autoparallel.llama3 \
          --config autoparallel_llama3_debugmodel \
          --parallelism.tensor_parallel_degree 4
        # TODO: Re-enable the deepseek_v3 test once the torchtitan experiment is fixed
        # (the deepseek_v3 experiment is also disabled in torchtitan's own CI).
        # NGPU=4 ./run_train.sh \
        #   --module autoparallel.deepseek_v3 \
        #   --config autoparallel_deepseek_v3_debugmodel \
        #   --parallelism.data_parallel_shard_degree 4 \
        #   --parallelism.expert_parallel_degree 4