Update to use CUDA 13.0 #1529

Workflow file for this run

.github/workflows/test_torchtitan.yml at 3fbad1e

	# CI workflow: installs this project against a nightly PyTorch wheel, clones
	# TorchTitan, and runs a short TorchTitan training job through the
	# `autoparallel.llama3` module on a GPU runner.
	name: Test TorchTitan Integration

	# Runs on every pull request, and on pushes to `main` and `release/*` branches.
	on:
	  pull_request:
	  push:
	    branches:
	      - main
	      - release/*

	# On `main` the group key includes the run number, so every run lands in its
	# own group and is never cancelled. On any other ref the key is the ref
	# itself, so a newer run on the same ref cancels the one still in progress.
	concurrency:
	  group: test-torchtitan-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
	  cancel-in-progress: true

	jobs:
	  test-torchtitan:
	    # The CUDA/Python versions in this display name are hard-coded; keep them
	    # in sync with the matrix entry and the script below.
	    name: Test TorchTitan Integration (cuda12.6-py3.12)
	    # Delegates to the reusable Linux job workflow from pytorch/test-infra.
	    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
	    strategy:
	      fail-fast: true
	      matrix:
	        include:
	          - name: 12xlargegpu
	            runs-on: linux.g5.12xlarge.nvidia.gpu
	            # NOTE(review): `torch-spec` is not referenced anywhere in the
	            # visible `with:` inputs; the script repeats the same index URL
	            # by hand. Keep the two in sync when changing the CUDA version.
	            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
	            gpu-arch-type: "cuda"
	            # Quoted so the version stays a string rather than a float.
	            gpu-arch-version: "12.6"
	    with:
	      # NOTE(review): unit is defined by linux_job_v2.yml (presumably
	      # minutes) - confirm against the reusable workflow's inputs.
	      timeout: 60
	      runner: ${{ matrix.runs-on }}
	      gpu-arch-type: ${{ matrix.gpu-arch-type }}
	      gpu-arch-version: ${{ matrix.gpu-arch-version }}
	      submodules: recursive
	      # Literal block scalar: newlines are preserved and the text is passed
	      # to the reusable workflow as the shell script to execute.
	      script: |
	        conda create --yes --quiet --name py312 python=3.12
	        source $(conda info --base)/etc/profile.d/conda.sh
	        conda activate py312

	        pip install --quiet -r requirements-test.txt
	        # For some reason the spec above isnt working
	        pip uninstall -y torch
	        pip install --no-input --quiet --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
	        pip install --quiet .

	        # Clone TorchTitan
	        git clone https://github.com/pytorch/torchtitan.git
	        cd torchtitan
	        pip install --quiet -r requirements.txt

	        # Run TorchTitan training with AutoParallel
	        NGPU=4 ./run_train.sh \
	          --module autoparallel.llama3 \
	          --config autoparallel_llama3_debugmodel \
	          --parallelism.tensor_parallel_degree 4

	        # TODO: Re-enable deepseek_v3 test once torchtitan experiment is fixed
	        # (deepseek_v3 experiment is also disabled in torchtitan's own CI)
	        # NGPU=4 ./run_train.sh \
	        #   --module autoparallel.deepseek_v3 \
	        #   --config autoparallel_deepseek_v3_debugmodel \
	        #   --parallelism.data_parallel_shard_degree 4 \
	        #   --parallelism.expert_parallel_degree 4

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Update to use CUDA 13.0 #1529

Workflow file

Update to use CUDA 13.0 #1529

Uh oh!

Workflow file for this run