Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion buildkite/bootstrap-amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ if [[ -z "${NIGHTLY:-}" ]]; then
NIGHTLY=0
fi

if [[ -z "${TORCH_NIGHTLY:-}" ]]; then
TORCH_NIGHTLY=0
fi

if [[ -z "${VLLM_CI_BRANCH:-}" ]]; then
VLLM_CI_BRANCH="main"
fi
Expand Down Expand Up @@ -110,6 +114,20 @@ check_run_all_label() {
fi
}

# check_torch_nightly_label: print "true" when the current build is a pull
# request whose labels (as returned by get_pr_labels) contain
# 'ready-torch-nightly'; print "false" otherwise.
#
# Intended to be called via command substitution, e.g.
#   LABEL_TORCH_NIGHTLY=$(check_torch_nightly_label)
check_torch_nightly_label() {
    local torch_nightly_label="ready-torch-nightly"
    local pr_labels

    # Default an unset/empty BUILDKITE_PULL_REQUEST to "false" so that a
    # missing variable is treated as "not a PR" instead of falling into the
    # PR branch and querying labels without a PR number.
    if [ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]; then
        pr_labels=$(get_pr_labels)
        # Substring match against the whole label listing.
        if [[ $pr_labels == *"$torch_nightly_label"* ]]; then
            echo true
        else
            echo false
        fi
    else
        echo false # not a PR or BUILDKITE_PULL_REQUEST not set
    fi
}

# ---------------------------------------------------------------------------
# get_diff: compute changed files between commits only (no index staging).
#
Expand Down Expand Up @@ -153,13 +171,14 @@ upload_pipeline() {
# (WIP) Use pipeline generator instead of jinja template
if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
python -m pip install click pydantic
python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --torch_nightly="$TORCH_NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
buildkite-agent pipeline upload .buildkite/pipeline.yaml
exit 0
fi
echo "List file diff: $LIST_FILE_DIFF"
echo "Run all: $RUN_ALL"
echo "Nightly: $NIGHTLY"
echo "Torch Nightly: $TORCH_NIGHTLY"
echo "AMD Mirror HW: $AMD_MIRROR_HW"

FAIL_FAST=$(fail_fast)
Expand All @@ -173,6 +192,7 @@ upload_pipeline() {
-D list_file_diff="$LIST_FILE_DIFF" \
-D run_all="$RUN_ALL" \
-D nightly="$NIGHTLY" \
-D torch_nightly="$TORCH_NIGHTLY" \
-D mirror_hw="$AMD_MIRROR_HW" \
-D fail_fast="$FAIL_FAST" \
-D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
Expand Down Expand Up @@ -304,6 +324,15 @@ if [[ $LABEL_RUN_ALL == true ]]; then
echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
fi

# Check for ready-torch-nightly label: full CI built and tested against torch
# nightly, plus a full run on the pinned torch.
LABEL_TORCH_NIGHTLY=$(check_torch_nightly_label)
if [[ $LABEL_TORCH_NIGHTLY == true ]]; then
TORCH_NIGHTLY=1
RUN_ALL=1
echo "Found 'ready-torch-nightly' label. Running the full suite against torch nightly."
fi

# Decide whether to use precompiled wheels
# Relies on existing patterns array as a basis.
if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
Expand Down
28 changes: 28 additions & 0 deletions buildkite/bootstrap-intel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ if [[ -z "${NIGHTLY:-}" ]]; then
NIGHTLY=0
fi

if [[ -z "${TORCH_NIGHTLY:-}" ]]; then
TORCH_NIGHTLY=0
fi

if [[ -z "${VLLM_CI_BRANCH:-}" ]]; then
VLLM_CI_BRANCH="main"
fi
Expand Down Expand Up @@ -65,6 +69,20 @@ check_run_all_label() {
fi
}

# check_torch_nightly_label: print "true" when the current build is a pull
# request carrying the 'ready-torch-nightly' label; print "false" otherwise.
#
# Labels are fetched from the GitHub REST API for the vllm-project/vllm pull
# request numbered $BUILDKITE_PULL_REQUEST and reduced to one name per line
# with jq.
#
# Intended to be called via command substitution, e.g.
#   LABEL_TORCH_NIGHTLY=$(check_torch_nightly_label)
check_torch_nightly_label() {
    local torch_nightly_label="ready-torch-nightly"
    local pr_labels

    # Default an unset/empty BUILDKITE_PULL_REQUEST to "false" so that a
    # missing variable is treated as "not a PR" instead of issuing an API
    # request with an empty pull-request number.
    if [ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]; then
        # The GitHub REST API is served from api.github.com.
        pr_labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
        # Substring match against the newline-separated label names.
        if [[ $pr_labels == *"$torch_nightly_label"* ]]; then
            echo true
        else
            echo false
        fi
    else
        echo false # not a PR or BUILDKITE_PULL_REQUEST not set
    fi
}

clean_docker_tag() {
# Function to replace invalid characters in Docker image tags and truncate to 128 chars
# Valid characters: a-z, A-Z, 0-9, _, ., -
Expand Down Expand Up @@ -131,6 +149,7 @@ upload_pipeline() {
echo "List file diff: $LIST_FILE_DIFF"
echo "Run all: $RUN_ALL"
echo "Nightly: $NIGHTLY"
echo "Torch Nightly: $TORCH_NIGHTLY"

FAIL_FAST=$(fail_fast)

Expand Down Expand Up @@ -276,6 +295,15 @@ if [[ $LABEL_RUN_ALL == true ]]; then
echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
fi

# Check for ready-torch-nightly label: full CI built and tested against torch
# nightly, plus a full run on the pinned torch.
LABEL_TORCH_NIGHTLY=$(check_torch_nightly_label)
if [[ $LABEL_TORCH_NIGHTLY == true ]]; then
TORCH_NIGHTLY=1
RUN_ALL=1
echo "Found 'ready-torch-nightly' label. Running the full suite against torch nightly."
fi

# Decide whether to use precompiled wheels
# Relies on existing patterns array as a basis.
if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
Expand Down
3 changes: 2 additions & 1 deletion buildkite/pipeline_generator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ The generator relies on several environment variables, typically provided by Bui
* `BUILDKITE_COMMIT`: Current commit hash.
* `BUILDKITE_PULL_REQUEST`: Pull request number (or "false").
* `BUILDKITE_PULL_REQUEST_BASE_BRANCH`: Base branch for PRs.
* `NIGHTLY`: Set to "1" to force run nightly steps.
* `NIGHTLY`: Set to "1" to auto-run the curated torch-nightly steps (those tagged `mirror.torch_nightly`).
* `TORCH_NIGHTLY`: Set to "1" to build and run the *entire* test suite against torch nightly (full run, not just the tagged subset). Also forces a full run on the pinned torch. Equivalent to the `ready-torch-nightly` PR label.
* `RUN_ALL`: Set to "1" to force run all steps.
* `DOCS_ONLY_DISABLE`: Set to "0" to enable skipping CI for doc-only changes.
* `VLLM_USE_PRECOMPILED`: Set to "1" to force use of precompiled wheels.
138 changes: 0 additions & 138 deletions buildkite/pipeline_generator/buildkite_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,6 @@ def convert_group_step_to_buildkite_step(
list_file_diff = global_config["list_file_diff"]

amd_hardware_steps = []
torch_nightly_steps_collected = []

for group, steps in group_steps.items():
group_steps_list = []
Expand Down Expand Up @@ -565,10 +564,6 @@ def convert_group_step_to_buildkite_step(

group_steps_list.append(buildkite_step)

# Collect steps marked for torch nightly testing via mirror field
if step.mirror and step.mirror.get("torch_nightly") is not None:
torch_nightly_steps_collected.append(step)

# Create AMD mirror step and its block step if specified/applicable
if step.mirror and step.mirror.get("amd"):
amd = step.mirror["amd"]
Expand Down Expand Up @@ -636,13 +631,6 @@ def convert_group_step_to_buildkite_step(
BuildkiteGroupStep(group="Hardware-AMD Tests", steps=amd_hardware_steps)
)

# Create torch nightly group if any steps have mirror.torch_nightly
if torch_nightly_steps_collected:
nightly_group = _create_torch_nightly_group(
torch_nightly_steps_collected, list_file_diff, variables_to_inject
)
buildkite_group_steps.append(nightly_group)

return buildkite_group_steps


Expand Down Expand Up @@ -739,129 +727,3 @@ def _create_amd_step(
retry=AMD_RETRY,
parallelism=parallelism,
)


def _create_torch_nightly_group(
    nightly_steps: List[Step],
    list_file_diff: List[str],
    variables_to_inject: Dict[str, str],
) -> BuildkiteGroupStep:
    """Create the 'vLLM Against PyTorch Nightly' group with image build + test steps.

    The group is laid out as: an optional manual block gating the image build,
    the nightly image build step, then for every entry of ``nightly_steps`` an
    optional manual block followed by the test step itself.

    Args:
        nightly_steps: Steps to run against the torch nightly image.
        list_file_diff: Changed file paths, matched against each step's
            ``source_file_dependencies`` to decide whether it runs unblocked.
        variables_to_inject: Mapping of variable token to its value. Tokens
            with an empty value are left unsubstituted in the image build
            command.

    Returns:
        The assembled ``BuildkiteGroupStep``.
    """
    # Imported locally (as in the original) so the block does not depend on a
    # module-level import being present.
    import re as _re

    global_config = get_global_config()
    branch = global_config["branch"]
    # NIGHTLY=1 removes every manual gate in this group.
    auto_run = global_config["nightly"] == "1"

    nightly_image = get_torch_nightly_image()
    group_steps_list = []

    # Add manual block step for the image build (unless auto-run)
    if not auto_run:
        group_steps_list.append(
            BuildkiteBlockStep(
                block="Build torch nightly image",
                key="block-build-torch-nightly",
                depends_on=[],
            )
        )

    # Docker image build step — delegates to the shell script in vllm repo.
    # Resolve variables at generation time (these commands don't go through
    # _prepare_commands, so we substitute manually).
    raw_cmd = '.buildkite/image_build/image_build_torch_nightly.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG_TORCH_NIGHTLY'
    for variable, value in variables_to_inject.items():
        if not value:
            continue
        # A callable replacement inserts ``value`` literally. Passing the
        # string itself would make ``re.sub`` treat backslashes and group
        # references inside the value as template escapes.
        raw_cmd = _re.sub(
            _re.escape(variable) + r'\b',
            lambda _match, _literal=value: _literal,
            raw_cmd,
        )
    image_build_commands = [raw_cmd]

    image_build_step = BuildkiteCommandStep(
        label=":docker: build image torch nightly",
        key="image-build-torch-nightly",
        commands=image_build_commands,
        depends_on=["block-build-torch-nightly"] if not auto_run else [],
        soft_fail=True,
        agents={
            "queue": AgentQueue.CPU_POSTMERGE_US_EAST_1
            if branch == "main"
            else AgentQueue.CPU_PREMERGE_US_EAST_1,
        },
        env={"DOCKER_BUILDKIT": "1"},
        retry={
            "automatic": [
                {"exit_status": -1, "limit": 2},
                {"exit_status": -10, "limit": 2},
            ]
        },
    )
    group_steps_list.append(image_build_step)

    # Create test steps for each torch_nightly step
    for step in nightly_steps:
        # A step runs without a manual gate when the whole group is auto-run,
        # when it declares no source file dependencies, or when any changed
        # file matches one of its dependencies.
        step_auto_run = (
            auto_run
            or not step.source_file_dependencies
            or any(
                _matches_source_dependency(source_file, diff_file)
                for source_file in step.source_file_dependencies
                for diff_file in list_file_diff
            )
        )

        # Optional steps stay gated unless the whole group is auto-run.
        blocked = not step_auto_run or (step.optional and not auto_run)

        if blocked:
            block_key = f"block-torch-nightly-{_generate_step_key(step.label)}"
            group_steps_list.append(
                BuildkiteBlockStep(
                    block=f"Run Torch Nightly {step.label}",
                    depends_on=["image-build-torch-nightly"],
                    key=block_key,
                )
            )

        # Create the nightly test step using the nightly image
        nightly_plugin = _get_nightly_step_plugin(step, nightly_image)
        step_commands = _prepare_commands(step, variables_to_inject)

        nightly_test_step = BuildkiteCommandStep(
            label=f"Torch Nightly {step.label}",
            commands=step_commands,
            # Gated steps wait on their own block; others wait on the image.
            depends_on=[block_key] if blocked else ["image-build-torch-nightly"],
            soft_fail=True,
            agents=_get_step_agents(step),
            parallelism=step.parallelism,
            retry={
                "automatic": [
                    {"exit_status": -1, "limit": 1},
                    {"exit_status": -10, "limit": 1},
                ]
            },
        )
        if not step.no_plugin:
            nightly_test_step.plugins = [nightly_plugin]

        group_steps_list.append(nightly_test_step)

    return BuildkiteGroupStep(
        group="vLLM Against PyTorch Nightly", steps=group_steps_list
    )


def _get_nightly_step_plugin(step: Step, nightly_image: str):
    """Get the Docker plugin config for a torch nightly test step.

    Args:
        step: The test step being built; its ``device`` selects the plugin.
        nightly_image: Image reference passed through to the plugin builder.

    Returns:
        The result of ``get_k8s_plugin`` when ``step.device`` is one of the
        H100 / A100 / B200_K8S device values; otherwise a dict that maps
        ``"docker#v5.2.0"`` to the result of ``get_docker_plugin``.
    """
    if step.device in [
        DeviceType.H100.value,
        DeviceType.A100.value,
        DeviceType.B200_K8S.value,
    ]:
        # Plugin builders are imported at the point of use, as in the original.
        from plugin.k8s_plugin import get_k8s_plugin
        return get_k8s_plugin(step, nightly_image)

    from plugin.docker_plugin import get_docker_plugin
    return {"docker#v5.2.0": get_docker_plugin(step, nightly_image)}
10 changes: 10 additions & 0 deletions buildkite/pipeline_generator/global_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class GlobalConfig(TypedDict):
run_all_patterns: Optional[List[str]] = None
run_all_exclude_patterns: Optional[List[str]] = None
nightly: Optional[str] = "0"
torch_nightly: Optional[str] = "0"
run_all: bool = False
docs_only_disable: Optional[str] = "0"
merge_base_commit: Optional[str] = None
Expand Down Expand Up @@ -55,6 +56,7 @@ def init_global_config(pipeline_config_path: str):
run_all_patterns=pipeline_config.get("run_all_patterns", None),
run_all_exclude_patterns=pipeline_config.get("run_all_exclude_patterns", None),
nightly=os.getenv("NIGHTLY", "0"),
torch_nightly=os.getenv("TORCH_NIGHTLY", "0"),
run_all=_should_run_all(
pr_labels,
list_file_diff,
Expand All @@ -68,6 +70,11 @@ def init_global_config(pipeline_config_path: str):
if "ready-run-all-tests" in pr_labels:
config["run_all"] = True
config["nightly"] = "1"
if "ready-torch-nightly" in pr_labels:
# Full CI run built and tested against PyTorch nightly. Also force a
# full run on the pinned-torch image so both signals are produced.
config["torch_nightly"] = "1"
config["run_all"] = True
print("Config:\n")
for key, value in config.items():
print(f"{key}: {value}\n")
Expand Down Expand Up @@ -103,6 +110,9 @@ def _should_run_all(
"""Determine if the pipeline should run all tests."""
if os.getenv("RUN_ALL") == "1":
return True
if os.getenv("TORCH_NIGHTLY") == "1":
# A full torch-nightly run also runs the full suite on the pinned torch.
return True
if "ready-run-all-tests" in pr_labels:
return True
for file in list_file_diff:
Expand Down
36 changes: 36 additions & 0 deletions buildkite/tests/pipeline_generator/test_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def fake_global_config(monkeypatch):
"pull_request": "false",
"docs_only_disable": "1",
"nightly": "0",
"torch_nightly": "0",
"run_all": False,
"list_file_diff": [],
"fail_fast": False,
Expand Down Expand Up @@ -194,5 +195,40 @@ def test_amd_mirror_uses_shared_gating_with_amd_dependency_fallback(
)


def test_torch_nightly_flag_no_separate_group(fake_global_config):
    """With torch_nightly="1", no dedicated torch-nightly group is emitted.

    TORCH_NIGHTLY=1 runs the entire existing pipeline against the nightly base
    image (built by image_build.sh when TORCH_NIGHTLY=1, CUDA/GPU lane), so the
    converter must not synthesize a separate "vLLM Against PyTorch Nightly"
    group or duplicate the step under a "Torch Nightly " label.
    """
    fake_global_config["torch_nightly"] = "1"

    untagged_step = Step(
        label="Untagged test",
        group="Some Group",
        key="untagged-test",
        depends_on=["image-build"],
        working_dir="/vllm-workspace/tests",
        commands=["pytest tests/untagged.py"],
        source_file_dependencies=["tests/untagged.py"],
        device="h200_18gb",
    )

    converted_groups = buildkite_step.convert_group_step_to_buildkite_step(
        {untagged_step.group: [untagged_step]}
    )

    # The dedicated torch-nightly group must be absent.
    group_names = [converted.group for converted in converted_groups]
    assert "vLLM Against PyTorch Nightly" not in group_names

    # The step lives in its own group exactly as usual, with no nightly twin.
    some_group = next(
        converted
        for converted in converted_groups
        if converted.group == "Some Group"
    )
    command_labels = [
        entry.label
        for entry in some_group.steps
        if isinstance(entry, buildkite_step.BuildkiteCommandStep)
    ]
    assert "Untagged test" in command_labels
    assert all(
        not label.startswith("Torch Nightly ") for label in command_labels
    )


if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))