Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions benchmarks/curl_gpt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Send one completion request to a local vLLM OpenAI-compatible server.
# Override the target model with: MODEL=/path/to/model ./curl_gpt.sh
#
# MODEL is assigned on its own line (not as a `VAR=... cmd` prefix) so that the
# default is visible to the shell when it expands ${MODEL} in the request body.
MODEL="${MODEL:-/data/models/gpt-oss-120b-w-mxfp4-a-fp8}"
# The JSON body is single-quoted so the shell leaves it untouched; the quoting
# is closed around ${MODEL} so the value is expanded and emitted as a JSON
# string ("model": "<path>") instead of the literal, unquoted text ${MODEL}.
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
"model": "'"${MODEL}"'",
"prompt": "What is the capital of France, and what is it known for?",
"temperature": 0.0,
"max_tokens": 100
}'
7 changes: 7 additions & 0 deletions benchmarks/gpt_fp4_serve.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Launch a vLLM server for the gpt-oss MXFP4 checkpoint on ROCm with the AITER
# kernels enabled. Override the checkpoint with: MODEL=/path/to/model ./gpt_fp4_serve.sh
#
# MODEL must be resolved in the shell itself: a `MODEL=... \` command prefix is
# applied only to the child's environment, after ${MODEL} in the argument list
# has already been expanded, so the default would never reach --model.
# `export` keeps MODEL visible in the vllm process environment as before.
export MODEL="${MODEL:-/data/models/gpt-oss-120b-w-mxfp4-a-fp8}"
# To pin a single GPU, add `ROCR_VISIBLE_DEVICES=0 \` as the first line of the
# environment prefix below (it must not be left as a comment inside the chain).
#ROCR_VISIBLE_DEVICES=0
HSA_ENABLE_SDMA=0 USE_SVM=0 HSA_XNACK=0 \
VLLM_ROCM_AITER_FUSED_MOE_TRITON_GEMM_A4W4=1 \
VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1 \
VLLM_ROCM_USE_SKINNY_GEMM=0 VLLM_ROCM_USE_AITER_RMSNORM=0 \
vllm serve --model "${MODEL}" --host localhost --port 8000 --tensor-parallel-size 1 --gpu_memory_utilization 0.7 #--compilation-config '{"mode":"None","cudagraph_mode": "FULL", "cudagraph_capture_sizes": [1]}'
4 changes: 2 additions & 2 deletions docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# default base image
ARG REMOTE_VLLM="0"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
ARG BASE_IMAGE=rocm/vllm-private:juan_455_npi_base
ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base
# NIC backend for MoRI RDMA support.
# By default (all), drivers and userspace libraries for all supported NIC types
Expand Down Expand Up @@ -115,7 +115,7 @@ FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/ROCm/vllm.git"
ARG VLLM_BRANCH="455_wip"
ARG VLLM_BRANCH="455_wip_nowhls"
ENV VLLM_REPO=${VLLM_REPO}
ENV VLLM_BRANCH=${VLLM_BRANCH}
ONBUILD RUN git clone ${VLLM_REPO} \
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm_1250_ffm
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# FFM overlay for gfx1250 builds
ARG BASE_IMAGE
ARG BASE_IMAGE=rocm/vllm-private:juan_455_npi_test
FROM ${BASE_IMAGE}

# extract tarball into /root/x/ffm/.
Expand Down
126 changes: 63 additions & 63 deletions docker/Dockerfile.rocm_base
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG BASE_IMAGE=ubuntu:24.04
ARG BASE_IMAGE=registry-sc-harbor.amd.com/framework/therock-npi:pytorch-2.10.0-rocm7.14.0a20260605.a0-7.14.0a20260605-a0-nightly-ubuntu24.04-gfx1250
ARG ROCM_WHEEL_INDEX=https://rocm.genesis.amd.com/whl/gfx1250
ARG TORCH_VERSION=2.11.0+rocm7.14.0a20260605
ARG TORCH_VERSION=2.10.0+rocm7.14.0a20260605.a0
ARG TORCHVISION_VERSION=0.28.0a0+rocm7.14.0a20260605
ARG TORCHAUDIO_VERSION=2.11.0a0+rocm7.14.0a20260605
ARG ROCM_SDK_VERSION=7.14.0a20260605
Expand Down Expand Up @@ -44,12 +44,12 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev liblzma-dev pkg-config \
&& for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-lib2to3 python-is-python3 \
python${PYTHON_VERSION}-lib2to3 python-is-python3 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
Expand All @@ -71,41 +71,41 @@ ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
fi

###
### Install PyTorch w/ Triton + ROCM_SDK from ROCM wheel index
###
ARG ROCM_WHEEL_INDEX
ARG TORCH_VERSION
ARG TORCHVISION_VERSION
ARG TORCHAUDIO_VERSION
ARG ROCM_SDK_VERSION
# torch/torchvision/torchaudio must be pinned to mutually-consistent builds
# (same +rocm... suffix) or the C++ ops break at import (ABI skew). The rocm
# sdk version is derived from torch's own dependency pin unless overridden,
# which keeps the set consistent and avoids pip backtracking.
RUN pip install --pre --index-url "${ROCM_WHEEL_INDEX}" \
"torch==${TORCH_VERSION}" \
"torchvision==${TORCHVISION_VERSION}" \
"torchaudio==${TORCHAUDIO_VERSION}" \
"rocm[libraries,devel]==${ROCM_SDK_VERSION}" && \
rocm-sdk init
#ARG ROCM_WHEEL_INDEX
#ARG TORCH_VERSION
#ARG TORCHVISION_VERSION
#ARG TORCHAUDIO_VERSION
#ARG ROCM_SDK_VERSION
## torch/torchvision/torchaudio must be pinned to mutually-consistent builds
## (same +rocm... suffix) or the C++ ops break at import (ABI skew). The rocm
## sdk version is derived from torch's own dependency pin unless overridden,
## which keeps the set consistent and avoids pip backtracking.
#RUN pip install --pre --index-url "${ROCM_WHEEL_INDEX}" \
# "torch==${TORCH_VERSION}" \
# "torchvision==${TORCHVISION_VERSION}" \
# "torchaudio==${TORCHAUDIO_VERSION}" \
# "rocm[libraries,devel]==${ROCM_SDK_VERSION}" && \
# rocm-sdk init

# Torch runtime deps that may not be published on the ROCm wheel index;
# install them from PyPI afterwards.
RUN pip install filelock "typing-extensions>=4.10.0" "sympy>=1.13.3" \
"networkx>=2.5.1" jinja2 "fsspec>=0.8.5"
"networkx>=2.5.1" jinja2 "fsspec>=0.8.5"

ENV SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
ENV ROCM_PATH=${SITE_PACKAGES}/_rocm_sdk_devel
Expand All @@ -122,10 +122,10 @@ ENV PYTHONPATH=${SITE_PACKAGES}/_rocm_sdk_core/share/amd_smi
# Expose the rocm-sdk wheel as a conventional /opt/rocm install so downstream
# builds (Dockerfile.rocm: vLLM csrc, RIXL/UCX, ROCShmem/DeepEP) keep working.
RUN ln -sfn "${ROCM_PATH}" /opt/rocm;

RUN if [ -f "${SITE_PACKAGES}/rocm_sdk/__init__.py" ]; then \
sed -i 's/rtld_global: bool = True/rtld_global: bool = False/g' \
"${SITE_PACKAGES}/rocm_sdk/__init__.py"; \
sed -i 's/rtld_global: bool = True/rtld_global: bool = False/g' \
"${SITE_PACKAGES}/rocm_sdk/__init__.py"; \
fi

# Setup sccache for HIP compilation via HIP_CLANG_PATH
Expand All @@ -134,13 +134,13 @@ RUN if [ -f "${SITE_PACKAGES}/rocm_sdk/__init__.py" ]; then \
# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Setting up sccache wrappers for HIP compilation..." \
&& mkdir -p /opt/sccache-wrappers \
&& printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
&& chmod +x /opt/sccache-wrappers/clang++ \
&& printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
&& chmod +x /opt/sccache-wrappers/clang \
&& echo "sccache wrappers created in /opt/sccache-wrappers"; \
echo "Setting up sccache wrappers for HIP compilation..." \
&& mkdir -p /opt/sccache-wrappers \
&& printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
&& chmod +x /opt/sccache-wrappers/clang++ \
&& printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
&& chmod +x /opt/sccache-wrappers/clang \
&& echo "sccache wrappers created in /opt/sccache-wrappers"; \
fi

# Set sccache environment variables only when USE_SCCACHE=1
Expand Down Expand Up @@ -168,14 +168,14 @@ ARG MORI_BRANCH
ARG MORI_REPO
RUN mkdir -p /app/install; \
if echo "${PYTORCH_ROCM_ARCH}" | grep -q "gfx1250"; then \
echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping MORI build"; \
echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping MORI build"; \
else \
git clone ${MORI_REPO} \
&& cd mori \
&& git checkout ${MORI_BRANCH} \
&& git submodule update --init --recursive \
&& python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl \
&& cp /app/mori/dist/*.whl /app/install; \
git clone ${MORI_REPO} \
&& cd mori \
&& git checkout ${MORI_BRANCH} \
&& git submodule update --init --recursive \
&& python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl \
&& cp /app/mori/dist/*.whl /app/install; \
fi


Expand All @@ -188,19 +188,19 @@ ARG FA_REPO
ARG USE_SCCACHE
RUN mkdir -p /app/install; \
if echo "${PYTORCH_ROCM_ARCH}" | grep -q "gfx1250"; then \
echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping FlashAttention build"; \
echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping FlashAttention build"; \
else \
git clone ${FA_REPO} \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& cp dist/*.whl /app/install; \
git clone ${FA_REPO} \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& cp dist/*.whl /app/install; \
fi


Expand All @@ -217,9 +217,9 @@ RUN cd aiter \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& AITER_USE_SYSTEM_TRITON=1 PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& ls /app/aiter/dist/*.whl
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,34 +48,16 @@ def aiter_triton_kernel_w4a8_moe_forward(
)
from vllm.platforms.rocm import on_gfx1250

# aiter exposes its MoE routing under two module paths across versions;
# prefer the nested `moe.moe_routing` location, fall back to the legacy one.
#
# On gfx1250 use aiter's pure-torch ``routing_torch`` instead of the triton
# ``routing``: the triton routing kernel compiles a TMA (TDM) descriptor
# whose last dim is ``topk * 2`` bytes, which is < 16 bytes (the descriptor
# minimum) for a power-of-2 topk such as gpt-oss' topk=4 and fails to
# compile for small batches (warmup dummy run + every decode step).
# ``routing_torch`` avoids the TDM kernel and is numerically identical
# (validated on the FFM sim: gather/scatter/gate_scal match the triton path
# exactly where the latter compiles). DeepSeek-V4's topk=6 dodged this since
# ``next_power_of_2(6) == 8 != 6`` disables the descriptor branch.
try:
from aiter.ops.triton.moe.moe_routing import routing as _routing_mod
except ImportError:
from aiter.ops.triton.moe_routing import routing as _routing_mod

# TODO: (JPVILLAM) This causes a tl compile error on 1250.
# Need to figure out why this is a problem and sync with triton team
if on_gfx1250():
try:
from aiter.ops.triton.moe.moe_routing.routing import (
routing_torch as aiter_routing,
)
except ImportError:
from aiter.ops.triton.moe_routing.routing import (
routing as aiter_routing,
)
else:
try:
from aiter.ops.triton.moe.moe_routing.routing import (
routing as aiter_routing,
)
except ImportError:
from aiter.ops.triton.moe_routing.routing import routing as aiter_routing
_routing_mod.is_tdm_avail = lambda: False
aiter_routing = _routing_mod.routing

routing_data, gather_idx, scatter_idx = aiter_routing(
gating_output, topk, sm_first=not renormalize
Expand Down
11 changes: 7 additions & 4 deletions vllm/platforms/rocm.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,7 @@ def _get_gcn_arch() -> str:
# current_platform is bound. This path is taken on the FFM simulator,
# where amdsmi is unavailable (and would report the host's real gfx950
# cards rather than the simulated gfx1250) — torch.cuda below is correct.
logger.debug(
"Failed to get GCN arch via amdsmi, falling back to torch.cuda."
)
logger.debug("Failed to get GCN arch via amdsmi, falling back to torch.cuda.")
# Ultimate fallback: use torch.cuda (will initialize CUDA)
return torch.cuda.get_device_properties("cuda").gcnArchName

Expand All @@ -206,10 +204,15 @@ def _get_gcn_arch() -> str:
_ON_GFX1X = any(arch in _GCN_ARCH for arch in ["gfx11", "gfx12"])
_ON_GFX12X = any(arch in _GCN_ARCH for arch in ["gfx12"])
_ON_MI3XX = any(arch in _GCN_ARCH for arch in ["gfx942", "gfx950", "gfx1250"])
_ON_GFX9 = any(arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950", "gfx1250"])
_ON_GFX9 = any(
arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950", "gfx1250"]
) # TODO(JPVILLAM): Bubblegum patch to unlock gptoss
_ON_GFX90A = "gfx90a" in _GCN_ARCH
_ON_GFX942 = "gfx942" in _GCN_ARCH
_ON_GFX950 = "gfx950" in _GCN_ARCH
# any(
# arch in _GCN_ARCH for arch in ["gfx950", "gfx1250"]
# ) # TODO(JPVILLAM): Bubblegum patch to unlock DSR1
_ON_GFX1250 = "gfx1250" in _GCN_ARCH


Expand Down
Loading