ROCm · jpvillam-amd · Jun 10, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/benchmarks/curl_gpt.sh b/benchmarks/curl_gpt.sh
@@ -0,0 +1,7 @@
+MODEL="${MODEL:-/data/models/gpt-oss-120b-w-mxfp4-a-fp8}" \
+curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
+  "model": ${MODEL},
+  "prompt": "What is the capital of France, and what is it known for?",
+  "temperature": 0.0,
+  "max_tokens": 100
+}'
diff --git a/benchmarks/gpt_fp4_serve.sh b/benchmarks/gpt_fp4_serve.sh
@@ -0,0 +1,7 @@
+#ROCR_VISIBLE_DEVICE=0 \
+MODEL="${MODEL:-/data/models/gpt-oss-120b-w-mxfp4-a-fp8}" \
+HSA_ENABLE_SDMA=0 USE_SVM=0 HSA_XNACK=0 \
+VLLM_ROCM_AITER_FUSED_MOE_TRITON_GEMM_A4W4=1 \
+VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1 \
+VLLM_ROCM_USE_SKINNY_GEMM=0 VLLM_ROCM_USE_AITER_RMSNORM=0 \
+vllm serve --model ${MODEL} --host localhost --port 8000 --tensor-parallel-size 1 --gpu_memory_utilization 0.7 #--compilation-config '{"mode":"None","cudagraph_mode": "FULL", "cudagraph_capture_sizes": [1]}'
@@ -1,7 +1,7 @@
 # default base image
 ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
-ARG BASE_IMAGE=rocm/vllm-dev:base
+ARG BASE_IMAGE=rocm/vllm-private:juan_455_npi_base
 ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base
 # NIC backend for MoRI RDMA support.
 # By default (all), drivers and userspace libraries for all supported NIC types
@@ -115,7 +115,7 @@ FROM base AS fetch_vllm_0
 ONBUILD COPY ./ vllm/
 FROM base AS fetch_vllm_1
 ARG VLLM_REPO="https://github.com/ROCm/vllm.git"
-ARG VLLM_BRANCH="455_wip"
+ARG VLLM_BRANCH="455_wip_nowhls"
 ENV VLLM_REPO=${VLLM_REPO}
 ENV VLLM_BRANCH=${VLLM_BRANCH}
 ONBUILD RUN git clone ${VLLM_REPO} \

@@ -1,5 +1,5 @@
 # FFM overlay for gfx1250 builds
-ARG BASE_IMAGE
+ARG BASE_IMAGE=rocm/vllm-private:juan_455_npi_test
 FROM ${BASE_IMAGE}
 
 # extract tarball into /root/x/ffm/.

@@ -1,6 +1,6 @@
-ARG BASE_IMAGE=ubuntu:24.04
+ARG BASE_IMAGE=registry-sc-harbor.amd.com/framework/therock-npi:pytorch-2.10.0-rocm7.14.0a20260605.a0-7.14.0a20260605-a0-nightly-ubuntu24.04-gfx1250
 ARG ROCM_WHEEL_INDEX=https://rocm.genesis.amd.com/whl/gfx1250
-ARG TORCH_VERSION=2.11.0+rocm7.14.0a20260605
+ARG TORCH_VERSION=2.10.0+rocm7.14.0a20260605.a0
 ARG TORCHVISION_VERSION=0.28.0a0+rocm7.14.0a20260605
 ARG TORCHAUDIO_VERSION=2.11.0a0+rocm7.14.0a20260605
 ARG ROCM_SDK_VERSION=7.14.0a20260605
@@ -44,12 +44,12 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update -y \
     && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev liblzma-dev pkg-config \
     && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
+    add-apt-repository -y ppa:deadsnakes/ppa && break || \
+    { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
     done \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
-       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
+    python${PYTHON_VERSION}-lib2to3 python-is-python3  \
     && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
     && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
@@ -71,41 +71,41 @@ ARG SCCACHE_BUCKET_NAME
 ARG SCCACHE_REGION_NAME
 ARG SCCACHE_S3_NO_CREDENTIALS
 RUN if [ "$USE_SCCACHE" = "1" ]; then \
-        echo "Installing sccache..." \
-        && SCCACHE_ARCH="x86_64" \
-        && SCCACHE_VERSION="v0.8.1" \
-        && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
-        && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
-        && tar -xzf /tmp/sccache.tar.gz -C /tmp \
-        && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
-        && chmod +x /usr/bin/sccache \
-        && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
-        && sccache --version; \
+    echo "Installing sccache..." \
+    && SCCACHE_ARCH="x86_64" \
+    && SCCACHE_VERSION="v0.8.1" \
+    && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
+    && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
+    && tar -xzf /tmp/sccache.tar.gz -C /tmp \
+    && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+    && chmod +x /usr/bin/sccache \
+    && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
+    && sccache --version; \
     fi
 
 ###
 ### Install PyTorch w/ Triton + ROCM_SDK from ROCM wheel index
 ###
-ARG ROCM_WHEEL_INDEX
-ARG TORCH_VERSION
-ARG TORCHVISION_VERSION
-ARG TORCHAUDIO_VERSION
-ARG ROCM_SDK_VERSION
-# torch/torchvision/torchaudio must be pinned to mutually-consistent builds
-# (same +rocm... suffix) or the C++ ops break at import (ABI skew). The rocm
-# sdk version is derived from torch's own dependency pin unless overridden,
-# which keeps the set consistent and avoids pip backtracking.
-RUN pip install --pre --index-url "${ROCM_WHEEL_INDEX}" \
-        "torch==${TORCH_VERSION}" \
-        "torchvision==${TORCHVISION_VERSION}" \
-        "torchaudio==${TORCHAUDIO_VERSION}" \
-        "rocm[libraries,devel]==${ROCM_SDK_VERSION}" && \
-    rocm-sdk init
+#ARG ROCM_WHEEL_INDEX
+#ARG TORCH_VERSION
+#ARG TORCHVISION_VERSION
+#ARG TORCHAUDIO_VERSION
+#ARG ROCM_SDK_VERSION
+## torch/torchvision/torchaudio must be pinned to mutually-consistent builds
+## (same +rocm... suffix) or the C++ ops break at import (ABI skew). The rocm
+## sdk version is derived from torch's own dependency pin unless overridden,
+## which keeps the set consistent and avoids pip backtracking.
+#RUN pip install --pre --index-url "${ROCM_WHEEL_INDEX}" \
+#        "torch==${TORCH_VERSION}" \
+#        "torchvision==${TORCHVISION_VERSION}" \
+#        "torchaudio==${TORCHAUDIO_VERSION}" \
+#        "rocm[libraries,devel]==${ROCM_SDK_VERSION}" && \
+#    rocm-sdk init
 
 # Torch runtime deps that may not be published on the ROCm wheel index;
 # install them from PyPI afterwards.
 RUN pip install filelock "typing-extensions>=4.10.0" "sympy>=1.13.3" \
-        "networkx>=2.5.1" jinja2 "fsspec>=0.8.5"
+    "networkx>=2.5.1" jinja2 "fsspec>=0.8.5"
 
 ENV SITE_PACKAGES=${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
 ENV ROCM_PATH=${SITE_PACKAGES}/_rocm_sdk_devel
@@ -122,10 +122,10 @@ ENV PYTHONPATH=${SITE_PACKAGES}/_rocm_sdk_core/share/amd_smi
 # Expose the rocm-sdk wheel as a conventional /opt/rocm install so downstream
 # builds (Dockerfile.rocm: vLLM csrc, RIXL/UCX, ROCShmem/DeepEP) keep working.
 RUN ln -sfn "${ROCM_PATH}" /opt/rocm;  
-    
+
 RUN if [ -f "${SITE_PACKAGES}/rocm_sdk/__init__.py" ]; then \
-        sed -i 's/rtld_global: bool = True/rtld_global: bool = False/g' \
-            "${SITE_PACKAGES}/rocm_sdk/__init__.py"; \
+    sed -i 's/rtld_global: bool = True/rtld_global: bool = False/g' \
+    "${SITE_PACKAGES}/rocm_sdk/__init__.py"; \
     fi
 
 # Setup sccache for HIP compilation via HIP_CLANG_PATH
@@ -134,13 +134,13 @@ RUN if [ -f "${SITE_PACKAGES}/rocm_sdk/__init__.py" ]; then \
 # NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
 # Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
 RUN if [ "$USE_SCCACHE" = "1" ]; then \
-        echo "Setting up sccache wrappers for HIP compilation..." \
-        && mkdir -p /opt/sccache-wrappers \
-        && printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
-        && chmod +x /opt/sccache-wrappers/clang++ \
-        && printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
-        && chmod +x /opt/sccache-wrappers/clang \
-        && echo "sccache wrappers created in /opt/sccache-wrappers"; \
+    echo "Setting up sccache wrappers for HIP compilation..." \
+    && mkdir -p /opt/sccache-wrappers \
+    && printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
+    && chmod +x /opt/sccache-wrappers/clang++ \
+    && printf '#!/bin/bash\nexec sccache ${ROCM_PATH}/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
+    && chmod +x /opt/sccache-wrappers/clang \
+    && echo "sccache wrappers created in /opt/sccache-wrappers"; \
     fi
 
 # Set sccache environment variables only when USE_SCCACHE=1
@@ -168,14 +168,14 @@ ARG MORI_BRANCH
 ARG MORI_REPO
 RUN mkdir -p /app/install; \
     if echo "${PYTORCH_ROCM_ARCH}" | grep -q "gfx1250"; then \
-        echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping MORI build"; \
+    echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping MORI build"; \
     else \
-        git clone ${MORI_REPO} \
-        && cd mori \
-        && git checkout ${MORI_BRANCH} \
-        && git submodule update --init --recursive \
-        && python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl \
-        && cp /app/mori/dist/*.whl /app/install; \
+    git clone ${MORI_REPO} \
+    && cd mori \
+    && git checkout ${MORI_BRANCH} \
+    && git submodule update --init --recursive \
+    && python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl \
+    && cp /app/mori/dist/*.whl /app/install; \
     fi
 
 
@@ -188,19 +188,19 @@ ARG FA_REPO
 ARG USE_SCCACHE
 RUN mkdir -p /app/install; \
     if echo "${PYTORCH_ROCM_ARCH}" | grep -q "gfx1250"; then \
-        echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping FlashAttention build"; \
+    echo "gfx1250 in PYTORCH_ROCM_ARCH; skipping FlashAttention build"; \
     else \
-        git clone ${FA_REPO} \
-        && cd flash-attention \
-        && git checkout ${FA_BRANCH} \
-        && git submodule update --init \
-        && if [ "$USE_SCCACHE" = "1" ]; then \
-               export HIP_CLANG_PATH=/opt/sccache-wrappers \
-               && sccache --show-stats; \
-           fi \
-        && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
-        && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
-        && cp dist/*.whl /app/install; \
+    git clone ${FA_REPO} \
+    && cd flash-attention \
+    && git checkout ${FA_BRANCH} \
+    && git submodule update --init \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+    export HIP_CLANG_PATH=/opt/sccache-wrappers \
+    && sccache --show-stats; \
+    fi \
+    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
+    && cp dist/*.whl /app/install; \
     fi
 
 
@@ -217,9 +217,9 @@ RUN cd aiter \
     && pip install -r requirements.txt
 RUN pip install pyyaml && cd aiter \
     && if [ "$USE_SCCACHE" = "1" ]; then \
-           export HIP_CLANG_PATH=/opt/sccache-wrappers \
-           && sccache --show-stats; \
-       fi \
+    export HIP_CLANG_PATH=/opt/sccache-wrappers \
+    && sccache --show-stats; \
+    fi \
     && AITER_USE_SYSTEM_TRITON=1 PREBUILD_KERNELS=${PREBUILD_KERNELS} GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
     && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
     && ls /app/aiter/dist/*.whl

@@ -48,34 +48,16 @@ def aiter_triton_kernel_w4a8_moe_forward(
     )
     from vllm.platforms.rocm import on_gfx1250
 
-    # aiter exposes its MoE routing under two module paths across versions;
-    # prefer the nested `moe.moe_routing` location, fall back to the legacy one.
-    #
-    # On gfx1250 use aiter's pure-torch ``routing_torch`` instead of the triton
-    # ``routing``: the triton routing kernel compiles a TMA (TDM) descriptor
-    # whose last dim is ``topk * 2`` bytes, which is < 16 bytes (the descriptor
-    # minimum) for a power-of-2 topk such as gpt-oss' topk=4 and fails to
-    # compile for small batches (warmup dummy run + every decode step).
-    # ``routing_torch`` avoids the TDM kernel and is numerically identical
-    # (validated on the FFM sim: gather/scatter/gate_scal match the triton path
-    # exactly where the latter compiles). DeepSeek-V4's topk=6 dodged this since
-    # ``next_power_of_2(6) == 8 != 6`` disables the descriptor branch.
+    try:
+        from aiter.ops.triton.moe.moe_routing import routing as _routing_mod
+    except ImportError:
+        from aiter.ops.triton.moe_routing import routing as _routing_mod
+
+    # TODO(JPVILLAM): The TDM routing path hits a Triton (tl) compile error on
+    # gfx1250, so it is disabled here; root-cause and sync with the Triton team.
     if on_gfx1250():
-        try:
-            from aiter.ops.triton.moe.moe_routing.routing import (
-                routing_torch as aiter_routing,
-            )
-        except ImportError:
-            from aiter.ops.triton.moe_routing.routing import (
-                routing as aiter_routing,
-            )
-    else:
-        try:
-            from aiter.ops.triton.moe.moe_routing.routing import (
-                routing as aiter_routing,
-            )
-        except ImportError:
-            from aiter.ops.triton.moe_routing.routing import routing as aiter_routing
+        _routing_mod.is_tdm_avail = lambda: False
+    aiter_routing = _routing_mod.routing
 
     routing_data, gather_idx, scatter_idx = aiter_routing(
         gating_output, topk, sm_first=not renormalize

@@ -191,9 +191,7 @@ def _get_gcn_arch() -> str:
         # current_platform is bound. This path is taken on the FFM simulator,
         # where amdsmi is unavailable (and would report the host's real gfx950
         # cards rather than the simulated gfx1250) — torch.cuda below is correct.
-        logger.debug(
-            "Failed to get GCN arch via amdsmi, falling back to torch.cuda."
-        )
+        logger.debug("Failed to get GCN arch via amdsmi, falling back to torch.cuda.")
     # Ultimate fallback: use torch.cuda (will initialize CUDA)
     return torch.cuda.get_device_properties("cuda").gcnArchName
 
@@ -206,10 +204,15 @@ def _get_gcn_arch() -> str:
 _ON_GFX1X = any(arch in _GCN_ARCH for arch in ["gfx11", "gfx12"])
 _ON_GFX12X = any(arch in _GCN_ARCH for arch in ["gfx12"])
 _ON_MI3XX = any(arch in _GCN_ARCH for arch in ["gfx942", "gfx950", "gfx1250"])
-_ON_GFX9 = any(arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950", "gfx1250"])
+_ON_GFX9 = any(
+    arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950", "gfx1250"]
+)  # TODO(JPVILLAM): Bubblegum patch to unlock gptoss
 _ON_GFX90A = "gfx90a" in _GCN_ARCH
 _ON_GFX942 = "gfx942" in _GCN_ARCH
 _ON_GFX950 = "gfx950" in _GCN_ARCH
+# TODO(JPVILLAM): Bubblegum patch to unlock DSR1 (currently disabled).
+# NOTE(review): presumably an alternative definition of _ON_GFX950 - confirm:
+#     any(arch in _GCN_ARCH for arch in ["gfx950", "gfx1250"])
 _ON_GFX1250 = "gfx1250" in _GCN_ARCH