ROCm · mgehre-amd · Jun 19, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
   rev: v21.1.2
   hooks:
   - id: clang-format
-    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
+    exclude: 'csrc/(moe/topk_softmax_kernels.cu|libtorch_stable/quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
     types_or: [c++, cuda]
     args: [--style=file, --verbose]
 - repo: https://github.com/DavidAnson/markdownlint-cli2

diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
@@ -31,15 +31,15 @@ endif()
 
 if(VLLM_FLASH_ATTN_SRC_DIR)
   FetchContent_Declare(
-          vllm-flash-attn SOURCE_DIR 
+          vllm-flash-attn SOURCE_DIR
           ${VLLM_FLASH_ATTN_SRC_DIR}
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
   )
 else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG bce29425653ec0fbc579d329883030e832d15ada
+          GIT_TAG dd62dac706b1cf7895bd99b18c6cb7e7e117ee25
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

diff --git a/csrc/async_util.cuh → csrc/libtorch_stable/async_util.cuh b/csrc/async_util.cuh → csrc/libtorch_stable/async_util.cuh
diff --git a/.../epilogue/broadcast_load_epilogue_c2x.hpp → .../epilogue/broadcast_load_epilogue_c2x.hpp b/.../epilogue/broadcast_load_epilogue_c2x.hpp → .../epilogue/broadcast_load_epilogue_c2x.hpp
diff --git a/csrc/libtorch_stable/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/libtorch_stable/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
@@ -2,7 +2,7 @@
 
 #include <torch/csrc/stable/tensor.h>
 
-#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp"
+#include "broadcast_load_epilogue_c2x.hpp"
 
 /*
    This file defines custom epilogues for fusing channel scales, token scales,

diff --git a/csrc/libtorch_stable/fused_qknorm_rope_kernel.cu b/csrc/libtorch_stable/fused_qknorm_rope_kernel.cu
@@ -20,7 +20,7 @@
 
 #include "torch_utils.h"
 
-#include "../async_util.cuh"
+#include "async_util.cuh"
 #include "../cuda_compat.h"
 #include "../type_convert.cuh"
 #include "dispatch_utils.h"

diff --git a/csrc/launch_bounds_utils.h → csrc/libtorch_stable/launch_bounds_utils.h b/csrc/launch_bounds_utils.h → csrc/libtorch_stable/launch_bounds_utils.h
diff --git a/csrc/persistent_topk.cuh → csrc/libtorch_stable/persistent_topk.cuh b/csrc/persistent_topk.cuh → csrc/libtorch_stable/persistent_topk.cuh
diff --git a/csrc/libtorch_stable/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/libtorch_stable/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -25,7 +25,7 @@
 #include <cuda_fp8.h>
 
 #include "cuda_utils.h"
-#include "launch_bounds_utils.h"
+#include "libtorch_stable/launch_bounds_utils.h"
 
 // Define before including nvfp4_utils.cuh so the header
 // can use this macro during compilation.

diff --git a/csrc/libtorch_stable/quantization/fp4/mxfp4_experts_quant.cu b/csrc/libtorch_stable/quantization/fp4/mxfp4_experts_quant.cu
@@ -34,7 +34,7 @@
 static_assert(CVT_FP4_ELTS_PER_THREAD == 16,
               "MXFP4 experts quant requires PACK16 mode (CUDA >= 12.9)");
 
-#include "launch_bounds_utils.h"
+#include "libtorch_stable/launch_bounds_utils.h"
 
 namespace vllm {
 

diff --git a/csrc/libtorch_stable/quantization/fp4/nvfp4_experts_quant.cu b/csrc/libtorch_stable/quantization/fp4/nvfp4_experts_quant.cu
@@ -26,7 +26,7 @@
 
 #include "cuda_utils.h"
 #include "nvfp4_utils.cuh"
-#include "launch_bounds_utils.h"
+#include "libtorch_stable/launch_bounds_utils.h"
 
 namespace vllm {
 

diff --git a/csrc/libtorch_stable/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/libtorch_stable/quantization/fp4/nvfp4_quant_kernels.cu
@@ -26,7 +26,7 @@
 #include "../../cuda_vec_utils.cuh"
 
 #include "cuda_utils.h"
-#include "launch_bounds_utils.h"
+#include "libtorch_stable/launch_bounds_utils.h"
 
 // Define before including nvfp4_utils.cuh so the header
 // can use this macro during compilation.

diff --git a/csrc/quantization/gguf/dequantize.cuh → ...h_stable/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh → ...h_stable/quantization/gguf/dequantize.cuh
diff --git a/csrc/quantization/gguf/ggml-common.h → ...ch_stable/quantization/gguf/ggml-common.h b/csrc/quantization/gguf/ggml-common.h → ...ch_stable/quantization/gguf/ggml-common.h
diff --git a/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu b/csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu
@@ -7,14 +7,11 @@
 
 #include <torch/csrc/stable/ops.h>
 
-// NOTE: These headers are intentionally kept in csrc/quantization/gguf/ (not
-// moved to libtorch_stable) to avoid unnecessary reformatting that would break
-// git rename detection and pollute blame history.
-#include "../../../quantization/gguf/ggml-common.h"
-#include "../../../quantization/gguf/vecdotq.cuh"
-#include "../../../quantization/gguf/dequantize.cuh"
-#include "../../../quantization/gguf/mmvq.cuh"
-#include "../../../quantization/gguf/mmq.cuh"
+#include "ggml-common.h"
+#include "vecdotq.cuh"
+#include "dequantize.cuh"
+#include "mmvq.cuh"
+#include "mmq.cuh"
 #include "moe.cuh"
 #include "moe_vec.cuh"
 

diff --git a/csrc/quantization/gguf/mmq.cuh → ...libtorch_stable/quantization/gguf/mmq.cuh b/csrc/quantization/gguf/mmq.cuh → ...libtorch_stable/quantization/gguf/mmq.cuh
diff --git a/csrc/quantization/gguf/mmvq.cuh → ...ibtorch_stable/quantization/gguf/mmvq.cuh b/csrc/quantization/gguf/mmvq.cuh → ...ibtorch_stable/quantization/gguf/mmvq.cuh
diff --git a/csrc/quantization/gguf/vecdotq.cuh → ...orch_stable/quantization/gguf/vecdotq.cuh b/csrc/quantization/gguf/vecdotq.cuh → ...orch_stable/quantization/gguf/vecdotq.cuh
diff --git a/csrc/libtorch_stable/topk.cu b/csrc/libtorch_stable/topk.cu
@@ -7,7 +7,7 @@
 #include "torch_utils.h"
 
 #ifndef USE_ROCM
-  #include "../persistent_topk.cuh"
+  #include "persistent_topk.cuh"
 #endif
 
 namespace {

diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md
@@ -246,20 +246,26 @@ Every image listed in "image_files" is added to the request in the listed order
 
 The "image" shorthand accepts the same values as "image_files". The "image_url" field accepts either an OpenAI-style object with a "url" field or a URL string.
 
+By default, image references are forwarded to the serving endpoint as provided; the only change is that local image paths are converted to `file://` URLs.
+
+To have the benchmark client load local and HTTP(S) images itself, pass `--custom-ensure-client-side-data`; each image is then encoded as a base64 data URL on the client side before the request is sent.
+
+Existing `data:image/...` URLs are already self-contained and are kept unchanged.
+
 ```bash
 # need a model with vision capability here
 vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```
 
 ```bash
 # run benchmarking script
-vllm bench serve--save-result --save-detailed \
+vllm bench serve --save-result --save-detailed \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name custom_image \
   --dataset-path <path-to-your-image-data-jsonl> \
-  --allowed-local-media-path /path/to/image/folder
+  --custom-ensure-client-side-data
 ```
 
 Note that we need to use the `openai-chat` backend and `/v1/chat/completions` endpoint for multimodal inputs.

diff --git a/rust/src/chat/tests/roundtrip.rs b/rust/src/chat/tests/roundtrip.rs
@@ -20,6 +20,7 @@ use vllm_chat::{
 use vllm_text::{DecodedTextEvent, Finished, Prompt};
 
 /// One model/parser configuration used to run the fixed roundtrip fixtures.
+#[derive(Clone)]
 struct RoundtripCase {
     /// Hugging Face model id resolved through the production backend loader.
     model_id: &'static str,
@@ -31,11 +32,45 @@ struct RoundtripCase {
     tool_call_parser: ParserSelection,
     /// Reasoning parser selection used by the output processor.
     reasoning_parser: ParserSelection,
+    /// How this model's chat template handles thinking mode.
+    thinking_behavior: ThinkingBehavior,
     /// JSON formatting expected after this model's template has materialized
     /// tool-call arguments.
     json_fmt: JsonFmt,
 }
 
+#[derive(Clone, Copy)]
+enum ThinkingBehavior { // how a model's chat template reacts to the thinking kwargs
+    /// The chat template honors an explicit on/off thinking kwarg (`thinking` /
+    /// `enable_thinking`) and falls back to `default` when neither is set.
+    Toggleable { default: bool },
+    /// The template always behaves as `value`; fixtures request `value` or leave it unset.
+    Always { value: bool },
+}
+
+impl ThinkingBehavior {
+    fn default(self) -> bool { // thinking state in effect when the request sets no kwarg
+        match self {
+            Self::Toggleable { default } => default,
+            Self::Always { value } => value,
+        }
+    }
+
+    fn fixtures(self) -> Vec<Option<bool>> { // `thinking` arguments to run the fixture with
+        match self {
+            Self::Toggleable { .. } => vec![
+                Some(true),  // explicitly enable thinking
+                Some(false), // explicitly disable thinking
+                None,        // send no kwarg; the template default applies
+            ],
+            Self::Always { value } => vec![
+                Some(value), // explicitly request the supported thinking behavior
+                None,        // send no kwarg; the template default applies
+            ],
+        }
+    }
+}
+
 impl RoundtripCase {
     /// Qwen3 XML tool-call format with `qwen3` reasoning tags.
     fn qwen3() -> Self {
@@ -44,6 +79,7 @@ impl RoundtripCase {
             assistant_stop_suffix: "<|im_end|>\n",
             tool_call_parser: ParserSelection::Auto,
             reasoning_parser: ParserSelection::Auto,
+            thinking_behavior: ThinkingBehavior::Toggleable { default: true },
             json_fmt: spaced_json_fmt(),
         }
     }
@@ -55,6 +91,7 @@ impl RoundtripCase {
             assistant_stop_suffix: "<|im_end|>\n",
             tool_call_parser: ParserSelection::Auto,
             reasoning_parser: ParserSelection::Auto,
+            thinking_behavior: ThinkingBehavior::Toggleable { default: true },
             json_fmt: compact_json_fmt(),
         }
     }
@@ -66,6 +103,7 @@ impl RoundtripCase {
             assistant_stop_suffix: "[e~[\n",
             tool_call_parser: ParserSelection::Auto,
             reasoning_parser: ParserSelection::Auto,
+            thinking_behavior: ThinkingBehavior::Always { value: true },
             json_fmt: compact_json_fmt(),
         }
     }
@@ -77,6 +115,7 @@ impl RoundtripCase {
             assistant_stop_suffix: "<｜end▁of▁sentence｜>",
             tool_call_parser: ParserSelection::Auto,
             reasoning_parser: ParserSelection::Auto,
+            thinking_behavior: ThinkingBehavior::Toggleable { default: false },
             json_fmt: compact_json_fmt(),
         }
     }
@@ -88,6 +127,7 @@ impl RoundtripCase {
             assistant_stop_suffix: "",
             tool_call_parser: ParserSelection::Auto,
             reasoning_parser: ParserSelection::Auto,
+            thinking_behavior: ThinkingBehavior::Toggleable { default: true },
             json_fmt: compact_json_fmt(),
         }
     }
@@ -100,6 +140,7 @@ impl RoundtripCase {
             assistant_stop_suffix: "<|im_end|>",
             tool_call_parser: ParserSelection::Auto,
             reasoning_parser: ParserSelection::Auto,
+            thinking_behavior: ThinkingBehavior::Toggleable { default: true },
             json_fmt: spaced_json_fmt(),
         }
     }
@@ -135,35 +176,44 @@ roundtrip_tests! {
 
 /// Run the fixed reasoning+content fixture for one model/parser case.
 async fn run_roundtrip_reasoning_and_content(case: RoundtripCase) -> Result<()> {
+    for thinking in case.thinking_behavior.fixtures() { // one roundtrip per thinking setting
+        run_roundtrip_reasoning_and_content_inner(case.clone(), thinking).await?;
+    }
+    Ok(())
+}
+
+async fn run_roundtrip_reasoning_and_content_inner(
+    case: RoundtripCase,
+    thinking: Option<bool>,
+) -> Result<()> {
     let backends = load_roundtrip_backends(&case).await?;
     let request = roundtrip_request(
         "roundtrip-reasoning-content",
         vec![ChatMessage::text(ChatRole::User, "What is 2 + 2?")],
         Vec::new(),
+        thinking,
     );
     let expected_reasoning = "Need compute 2 + 2 directly.";
     let expected_text = "The answer is 4.";
+    let effective_thinking = thinking.unwrap_or(case.thinking_behavior.default());
 
-    let result = run_roundtrip(
-        &case,
-        &backends,
-        &request,
-        AssistantMessage {
-            content: vec![
-                AssistantContentBlock::Reasoning {
-                    text: expected_reasoning.to_string(),
-                },
-                AssistantContentBlock::Text {
-                    text: expected_text.to_string(),
-                },
-            ],
-        },
-    )
-    .await?;
+    let assistant = {
+        let mut content = Vec::new();
+        if effective_thinking {
+            content.push(AssistantContentBlock::Reasoning {
+                text: expected_reasoning.to_string(),
+            });
+        }
+        content.push(AssistantContentBlock::Text {
+            text: expected_text.to_string(),
+        });
+        AssistantMessage { content }
+    };
+    let result = run_roundtrip(&case, &backends, &request, assistant).await?;
 
     assert_eq!(
         result.parsed_message.reasoning().as_deref().map(str::trim),
-        Some(expected_reasoning)
+        effective_thinking.then_some(expected_reasoning)
     );
     assert_eq!(result.parsed_message.text().trim(), expected_text);
     assert_eq!(result.parsed_message.tool_calls().count(), 0);
@@ -186,6 +236,7 @@ async fn run_roundtrip_tool_call_mix(case: RoundtripCase) -> Result<()> {
             "Check Shanghai weather and add 1.00 plus 2.",
         )],
         test_tools(),
+        Some(true), // always enable thinking in this fixture
     );
     let expected_reasoning = "Need call the weather and add tools.";
     let expected_text = "I will call the tools.";
@@ -487,6 +538,7 @@ fn roundtrip_request(
     request_id: impl Into<String>,
     messages: Vec<ChatMessage>,
     tools: Vec<ChatTool>,
+    thinking: Option<bool>,
 ) -> ChatRequest {
     let mut request = ChatRequest {
         request_id: request_id.into(),
@@ -500,10 +552,12 @@ fn roundtrip_request(
         ..ChatRequest::for_test()
     };
 
-    // Enable thinking for some models so that rendering and parsing the reasoning block is
-    // exercised in the roundtrip.
-    for key in ["thinking", "enable_thinking"] {
-        request.chat_options.template_kwargs.insert(key.to_string(), true.into());
+    // Explicitly enable or disable thinking so that rendering and parsing the reasoning block is
+    // exercised or skipped in the roundtrip. If unspecified, use the default template behavior.
+    if let Some(thinking) = thinking {
+        for key in ["thinking", "enable_thinking"] {
+            request.chat_options.template_kwargs.insert(key.to_string(), thinking.into());
+        }
     }
 
     request

diff --git a/rust/src/tool-parser/src/deepseek_dsml/mod.rs b/rust/src/tool-parser/src/deepseek_dsml/mod.rs
@@ -104,7 +104,7 @@ impl DeepSeekDsmlToolParser {
                         self.tool_parameters.convert_param_with_schema(
                             &name,
                             &param.name,
-                            &param.value,
+                            param.value,
                         )
                     };
                     arguments.insert(param.name, value);