Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
ea0d045
[FlashAttention] Sync FA with upstream (#44065)
MatthewBonanni Jun 2, 2026
c91a87f
[BugFix] [GDN] Read linear_key_head_dim from hf_text_config for multi…
IdoAtadTD Jun 2, 2026
6314de8
[XPU] [Bug] remove xpuw4a16 output size check (#44168)
zufangzhu Jun 2, 2026
880fc03
[Rust Frontend] Support recursive tool parameter conversion (#44299)
BugenZhao Jun 2, 2026
88f1721
[ROCm] Fix AITER RMSNormQuantFusion for Kimi-Linear (#44308)
pschlan-amd Jun 2, 2026
586201e
[Rust Frontend] Cover different thinking modes in roundtrip tests (#4…
BugenZhao Jun 2, 2026
4d93bc3
Migrate header files to torch stable abi (#44013)
cleonard530 Jun 2, 2026
53fa09d
[Misc] Support local image encoding in benchmarks (#43843)
xiaozcy Jun 2, 2026
774e552
[compressed-tensors] Asymmetric support for MoE WNA16 marlin (#44025)
brian-dellabetta Jun 2, 2026
cab5c9a
[Core] Move `max_concurrent_batches` to `VllmConfig` (#44274)
njhill Jun 2, 2026
478b49d
[Refactor] Remove dead code from parser infrastructure (#44279)
sfeng33 Jun 2, 2026
3f3e270
[XPU] Enable rms_norm/act quant fusions (#43963)
zhenwei-intel Jun 2, 2026
afcb580
[BugFix] Fix Humming MoE deploy error (#43100)
adotdad Jun 2, 2026
fe32e78
[Bugfix] flashinfer: fail fast when --kv-cache-dtype nvfp4 used on un…
Kartavyasonar Jun 2, 2026
2427094
[Feature] Support EPLB for DeepSeek v4 Mega Moe (#43339)
wzhao18 Jun 2, 2026
ed9a752
[Anthropic] Support system role messages inside messages array (#44283)
chaunceyjiang Jun 2, 2026
da107a5
[MRV2] Also enable MRV2 for Llama and Mistral dense models (#43458)
njhill Jun 2, 2026
6158437
Merge commit 'da107a59e5' into merge-from-upstream
eble-amd Jun 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
rev: v21.1.2
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
exclude: 'csrc/(moe/topk_softmax_kernels.cu|libtorch_stable/quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/DavidAnson/markdownlint-cli2
Expand Down
4 changes: 2 additions & 2 deletions cmake/external_projects/vllm_flash_attn.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ endif()

if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG bce29425653ec0fbc579d329883030e832d15ada
GIT_TAG dd62dac706b1cf7895bd99b18c6cb7e7e117ee25
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#include <torch/csrc/stable/tensor.h>

#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp"
#include "broadcast_load_epilogue_c2x.hpp"

/*
This file defines custom epilogues for fusing channel scales, token scales,
Expand Down
2 changes: 1 addition & 1 deletion csrc/libtorch_stable/fused_qknorm_rope_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

#include "torch_utils.h"

#include "../async_util.cuh"
#include "async_util.cuh"
#include "../cuda_compat.h"
#include "../type_convert.cuh"
#include "dispatch_utils.h"
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include <cuda_fp8.h>

#include "cuda_utils.h"
#include "launch_bounds_utils.h"
#include "libtorch_stable/launch_bounds_utils.h"

// Define before including nvfp4_utils.cuh so the header
// can use this macro during compilation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
static_assert(CVT_FP4_ELTS_PER_THREAD == 16,
"MXFP4 experts quant requires PACK16 mode (CUDA >= 12.9)");

#include "launch_bounds_utils.h"
#include "libtorch_stable/launch_bounds_utils.h"

namespace vllm {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

#include "cuda_utils.h"
#include "nvfp4_utils.cuh"
#include "launch_bounds_utils.h"
#include "libtorch_stable/launch_bounds_utils.h"

namespace vllm {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "../../cuda_vec_utils.cuh"

#include "cuda_utils.h"
#include "launch_bounds_utils.h"
#include "libtorch_stable/launch_bounds_utils.h"

// Define before including nvfp4_utils.cuh so the header
// can use this macro during compilation.
Expand Down
13 changes: 5 additions & 8 deletions csrc/libtorch_stable/quantization/gguf/gguf_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,11 @@

#include <torch/csrc/stable/ops.h>

// NOTE: These headers are intentionally kept in csrc/quantization/gguf/ (not
// moved to libtorch_stable) to avoid unnecessary reformatting that would break
// git rename detection and pollute blame history.
#include "../../../quantization/gguf/ggml-common.h"
#include "../../../quantization/gguf/vecdotq.cuh"
#include "../../../quantization/gguf/dequantize.cuh"
#include "../../../quantization/gguf/mmvq.cuh"
#include "../../../quantization/gguf/mmq.cuh"
#include "ggml-common.h"
#include "vecdotq.cuh"
#include "dequantize.cuh"
#include "mmvq.cuh"
#include "mmq.cuh"
#include "moe.cuh"
#include "moe_vec.cuh"

Expand Down
2 changes: 1 addition & 1 deletion csrc/libtorch_stable/topk.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "torch_utils.h"

#ifndef USE_ROCM
#include "../persistent_topk.cuh"
#include "persistent_topk.cuh"
#endif

namespace {
Expand Down
10 changes: 8 additions & 2 deletions docs/benchmarking/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,20 +246,26 @@ Every image listed in "image_files" is added to the request in the listed order

The "image" shorthand accepts the same values as "image_files". The "image_url" field accepts either an OpenAI-style object with a "url" field or a URL string.

By default, image references are sent to the serving endpoint as provided, with local image paths converted to `file://` URLs.

If the benchmark client should load local and HTTP(S) images before sending requests, pass `--custom-ensure-client-side-data` to encode them as base64 data URLs on the client side.

Existing `data:image/...` URLs are already self-contained and are kept unchanged.

```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct
```

```bash
# run benchmarking script
vllm bench serve--save-result --save-detailed \
vllm bench serve --save-result --save-detailed \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name custom_image \
--dataset-path <path-to-your-image-data-jsonl> \
--allowed-local-media-path /path/to/image/folder
--custom-ensure-client-side-data
```

Note that we need to use the `openai-chat` backend and `/v1/chat/completions` endpoint for multimodal inputs.
Expand Down
96 changes: 75 additions & 21 deletions rust/src/chat/tests/roundtrip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use vllm_chat::{
use vllm_text::{DecodedTextEvent, Finished, Prompt};

/// One model/parser configuration used to run the fixed roundtrip fixtures.
#[derive(Clone)]
struct RoundtripCase {
/// Hugging Face model id resolved through the production backend loader.
model_id: &'static str,
Expand All @@ -31,11 +32,45 @@ struct RoundtripCase {
tool_call_parser: ParserSelection,
/// Reasoning parser selection used by the output processor.
reasoning_parser: ParserSelection,
/// How this model's chat template handles thinking mode.
thinking_behavior: ThinkingBehavior,
/// JSON formatting expected after this model's template has materialized
/// tool-call arguments.
json_fmt: JsonFmt,
}

/// Describes how a model's chat template reacts to the thinking-mode kwargs.
#[derive(Clone, Copy)]
enum ThinkingBehavior {
    /// The template honors explicit thinking on/off kwargs; when the request
    /// carries neither kwarg it falls back to `default`.
    Toggleable { default: bool },
    /// The template behaves as `value` for this fixture regardless of kwargs.
    Always { value: bool },
}

impl ThinkingBehavior {
    /// Thinking mode in effect when the request does not specify it.
    fn default(self) -> bool {
        match self {
            Self::Toggleable { default: mode } | Self::Always { value: mode } => mode,
        }
    }

    /// Thinking settings to exercise for this behavior.
    ///
    /// Each `Some(flag)` entry is an explicit request; the trailing `None`
    /// leaves the setting unspecified so the template default applies.
    fn fixtures(self) -> Vec<Option<bool>> {
        // Explicit values first: both switches for a toggleable template, only
        // the supported one for a fixed template.
        let explicit = match self {
            Self::Toggleable { .. } => vec![true, false],
            Self::Always { value } => vec![value],
        };

        explicit
            .into_iter()
            .map(Some)
            // Finish with the "unspecified" case (default template behavior).
            .chain(std::iter::once(None))
            .collect()
    }
}

impl RoundtripCase {
/// Qwen3 XML tool-call format with `qwen3` reasoning tags.
fn qwen3() -> Self {
Expand All @@ -44,6 +79,7 @@ impl RoundtripCase {
assistant_stop_suffix: "<|im_end|>\n",
tool_call_parser: ParserSelection::Auto,
reasoning_parser: ParserSelection::Auto,
thinking_behavior: ThinkingBehavior::Toggleable { default: true },
json_fmt: spaced_json_fmt(),
}
}
Expand All @@ -55,6 +91,7 @@ impl RoundtripCase {
assistant_stop_suffix: "<|im_end|>\n",
tool_call_parser: ParserSelection::Auto,
reasoning_parser: ParserSelection::Auto,
thinking_behavior: ThinkingBehavior::Toggleable { default: true },
json_fmt: compact_json_fmt(),
}
}
Expand All @@ -66,6 +103,7 @@ impl RoundtripCase {
assistant_stop_suffix: "[e~[\n",
tool_call_parser: ParserSelection::Auto,
reasoning_parser: ParserSelection::Auto,
thinking_behavior: ThinkingBehavior::Always { value: true },
json_fmt: compact_json_fmt(),
}
}
Expand All @@ -77,6 +115,7 @@ impl RoundtripCase {
assistant_stop_suffix: "<|end▁of▁sentence|>",
tool_call_parser: ParserSelection::Auto,
reasoning_parser: ParserSelection::Auto,
thinking_behavior: ThinkingBehavior::Toggleable { default: false },
json_fmt: compact_json_fmt(),
}
}
Expand All @@ -88,6 +127,7 @@ impl RoundtripCase {
assistant_stop_suffix: "",
tool_call_parser: ParserSelection::Auto,
reasoning_parser: ParserSelection::Auto,
thinking_behavior: ThinkingBehavior::Toggleable { default: true },
json_fmt: compact_json_fmt(),
}
}
Expand All @@ -100,6 +140,7 @@ impl RoundtripCase {
assistant_stop_suffix: "<|im_end|>",
tool_call_parser: ParserSelection::Auto,
reasoning_parser: ParserSelection::Auto,
thinking_behavior: ThinkingBehavior::Toggleable { default: true },
json_fmt: spaced_json_fmt(),
}
}
Expand Down Expand Up @@ -135,35 +176,44 @@ roundtrip_tests! {

/// Run the fixed reasoning+content fixture for one model/parser case, once per
/// thinking setting listed by the case's `thinking_behavior.fixtures()`.
///
/// Stops at, and returns, the first error reported by a per-setting run.
async fn run_roundtrip_reasoning_and_content(case: RoundtripCase) -> Result<()> {
    let modes = case.thinking_behavior.fixtures();
    for mode in modes {
        // The inner runner takes the case by value, so each run gets its own copy.
        let scenario = case.clone();
        run_roundtrip_reasoning_and_content_inner(scenario, mode).await?;
    }
    Ok(())
}

async fn run_roundtrip_reasoning_and_content_inner(
case: RoundtripCase,
thinking: Option<bool>,
) -> Result<()> {
let backends = load_roundtrip_backends(&case).await?;
let request = roundtrip_request(
"roundtrip-reasoning-content",
vec![ChatMessage::text(ChatRole::User, "What is 2 + 2?")],
Vec::new(),
thinking,
);
let expected_reasoning = "Need compute 2 + 2 directly.";
let expected_text = "The answer is 4.";
let effective_thinking = thinking.unwrap_or(case.thinking_behavior.default());

let result = run_roundtrip(
&case,
&backends,
&request,
AssistantMessage {
content: vec![
AssistantContentBlock::Reasoning {
text: expected_reasoning.to_string(),
},
AssistantContentBlock::Text {
text: expected_text.to_string(),
},
],
},
)
.await?;
let assistant = {
let mut content = Vec::new();
if effective_thinking {
content.push(AssistantContentBlock::Reasoning {
text: expected_reasoning.to_string(),
});
}
content.push(AssistantContentBlock::Text {
text: expected_text.to_string(),
});
AssistantMessage { content }
};
let result = run_roundtrip(&case, &backends, &request, assistant).await?;

assert_eq!(
result.parsed_message.reasoning().as_deref().map(str::trim),
Some(expected_reasoning)
effective_thinking.then_some(expected_reasoning)
);
assert_eq!(result.parsed_message.text().trim(), expected_text);
assert_eq!(result.parsed_message.tool_calls().count(), 0);
Expand All @@ -186,6 +236,7 @@ async fn run_roundtrip_tool_call_mix(case: RoundtripCase) -> Result<()> {
"Check Shanghai weather and add 1.00 plus 2.",
)],
test_tools(),
Some(true), // always enable thinking in this fixture
);
let expected_reasoning = "Need call the weather and add tools.";
let expected_text = "I will call the tools.";
Expand Down Expand Up @@ -487,6 +538,7 @@ fn roundtrip_request(
request_id: impl Into<String>,
messages: Vec<ChatMessage>,
tools: Vec<ChatTool>,
thinking: Option<bool>,
) -> ChatRequest {
let mut request = ChatRequest {
request_id: request_id.into(),
Expand All @@ -500,10 +552,12 @@ fn roundtrip_request(
..ChatRequest::for_test()
};

// Enable thinking for some models so that rendering and parsing the reasoning block is
// exercised in the roundtrip.
for key in ["thinking", "enable_thinking"] {
request.chat_options.template_kwargs.insert(key.to_string(), true.into());
// Explicitly enable or disable thinking so that rendering and parsing the reasoning block is
// exercised or skipped in the roundtrip. If unspecified, use the default template behavior.
if let Some(thinking) = thinking {
for key in ["thinking", "enable_thinking"] {
request.chat_options.template_kwargs.insert(key.to_string(), thinking.into());
}
}

request
Expand Down
2 changes: 1 addition & 1 deletion rust/src/tool-parser/src/deepseek_dsml/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ impl DeepSeekDsmlToolParser {
self.tool_parameters.convert_param_with_schema(
&name,
&param.name,
&param.value,
param.value,
)
};
arguments.insert(param.name, value);
Expand Down
Loading
Loading