Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions fastdeploy/engine/sched/resource_manager_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
EncoderCacheManager,
ProcessorCacheManager,
)
from fastdeploy.config import ErnieArchitectures
from fastdeploy.engine.request import (
ImagePosition,
Request,
Expand Down Expand Up @@ -761,13 +760,6 @@ def schedule(self):
Try to pull a batch of requests from the waiting queue and schedule them.
"""

def get_enough_request(request, scheduled_reqs):
return (
ErnieArchitectures.is_ernie5_arch(self.config.model_config.architectures)
and self._is_mm_request(request)
and self.exist_mm_prefill(scheduled_reqs)
)

with self.lock:
scheduled_reqs: list[Request] = []
preempted_reqs: list[Request] = []
Expand Down Expand Up @@ -910,9 +902,6 @@ def _allocate_decode_and_extend():
):
req_index += 1
continue
if get_enough_request(request, scheduled_reqs):
req_index += 1
continue
num_new_tokens = self._get_num_new_tokens(request, token_budget)
if num_new_tokens == 0:
req_index += 1
Expand Down Expand Up @@ -964,8 +953,6 @@ def _allocate_decode_and_extend():
break

request = self.waiting[0]
if get_enough_request(request, scheduled_reqs):
break
if request.status == RequestStatus.WAITING:
result = self.waiting_async_process(request)
if result is None:
Expand Down
94 changes: 93 additions & 1 deletion fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
speculate_schedule_cache,
set_data_ipc,
unset_data_ipc,
update_attn_mask_offsets,
)

import zmq
Expand Down Expand Up @@ -179,6 +180,9 @@ def __init__(
else:
self.encoder_cache = None

# Note(Zhengshifeng) init video cache for VL model
self.video_cache = {}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 self.video_cache = {} 初始化后在本文件中未见任何读写

video_cache 被初始化但在此 PR 所有变更中均未使用(video_features 直接通过 self.share_inputs 传递)。若此字段用于后续功能,建议添加注释说明预期用途;否则建议移除,避免混淆。


Comment on lines +183 to +185
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.video_cache 在本文件中只初始化但未被任何逻辑读取/写入(search 仅命中这一处)。如果不是本 PR 范围内即将使用的字段,建议删除避免误导;如果后续会用到,建议至少在注释里说明其生命周期/访问路径,或在本 PR 中补齐使用点。

Suggested change
# Note(Zhengshifeng) init video cache for VL model
self.video_cache = {}

Copilot uses AI. Check for mistakes.
# Sampler
if not self.speculative_decoding:
self.sampler = Sampler(fd_config)
Expand Down Expand Up @@ -497,6 +501,8 @@ def _process_mm_features(self, request_list: List[Request]):
"feature_position_list": [],
"grid_thw_lst_batches": [],
"feature_position_list_batches": [],
"image_features": [],
"image_grid_thws": [],
}
for request in request_list:
if request.task_type.value != RequestType.PREFILL.value:
Expand All @@ -509,10 +515,10 @@ def _process_mm_features(self, request_list: List[Request]):
self.encoder_cache.pop(mm_hash, None)
idx = self.share_inputs.get_index_by_batch_id(request.idx)
req_idx_img_index_map[idx] = -1
inputs = request.multimodal_inputs
if request.with_image:
req_idx_img_index_map[idx] = img_index
img_index = img_index + 1
inputs = request.multimodal_inputs
if self.encoder_cache is not None:
if envs.FD_ENABLE_MAX_PREFILL:
if "vit_seqlen" in inputs:
Expand Down Expand Up @@ -618,6 +624,43 @@ def _process_mm_features(self, request_list: List[Request]):
prefill_end_index=request.prefill_end_index,
)
)

if (
inputs is not None
and inputs.get("image_feature_urls", None) is not None
and len(inputs["image_feature_urls"]) > 0
):
multi_vision_inputs["image_grid_thws"].extend(
Comment on lines +628 to +633
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

新增的预编码特征路径(image_feature_urls/image_features + image_grid_thws)以及 prefill 阶段的 attention_mask_offset 填充/更新属于关键推理逻辑,但当前 tests/worker/test_gpu_model_runner.py 里没有覆盖该分支。建议补充单测:构造带 image_feature_urls 的 request(含/不含 image_grid_thws、shape 不匹配等),验证 process_mm_features 的输出类型与错误处理;同时覆盖 update_attn_mask_offsets + copy 的基本形状约束。

Copilot generated this review using guidance from repository custom instructions.
inputs["image_grid_thws"][request.image_start : request.image_end]
)
image_feature = inputs["image_features"][request.image_start : request.image_end]
Comment on lines +633 to +636
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里仅以 image_feature_urls 非空作为条件,但随后直接访问 inputs["image_grid_thws"]/inputs["image_features"]。在 ResourceManagerV1._download_features 只会填充 image_features(不保证存在 image_grid_thws),因此该分支可能触发 KeyError 或出现长度不一致导致切片错误。建议改为显式校验这两个字段存在且长度与 image_feature_urls 对齐;缺失时给 request 设置清晰的 error_message/error_code 或直接抛出异常。

Suggested change
multi_vision_inputs["image_grid_thws"].extend(
inputs["image_grid_thws"][request.image_start : request.image_end]
)
image_feature = inputs["image_features"][request.image_start : request.image_end]
image_feature_urls = inputs["image_feature_urls"]
image_grid_thws = inputs.get("image_grid_thws")
image_features = inputs.get("image_features")
image_start = request.image_start
image_end = request.image_end
if image_grid_thws is None or image_features is None:
raise ValueError(
"Missing multimodal input fields for image features: "
f"request_idx={request.idx}, "
f"has_image_feature_urls={image_feature_urls is not None}, "
f"has_image_features={image_features is not None}, "
f"has_image_grid_thws={image_grid_thws is not None}"
)
if not (
len(image_feature_urls) == len(image_features) == len(image_grid_thws)
):
raise ValueError(
"Mismatched multimodal input lengths: "
f"request_idx={request.idx}, "
f"image_feature_urls={len(image_feature_urls)}, "
f"image_features={len(image_features)}, "
f"image_grid_thws={len(image_grid_thws)}"
)
if not (0 <= image_start <= image_end <= len(image_feature_urls)):
raise ValueError(
"Invalid image slice range: "
f"request_idx={request.idx}, "
f"image_start={image_start}, "
f"image_end={image_end}, "
f"total_images={len(image_feature_urls)}"
)
multi_vision_inputs["image_grid_thws"].extend(
image_grid_thws[image_start:image_end]
)
image_feature = image_features[image_start:image_end]

Copilot uses AI. Check for mistakes.

if len(image_feature) > 0:
if isinstance(image_feature[0], paddle.Tensor) and len(image_feature[0].shape) == 2:
# Enable encode vision_embedding
for image_feature_tensor in image_feature:
if image_feature_tensor.shape[1] != self.fd_config.model_config.hidden_size:
logger.error(
f"Shape mismatch: expected shape={self.fd_config.model_config.hidden_size}, \
but got {image_feature_tensor.shape}"
)
Comment on lines +641 to +646
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

检测到 image_feature_tensor 的 hidden_size 不匹配时这里只是 logger.error 但继续执行,后续 concat/模型 forward 很可能因 shape 不一致直接崩溃或产生错误结果。建议在发现不匹配时立即 fail-fast(raise 异常或设置 request.error_message/error_code 并跳过该请求),并在错误信息中同时包含期望 shape(如 [*, hidden_size])与实际 shape。

Suggested change
for image_feature_tensor in image_feature:
if image_feature_tensor.shape[1] != self.fd_config.model_config.hidden_size:
logger.error(
f"Shape mismatch: expected shape={self.fd_config.model_config.hidden_size}, \
but got {image_feature_tensor.shape}"
)
expected_hidden_size = self.fd_config.model_config.hidden_size
for image_feature_tensor in image_feature:
if image_feature_tensor.shape[1] != expected_hidden_size:
error_message = (
f"Image feature hidden size mismatch for request idx={request.idx}: "
f"expected shape [*, {expected_hidden_size}], "
f"but got {list(image_feature_tensor.shape)}"
)
logger.error(error_message)
raise ValueError(error_message)

Copilot uses AI. Check for mistakes.
image_features_gpu = [vf.cuda() for vf in image_feature]
image_embeds = paddle.concat(image_features_gpu, axis=0)
Comment on lines +647 to +648
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里使用 vf.cuda() 会硬编码 CUDA 设备,并且逐个 tensor 迁移可能带来额外开销;同时本 runner 其他路径普遍用 .to(self.device) 保持设备一致。建议改为使用 image_feature_tensor.to(self.device)(或在 to_tensor 阶段统一放到目标 device),并尽量减少逐个拷贝。

Suggested change
image_features_gpu = [vf.cuda() for vf in image_feature]
image_embeds = paddle.concat(image_features_gpu, axis=0)
image_embeds = paddle.concat(image_feature, axis=0).to(self.device)

Copilot uses AI. Check for mistakes.
multi_vision_inputs["image_features"].append(image_embeds)
logger.info("Enable Encode image embedding.")
else:
multi_vision_inputs["image_features"].extend(image_feature)
logger.info("Disable Encode image embedding.")

self.share_inputs["image_features"] = multi_vision_inputs["image_features"]
if len(multi_vision_inputs["image_features"]) > 0:
if (
isinstance(multi_vision_inputs["image_features"][0], paddle.Tensor)
and len(multi_vision_inputs["image_features"][0].shape) == 2
):
self.share_inputs["image_features"] = paddle.concat(multi_vision_inputs["image_features"], axis=0)
self.share_inputs["image_grid_thws"] = multi_vision_inputs["image_grid_thws"]

if self.encoder_cache is not None:
if len(multi_vision_inputs["images_lst"]) > 0 or len(multi_vision_inputs["encoder_cache_info"]) > 0:
image_features_output = None
Expand Down Expand Up @@ -734,6 +777,9 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
"position_ids_offset": [0],
"max_tokens_lst": [],
}
if self.enable_mm:
# Sort by idx to ensure attention mask offsets are filled in order during mm prefill
req_dicts = sorted(req_dicts, key=lambda r: r.idx)
for i in range(req_len):
request = req_dicts[i]
idx = self.share_inputs.get_index_by_batch_id(request.idx)
Expand Down Expand Up @@ -783,6 +829,20 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
prefill_start_index = request.prefill_start_index
prefill_end_index = request.prefill_end_index
length = prefill_end_index - prefill_start_index
if self.enable_mm:
self.share_inputs["decode_states"][idx, 0] = 0
inputs = request.multimodal_inputs
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Bug inputs 可能为 None,但此处直接调用 inputs.get(...) 未做 None 检查

request.multimodal_inputs 返回 None 时(非多模态请求进入 prefill 分支),inputs.get("attention_mask_offset", None) 会抛出 AttributeError: 'NoneType' object has no attribute 'get'

建议修复:

if self.enable_mm:
    self.share_inputs["decode_states"][idx, 0] = 0
    inputs = request.multimodal_inputs
    if inputs is not None:
        attn_offset_len = prefill_end_index - prefill_start_index
        if inputs.get("attention_mask_offset", None) is None:
            attention_mask_offset_slice = np.arange(...)
        ...

# mm attention_mask
attn_offset_len = prefill_end_index - prefill_start_index
if inputs.get("attention_mask_offset", None) is None:
attention_mask_offset_slice = np.arange(prefill_start_index, prefill_end_index, dtype=np.int32)
else:
attention_mask_offset_slice = np.asarray(
inputs["attention_mask_offset"][prefill_start_index:prefill_end_index], dtype=np.int32
)
self.share_inputs["attn_mask_offsets_full"][idx, 0:attn_offset_len] = paddle.to_tensor(
attention_mask_offset_slice, dtype="int32"
)
if not self.is_pooling_model:
if request.get("enable_thinking") is not None:
enable_thinking = bool(request.get("enable_thinking"))
Expand Down Expand Up @@ -1201,6 +1261,19 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p
self._real_output_token_num_host.copy_(real_output_token_num, False)
self.output_token_num_event.record()

if self.enable_mm:
attn_mask_offsets = update_attn_mask_offsets(
self.share_inputs["ids_remove_padding"],
self.share_inputs["seq_lens_this_time"],
self.share_inputs["seq_lens_encoder"],
self.share_inputs["seq_lens_decoder"],
self.share_inputs["cu_seqlens_q"],
self.share_inputs["attn_mask_offsets_full"],
self.share_inputs["is_block_step"],
self.share_inputs["decode_states"],
)
self.share_inputs["attn_mask_offsets"].copy_(attn_mask_offsets, False)
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update_attn_mask_offsets 在 GPU 侧返回的 attn_mask_offsets 长度是 ids_remove_padding.shape[0] * 2(见算子实现),这里直接 copy_ 到 share_inputs["attn_mask_offsets"] 要求目标 buffer shape 完全一致。请确认 InputBatch/ProposerInputBatch 里 attn_mask_offsets 的预分配长度是 max_token_capacity*2,否则这里会在运行时因 shape 不匹配而报错。

Suggested change
self.share_inputs["attn_mask_offsets"].copy_(attn_mask_offsets, False)
attn_mask_offsets_buffer = self.share_inputs["attn_mask_offsets"]
attn_mask_offsets_len = attn_mask_offsets.shape[0]
if attn_mask_offsets_buffer.numel() < attn_mask_offsets_len:
raise RuntimeError(
"attn_mask_offsets buffer capacity is insufficient: "
f"required={attn_mask_offsets_len}, "
f"capacity={attn_mask_offsets_buffer.numel()}. "
"Please ensure the preallocated attn_mask_offsets buffer "
"has capacity for max_token_capacity * 2."
)
attn_mask_offsets_buffer[:attn_mask_offsets_len].copy_(attn_mask_offsets, False)

Copilot uses AI. Check for mistakes.

# Initialize forward meta data
self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)
self.forward_meta.real_bsz = real_bsz
Expand Down Expand Up @@ -1310,6 +1383,7 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
kv_batch_ids=self.share_inputs["kv_batch_ids"],
kv_tile_ids_per_batch=self.share_inputs["kv_tile_ids_per_batch"],
kv_num_blocks_x_cpu=self.share_inputs["kv_num_blocks_x_cpu"],
attn_mask_offsets=self.share_inputs["attn_mask_offsets"] if self.enable_mm else None,
routing_replay_table=routing_replay_table,
)

Expand Down Expand Up @@ -2167,6 +2241,24 @@ def _preprocess(
model_inputs["generated_modality"] = self.share_inputs["generated_modality"]
if self.enable_mm:
model_inputs["image_features"] = self.share_inputs["image_features"]
model_inputs["decode_states"] = self.share_inputs["decode_states"]
model_inputs["image_grid_thws"] = self.share_inputs.get("image_grid_thws", None)
video_features = self.share_inputs.get("video_features", None)
video_grid_thws = self.share_inputs.get("video_grid_thws", None)
video_infinity_scales = self.share_inputs.get("video_infinity_scales", None)
if video_features is not None:
model_inputs["video_features"] = video_features
if video_grid_thws is not None:
model_inputs["video_grid_thws"] = video_grid_thws
if video_infinity_scales is not None:
model_inputs["video_infinity_scales"] = video_infinity_scales

# init features and grid_thws
self.share_inputs["image_features"] = None
self.share_inputs["image_grid_thws"] = None
self.share_inputs["video_features"] = None
self.share_inputs["video_grid_thws"] = None
self.share_inputs["video_infinity_scales"] = None

return model_inputs, p_done_idxs, token_num_event

Expand Down
40 changes: 40 additions & 0 deletions fastdeploy/worker/input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,11 @@ def init_share_inputs(self):
)
if self.is_mm_model:
self.image_features = None
self.image_grid_thws = None
self.image_features_list = None
self.video_features = None
self.video_grid_thws = None
self.video_infinity_scales = None

# Set block tables
pre_max_block_num = (
Expand Down Expand Up @@ -345,7 +349,26 @@ def init_share_inputs(self):
dtype="float32",
)
self.image_features = None # Built before the forward
self.image_grid_thws = None
self.image_features_list = None
self.video_features = None
self.video_grid_thws = None
self.video_infinity_scales = None

decode_states_len = self.speculative_config.num_speculative_tokens + 1 if self.speculative_decoding else 1
self.decode_states = paddle.full(
[self.scheduler_config.max_num_seqs, decode_states_len],
-1,
dtype="int32",
)
self.attn_mask_offsets = paddle.full(
shape=[self.scheduler_config.max_num_seqs * self.model_config.max_model_len],
Comment on lines +364 to +365
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里预分配的 attn_mask_offsets 长度只有 max_num_seqs * max_model_len,但 GPU 算子 update_attn_mask_offsets 的输出是 ids_remove_padding.shape[0] * 2(每个 token 两个 offset)。当前尺寸会导致后续 copy_ 时 shape 不匹配/潜在越界。建议按 ids_remove_padding 的最大长度预分配并乘以 2(例如 max_num_seqs * max_chunk_tokens * 2),并在 reset 时保持一致。

Suggested change
self.attn_mask_offsets = paddle.full(
shape=[self.scheduler_config.max_num_seqs * self.model_config.max_model_len],
attn_mask_token_capacity = self.scheduler_config.max_num_seqs * self.model_config.max_model_len
self.attn_mask_offsets = paddle.full(
shape=[attn_mask_token_capacity * 2],

Copilot uses AI. Check for mistakes.
fill_value=-1,
dtype="int32",
)
self.attn_mask_offsets_full = paddle.full(
[self.scheduler_config.max_num_seqs, self.model_config.max_model_len], -1, dtype="int32"
)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 非 speculative 路径缺少 attn_mask_offsets_decoder 初始化

swap_states(line 939)中在 self.enable_mm 条件下会访问 self.attn_mask_offsets_decoder,而当前新增的 init_share_inputs 多模态初始化分支(非 speculative)仅初始化了 attn_mask_offsets_fullattn_mask_offsets,缺少:

self.attn_mask_offsets_decoder = paddle.full([self.scheduler_config.max_num_seqs, 1], -1, dtype="int32")

如果非 speculative 场景下同样需要 swap_states,则会触发 AttributeError


# For logits processors
self.logits_processors = build_logits_processors(self.fd_config)
Expand Down Expand Up @@ -412,6 +435,7 @@ def swap_data(tensor, idx1, idx2):
swap_data(self.ori_seq_lens_encoder, i1, i2)
swap_data(self.system_lens, i1, i2)
swap_data(self.system_ids, i1, i2)
swap_data(self.generated_modality, i1, i2)
swap_data(self.enable_thinking, i1, i2)
swap_data(self.max_think_lens, i1, i2)
swap_data(self.limit_think_status, i1, i2)
Expand Down Expand Up @@ -454,6 +478,8 @@ def swap_data(tensor, idx1, idx2):
self.image_features_list[i1],
)
swap_data(self.share_inputs["rope_emb"], i1, i2)
swap_data(self.decode_states, i1, i2)
swap_data(self.attn_mask_offsets_full, i1, i2)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 swap_data 中新增了 decode_statesattn_mask_offsets_full 的 swap,但缺少 attn_mask_offsets 的 swap

swap_states(line 937)的逻辑相比,此处的 swap_data 块(init_share_inputs 路径)新增了 decode_statesattn_mask_offsets_full,但遗漏了 attn_mask_offsets。若两处 swap 逻辑不一致,可能导致 attn_mask_offsets 在某些 preemption 场景下数据错乱。建议补充:

swap_data(self.attn_mask_offsets, i1, i2)

# Swap mask rollback
swap_data(self.mask_rollback, i1, i2)

Expand Down Expand Up @@ -581,6 +607,7 @@ def reset_share_inputs(self):
fill_paddle_tensor(self, "ori_seq_lens_encoder", 0)
fill_paddle_tensor(self, "system_lens", 0)
fill_paddle_tensor(self, "system_ids", -1)
fill_paddle_tensor(self, "generated_modality", -1)

fill_paddle_tensor(self, "ids_remove_padding", 0)
fill_paddle_tensor(self, "batch_id_per_token", 0)
Expand Down Expand Up @@ -665,7 +692,14 @@ def reset_share_inputs(self):
dtype="float32",
)
self.image_features = None
self.image_grid_thws = None
self.image_features_list = None
self.video_features = None
self.video_grid_thws = None
self.video_infinity_scales = None
fill_paddle_tensor(self, "decode_states", -1)
fill_paddle_tensor(self, "attn_mask_offsets", -1)
fill_paddle_tensor(self, "attn_mask_offsets_full", -1)
Comment on lines +701 to +702
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reset_share_inputs 里对 attn_mask_offsets 做 fill(-1) 没问题,但需要确保 attn_mask_offsets 的预分配 shape 与 update_attn_mask_offsets 输出一致(token_num*2)。如果按当前初始化的较小 shape,会在 prepare_inputs 里 copy 时报错。建议在这里也同步按 *2 的 shape 维护。

Suggested change
fill_paddle_tensor(self, "attn_mask_offsets", -1)
fill_paddle_tensor(self, "attn_mask_offsets_full", -1)
attn_mask_token_num = max_num_seqs * self.model_config.max_model_len
attn_mask_offsets_shape = [attn_mask_token_num * 2]
attn_mask_offsets = getattr(self, "attn_mask_offsets", None)
if attn_mask_offsets is None or list(attn_mask_offsets.shape) != attn_mask_offsets_shape:
attn_mask_offsets_dtype = attn_mask_offsets.dtype if attn_mask_offsets is not None else "int32"
self.attn_mask_offsets = paddle.full(
shape=attn_mask_offsets_shape,
fill_value=-1,
dtype=attn_mask_offsets_dtype,
)
else:
fill_paddle_tensor(self, "attn_mask_offsets", -1)
attn_mask_offsets_full = getattr(self, "attn_mask_offsets_full", None)
if attn_mask_offsets_full is None or list(attn_mask_offsets_full.shape) != attn_mask_offsets_shape:
attn_mask_offsets_full_dtype = (
attn_mask_offsets_full.dtype if attn_mask_offsets_full is not None else "int32"
)
self.attn_mask_offsets_full = paddle.full(
shape=attn_mask_offsets_shape,
fill_value=-1,
dtype=attn_mask_offsets_full_dtype,
)
else:
fill_paddle_tensor(self, "attn_mask_offsets_full", -1)

Copilot uses AI. Check for mistakes.
else:
# Reset non-multimodal rope_emb
self.rope_emb = get_rope(
Expand All @@ -677,7 +711,11 @@ def reset_share_inputs(self):
)
if self.is_mm_model:
self.image_features = None
self.image_grid_thws = None
self.image_features_list = None
self.video_features = None
self.video_grid_thws = None
self.video_infinity_scales = None

# Reset other miscellaneous tensors
fill_paddle_tensor(self, "mask_rollback", 0)
Expand Down Expand Up @@ -895,6 +933,8 @@ def swap_data(tensor, idx1, idx2):
swap_data(self.mask_rollback, i1, i2)
swap_data(self.recompute_token_num, i1, i2)
if self.enable_mm:
swap_data(self.decode_states, i1, i2)
swap_data(self.attn_mask_offsets, i1, i2)
swap_data(self.attn_mask_offsets_full, i1, i2)
swap_data(self.attn_mask_offsets_decoder, i1, i2)
Comment on lines +937 to 939
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里新增 swap_data(self.attn_mask_offsets, i1, i2) 可能是错误的:attn_mask_offsets 是按 token 展平的一维 buffer(长度与 ids_remove_padding token_num 对齐,且 update_attn_mask_offsets 输出为 token_num*2),按 batch 维度交换 i1/i2 只会交换单个元素,无法保持与 cu_seqlens_q/query_start 的一致性,容易导致 attention mask offsets 错乱。建议不要在 reorder/swap 时交换该一维 buffer,而是在每次 pre_process 后统一重算并整段 copy_;或者把其存储改为按 batch 的二维布局后再实现正确的交换逻辑。

Suggested change
swap_data(self.attn_mask_offsets, i1, i2)
swap_data(self.attn_mask_offsets_full, i1, i2)
swap_data(self.attn_mask_offsets_decoder, i1, i2)
# Attention mask offset buffers may be token-flattened derived state
# rather than batch-aligned storage. Swapping a single element by
# batch index can break consistency with the flattened token layout.
# Keep them untouched here and let the later preprocessing stage
# rebuild them from the current batch layout.

Copilot uses AI. Check for mistakes.

Expand Down
Loading