PaddlePaddle · xiaoxiaohehe001 · Apr 23, 2026 · PaddlePaddle-bot · Apr 23, 2026 · Copilot
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -32,7 +32,6 @@
     EncoderCacheManager,
     ProcessorCacheManager,
 )
-from fastdeploy.config import ErnieArchitectures
 from fastdeploy.engine.request import (
     ImagePosition,
     Request,
@@ -761,13 +760,6 @@ def schedule(self):
         Try to pull a batch of requests from the waiting queue and schedule them.
         """
 
-        def get_enough_request(request, scheduled_reqs):
-            return (
-                ErnieArchitectures.is_ernie5_arch(self.config.model_config.architectures)
-                and self._is_mm_request(request)
-                and self.exist_mm_prefill(scheduled_reqs)
-            )
-
         with self.lock:
             scheduled_reqs: list[Request] = []
             preempted_reqs: list[Request] = []
@@ -910,9 +902,6 @@ def _allocate_decode_and_extend():
                     ):
                         req_index += 1
                         continue
-                    if get_enough_request(request, scheduled_reqs):
-                        req_index += 1
-                        continue
                     num_new_tokens = self._get_num_new_tokens(request, token_budget)
                     if num_new_tokens == 0:
                         req_index += 1
@@ -964,8 +953,6 @@ def _allocate_decode_and_extend():
                         break
 
                     request = self.waiting[0]
-                    if get_enough_request(request, scheduled_reqs):
-                        break
                     if request.status == RequestStatus.WAITING:
                         result = self.waiting_async_process(request)
                         if result is None:

diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
@@ -78,6 +78,7 @@
         speculate_schedule_cache,
         set_data_ipc,
         unset_data_ipc,
+        update_attn_mask_offsets,
     )
 
 import zmq
@@ -179,6 +180,9 @@ def __init__(
             else:
                 self.encoder_cache = None
 
+            # Note(Zhengshifeng) init video cache for VL model
+            self.video_cache = {}
+
-            # Note(Zhengshifeng) init video cache for VL model
-            self.video_cache = {}
-            # Note(Zhengshifeng) init video cache for VL model
-            self.video_cache = {}
         #  Sampler
         if not self.speculative_decoding:
             self.sampler = Sampler(fd_config)
@@ -497,6 +501,8 @@ def _process_mm_features(self, request_list: List[Request]):
             "feature_position_list": [],
             "grid_thw_lst_batches": [],
             "feature_position_list_batches": [],
+            "image_features": [],
+            "image_grid_thws": [],
         }
         for request in request_list:
             if request.task_type.value != RequestType.PREFILL.value:
@@ -509,10 +515,10 @@ def _process_mm_features(self, request_list: List[Request]):
                         self.encoder_cache.pop(mm_hash, None)
             idx = self.share_inputs.get_index_by_batch_id(request.idx)
             req_idx_img_index_map[idx] = -1
+            inputs = request.multimodal_inputs
             if request.with_image:
                 req_idx_img_index_map[idx] = img_index
                 img_index = img_index + 1
-                inputs = request.multimodal_inputs
                 if self.encoder_cache is not None:
                     if envs.FD_ENABLE_MAX_PREFILL:
                         if "vit_seqlen" in inputs:
@@ -618,6 +624,43 @@ def _process_mm_features(self, request_list: List[Request]):
                             prefill_end_index=request.prefill_end_index,
                         )
                     )
+
+            if (
+                inputs is not None
+                and inputs.get("image_feature_urls", None) is not None
+                and len(inputs["image_feature_urls"]) > 0
+            ):
+                multi_vision_inputs["image_grid_thws"].extend(
+                    inputs["image_grid_thws"][request.image_start : request.image_end]
+                )
+                image_feature = inputs["image_features"][request.image_start : request.image_end]
-                multi_vision_inputs["image_grid_thws"].extend(
-                    inputs["image_grid_thws"][request.image_start : request.image_end]
-                )
-                image_feature = inputs["image_features"][request.image_start : request.image_end]
+                image_feature_urls = inputs["image_feature_urls"]
+                image_grid_thws = inputs.get("image_grid_thws")
+                image_features = inputs.get("image_features")
+                image_start = request.image_start
+                image_end = request.image_end
+
+                if image_grid_thws is None or image_features is None:
+                    raise ValueError(
+                        "Missing multimodal input fields for image features: "
+                        f"request_idx={request.idx}, "
+                        f"has_image_feature_urls={image_feature_urls is not None}, "
+                        f"has_image_features={image_features is not None}, "
+                        f"has_image_grid_thws={image_grid_thws is not None}"
+                    )
+
+                if not (
+                    len(image_feature_urls) == len(image_features) == len(image_grid_thws)
+                ):
+                    raise ValueError(
+                        "Mismatched multimodal input lengths: "
+                        f"request_idx={request.idx}, "
+                        f"image_feature_urls={len(image_feature_urls)}, "
+                        f"image_features={len(image_features)}, "
+                        f"image_grid_thws={len(image_grid_thws)}"
+                    )
+
+                if not (0 <= image_start <= image_end <= len(image_feature_urls)):
+                    raise ValueError(
+                        "Invalid image slice range: "
+                        f"request_idx={request.idx}, "
+                        f"image_start={image_start}, "
+                        f"image_end={image_end}, "
+                        f"total_images={len(image_feature_urls)}"
+                    )
+
+                multi_vision_inputs["image_grid_thws"].extend(
+                    image_grid_thws[image_start:image_end]
+                )
+                image_feature = image_features[image_start:image_end]
-                multi_vision_inputs["image_grid_thws"].extend(
-                    inputs["image_grid_thws"][request.image_start : request.image_end]
-                )
-                image_feature = inputs["image_features"][request.image_start : request.image_end]
+                image_feature_urls = inputs["image_feature_urls"]
+                image_grid_thws = inputs.get("image_grid_thws")
+                image_features = inputs.get("image_features")
+                image_start = request.image_start
+                image_end = request.image_end
+
+                if image_grid_thws is None or image_features is None:
+                    raise ValueError(
+                        "Missing multimodal input fields for image features: "
+                        f"request_idx={request.idx}, "
+                        f"has_image_feature_urls={image_feature_urls is not None}, "
+                        f"has_image_features={image_features is not None}, "
+                        f"has_image_grid_thws={image_grid_thws is not None}"
+                    )
+
+                if not (
+                    len(image_feature_urls) == len(image_features) == len(image_grid_thws)
+                ):
+                    raise ValueError(
+                        "Mismatched multimodal input lengths: "
+                        f"request_idx={request.idx}, "
+                        f"image_feature_urls={len(image_feature_urls)}, "
+                        f"image_features={len(image_features)}, "
+                        f"image_grid_thws={len(image_grid_thws)}"
+                    )
+
+                if not (0 <= image_start <= image_end <= len(image_feature_urls)):
+                    raise ValueError(
+                        "Invalid image slice range: "
+                        f"request_idx={request.idx}, "
+                        f"image_start={image_start}, "
+                        f"image_end={image_end}, "
+                        f"total_images={len(image_feature_urls)}"
+                    )
+
+                multi_vision_inputs["image_grid_thws"].extend(
+                    image_grid_thws[image_start:image_end]
+                )
+                image_feature = image_features[image_start:image_end]
+
+                if len(image_feature) > 0:
+                    if isinstance(image_feature[0], paddle.Tensor) and len(image_feature[0].shape) == 2:
+                        # 2-D tensor features: check each hidden size, then concatenate them into one embedding tensor
+                        for image_feature_tensor in image_feature:
+                            if image_feature_tensor.shape[1] != self.fd_config.model_config.hidden_size:
+                                logger.error(
+                                    f"Shape mismatch: expected shape={self.fd_config.model_config.hidden_size}, \
+                                        but got {image_feature_tensor.shape}"
+                                )
-                        for image_feature_tensor in image_feature:
-                            if image_feature_tensor.shape[1] != self.fd_config.model_config.hidden_size:
-                                logger.error(
-                                    f"Shape mismatch: expected shape={self.fd_config.model_config.hidden_size}, \
-                                        but got {image_feature_tensor.shape}"
-                                )
+                        expected_hidden_size = self.fd_config.model_config.hidden_size
+                        for image_feature_tensor in image_feature:
+                            if image_feature_tensor.shape[1] != expected_hidden_size:
+                                error_message = (
+                                    f"Image feature hidden size mismatch for request idx={request.idx}: "
+                                    f"expected shape [*, {expected_hidden_size}], "
+                                    f"but got {list(image_feature_tensor.shape)}"
+                                )
+                                logger.error(error_message)
+                                raise ValueError(error_message)
-                        for image_feature_tensor in image_feature:
-                            if image_feature_tensor.shape[1] != self.fd_config.model_config.hidden_size:
-                                logger.error(
-                                    f"Shape mismatch: expected shape={self.fd_config.model_config.hidden_size}, \
-                                        but got {image_feature_tensor.shape}"
-                                )
+                        expected_hidden_size = self.fd_config.model_config.hidden_size
+                        for image_feature_tensor in image_feature:
+                            if image_feature_tensor.shape[1] != expected_hidden_size:
+                                error_message = (
+                                    f"Image feature hidden size mismatch for request idx={request.idx}: "
+                                    f"expected shape [*, {expected_hidden_size}], "
+                                    f"but got {list(image_feature_tensor.shape)}"
+                                )
+                                logger.error(error_message)
+                                raise ValueError(error_message)
+                        image_features_gpu = [vf.cuda() for vf in image_feature]
+                        image_embeds = paddle.concat(image_features_gpu, axis=0)
-                        image_features_gpu = [vf.cuda() for vf in image_feature]
-                        image_embeds = paddle.concat(image_features_gpu, axis=0)
+                        image_embeds = paddle.concat(image_feature, axis=0).to(self.device)
-                        image_features_gpu = [vf.cuda() for vf in image_feature]
-                        image_embeds = paddle.concat(image_features_gpu, axis=0)
+                        image_embeds = paddle.concat(image_feature, axis=0).to(self.device)
+                        multi_vision_inputs["image_features"].append(image_embeds)
+                        logger.info("Enable Encode image embedding.")
+                    else:
+                        multi_vision_inputs["image_features"].extend(image_feature)
+                        logger.info("Disable Encode image embedding.")
+
+        self.share_inputs["image_features"] = multi_vision_inputs["image_features"]
+        if len(multi_vision_inputs["image_features"]) > 0:
+            if (
+                isinstance(multi_vision_inputs["image_features"][0], paddle.Tensor)
+                and len(multi_vision_inputs["image_features"][0].shape) == 2
+            ):
+                self.share_inputs["image_features"] = paddle.concat(multi_vision_inputs["image_features"], axis=0)
+        self.share_inputs["image_grid_thws"] = multi_vision_inputs["image_grid_thws"]
+
         if self.encoder_cache is not None:
             if len(multi_vision_inputs["images_lst"]) > 0 or len(multi_vision_inputs["encoder_cache_info"]) > 0:
                 image_features_output = None
@@ -734,6 +777,9 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
             "position_ids_offset": [0],
             "max_tokens_lst": [],
         }
+        if self.enable_mm:
+            # Sort by idx to ensure attention mask offsets are filled in order during mm prefill
+            req_dicts = sorted(req_dicts, key=lambda r: r.idx)
         for i in range(req_len):
             request = req_dicts[i]
             idx = self.share_inputs.get_index_by_batch_id(request.idx)
@@ -783,6 +829,20 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
                 prefill_start_index = request.prefill_start_index
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
+                if self.enable_mm:
+                    self.share_inputs["decode_states"][idx, 0] = 0
+                    inputs = request.multimodal_inputs
+                    # Multimodal attention mask: write the offsets for this prefill chunk into attn_mask_offsets_full
+                    attn_offset_len = prefill_end_index - prefill_start_index
+                    if inputs.get("attention_mask_offset", None) is None:
+                        attention_mask_offset_slice = np.arange(prefill_start_index, prefill_end_index, dtype=np.int32)
+                    else:
+                        attention_mask_offset_slice = np.asarray(
+                            inputs["attention_mask_offset"][prefill_start_index:prefill_end_index], dtype=np.int32
+                        )
+                    self.share_inputs["attn_mask_offsets_full"][idx, 0:attn_offset_len] = paddle.to_tensor(
+                        attention_mask_offset_slice, dtype="int32"
+                    )
                 if not self.is_pooling_model:
                     if request.get("enable_thinking") is not None:
                         enable_thinking = bool(request.get("enable_thinking"))
@@ -1201,6 +1261,19 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p
             self._real_output_token_num_host.copy_(real_output_token_num, False)
             self.output_token_num_event.record()
 
+        if self.enable_mm:
+            attn_mask_offsets = update_attn_mask_offsets(
+                self.share_inputs["ids_remove_padding"],
+                self.share_inputs["seq_lens_this_time"],
+                self.share_inputs["seq_lens_encoder"],
+                self.share_inputs["seq_lens_decoder"],
+                self.share_inputs["cu_seqlens_q"],
+                self.share_inputs["attn_mask_offsets_full"],
+                self.share_inputs["is_block_step"],
+                self.share_inputs["decode_states"],
+            )
+            self.share_inputs["attn_mask_offsets"].copy_(attn_mask_offsets, False)
-            self.share_inputs["attn_mask_offsets"].copy_(attn_mask_offsets, False)
+            attn_mask_offsets_buffer = self.share_inputs["attn_mask_offsets"]
+            attn_mask_offsets_len = attn_mask_offsets.shape[0]
+            if attn_mask_offsets_buffer.numel() < attn_mask_offsets_len:
+                raise RuntimeError(
+                    "attn_mask_offsets buffer capacity is insufficient: "
+                    f"required={attn_mask_offsets_len}, "
+                    f"capacity={attn_mask_offsets_buffer.numel()}. "
+                    "Please ensure the preallocated attn_mask_offsets buffer "
+                    "has capacity for max_token_capacity * 2."
+                )
+            attn_mask_offsets_buffer[:attn_mask_offsets_len].copy_(attn_mask_offsets, False)
-            self.share_inputs["attn_mask_offsets"].copy_(attn_mask_offsets, False)
+            attn_mask_offsets_buffer = self.share_inputs["attn_mask_offsets"]
+            attn_mask_offsets_len = attn_mask_offsets.shape[0]
+            if attn_mask_offsets_buffer.numel() < attn_mask_offsets_len:
+                raise RuntimeError(
+                    "attn_mask_offsets buffer capacity is insufficient: "
+                    f"required={attn_mask_offsets_len}, "
+                    f"capacity={attn_mask_offsets_buffer.numel()}. "
+                    "Please ensure the preallocated attn_mask_offsets buffer "
+                    "has capacity for max_token_capacity * 2."
+                )
+            attn_mask_offsets_buffer[:attn_mask_offsets_len].copy_(attn_mask_offsets, False)
+
         # Initialize forward meta data
         self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)
         self.forward_meta.real_bsz = real_bsz
@@ -1310,6 +1383,7 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
             kv_batch_ids=self.share_inputs["kv_batch_ids"],
             kv_tile_ids_per_batch=self.share_inputs["kv_tile_ids_per_batch"],
             kv_num_blocks_x_cpu=self.share_inputs["kv_num_blocks_x_cpu"],
+            attn_mask_offsets=self.share_inputs["attn_mask_offsets"] if self.enable_mm else None,
             routing_replay_table=routing_replay_table,
         )
 
@@ -2167,6 +2241,24 @@ def _preprocess(
         model_inputs["generated_modality"] = self.share_inputs["generated_modality"]
         if self.enable_mm:
             model_inputs["image_features"] = self.share_inputs["image_features"]
+            model_inputs["decode_states"] = self.share_inputs["decode_states"]
+            model_inputs["image_grid_thws"] = self.share_inputs.get("image_grid_thws", None)
+            video_features = self.share_inputs.get("video_features", None)
+            video_grid_thws = self.share_inputs.get("video_grid_thws", None)
+            video_infinity_scales = self.share_inputs.get("video_infinity_scales", None)
+            if video_features is not None:
+                model_inputs["video_features"] = video_features
+            if video_grid_thws is not None:
+                model_inputs["video_grid_thws"] = video_grid_thws
+            if video_infinity_scales is not None:
+                model_inputs["video_infinity_scales"] = video_infinity_scales
+
+            # Reset the multimodal features and grid_thws to None now that they have been copied into model_inputs
+            self.share_inputs["image_features"] = None
+            self.share_inputs["image_grid_thws"] = None
+            self.share_inputs["video_features"] = None
+            self.share_inputs["video_grid_thws"] = None
+            self.share_inputs["video_infinity_scales"] = None
 
         return model_inputs, p_done_idxs, token_num_event
 

diff --git a/fastdeploy/worker/input_batch.py b/fastdeploy/worker/input_batch.py
@@ -228,7 +228,11 @@ def init_share_inputs(self):
             )
             if self.is_mm_model:
                 self.image_features = None
+                self.image_grid_thws = None
                 self.image_features_list = None
+                self.video_features = None
+                self.video_grid_thws = None
+                self.video_infinity_scales = None
 
         # Set block tables
         pre_max_block_num = (
@@ -345,7 +349,26 @@ def init_share_inputs(self):
                 dtype="float32",
             )
             self.image_features = None  # Built before the forward
+            self.image_grid_thws = None
             self.image_features_list = None
+            self.video_features = None
+            self.video_grid_thws = None
+            self.video_infinity_scales = None
+
+            decode_states_len = self.speculative_config.num_speculative_tokens + 1 if self.speculative_decoding else 1
+            self.decode_states = paddle.full(
+                [self.scheduler_config.max_num_seqs, decode_states_len],
+                -1,
+                dtype="int32",
+            )
+            self.attn_mask_offsets = paddle.full(
+                shape=[self.scheduler_config.max_num_seqs * self.model_config.max_model_len],
-            self.attn_mask_offsets = paddle.full(
-                shape=[self.scheduler_config.max_num_seqs * self.model_config.max_model_len],
+            attn_mask_token_capacity = self.scheduler_config.max_num_seqs * self.model_config.max_model_len
+            self.attn_mask_offsets = paddle.full(
+                shape=[attn_mask_token_capacity * 2],
-            self.attn_mask_offsets = paddle.full(
-                shape=[self.scheduler_config.max_num_seqs * self.model_config.max_model_len],
+            attn_mask_token_capacity = self.scheduler_config.max_num_seqs * self.model_config.max_model_len
+            self.attn_mask_offsets = paddle.full(
+                shape=[attn_mask_token_capacity * 2],
+                fill_value=-1,
+                dtype="int32",
+            )
+            self.attn_mask_offsets_full = paddle.full(
+                [self.scheduler_config.max_num_seqs, self.model_config.max_model_len], -1, dtype="int32"
+            )
 
         # For logits processors
         self.logits_processors = build_logits_processors(self.fd_config)
@@ -412,6 +435,7 @@ def swap_data(tensor, idx1, idx2):
         swap_data(self.ori_seq_lens_encoder, i1, i2)
         swap_data(self.system_lens, i1, i2)
         swap_data(self.system_ids, i1, i2)
+        swap_data(self.generated_modality, i1, i2)
         swap_data(self.enable_thinking, i1, i2)
         swap_data(self.max_think_lens, i1, i2)
         swap_data(self.limit_think_status, i1, i2)
@@ -454,6 +478,8 @@ def swap_data(tensor, idx1, idx2):
                     self.image_features_list[i1],
                 )
             swap_data(self.share_inputs["rope_emb"], i1, i2)
+            swap_data(self.decode_states, i1, i2)
+            swap_data(self.attn_mask_offsets_full, i1, i2)
         # Swap mask rollback
         swap_data(self.mask_rollback, i1, i2)
 
@@ -581,6 +607,7 @@ def reset_share_inputs(self):
             fill_paddle_tensor(self, "ori_seq_lens_encoder", 0)
             fill_paddle_tensor(self, "system_lens", 0)
             fill_paddle_tensor(self, "system_ids", -1)
+            fill_paddle_tensor(self, "generated_modality", -1)
 
             fill_paddle_tensor(self, "ids_remove_padding", 0)
             fill_paddle_tensor(self, "batch_id_per_token", 0)
@@ -665,7 +692,14 @@ def reset_share_inputs(self):
                     dtype="float32",
                 )
                 self.image_features = None
+                self.image_grid_thws = None
                 self.image_features_list = None
+                self.video_features = None
+                self.video_grid_thws = None
+                self.video_infinity_scales = None
+                fill_paddle_tensor(self, "decode_states", -1)
+                fill_paddle_tensor(self, "attn_mask_offsets", -1)
+                fill_paddle_tensor(self, "attn_mask_offsets_full", -1)
-                fill_paddle_tensor(self, "attn_mask_offsets", -1)
-                fill_paddle_tensor(self, "attn_mask_offsets_full", -1)
+
+                attn_mask_token_num = max_num_seqs * self.model_config.max_model_len
+                attn_mask_offsets_shape = [attn_mask_token_num * 2]
+
+                attn_mask_offsets = getattr(self, "attn_mask_offsets", None)
+                if attn_mask_offsets is None or list(attn_mask_offsets.shape) != attn_mask_offsets_shape:
+                    attn_mask_offsets_dtype = attn_mask_offsets.dtype if attn_mask_offsets is not None else "int32"
+                    self.attn_mask_offsets = paddle.full(
+                        shape=attn_mask_offsets_shape,
+                        fill_value=-1,
+                        dtype=attn_mask_offsets_dtype,
+                    )
+                else:
+                    fill_paddle_tensor(self, "attn_mask_offsets", -1)
+
+                attn_mask_offsets_full = getattr(self, "attn_mask_offsets_full", None)
+                if attn_mask_offsets_full is None or list(attn_mask_offsets_full.shape) != attn_mask_offsets_shape:
+                    attn_mask_offsets_full_dtype = (
+                        attn_mask_offsets_full.dtype if attn_mask_offsets_full is not None else "int32"
+                    )
+                    self.attn_mask_offsets_full = paddle.full(
+                        shape=attn_mask_offsets_shape,
+                        fill_value=-1,
+                        dtype=attn_mask_offsets_full_dtype,
+                    )
+                else:
+                    fill_paddle_tensor(self, "attn_mask_offsets_full", -1)
-                fill_paddle_tensor(self, "attn_mask_offsets", -1)
-                fill_paddle_tensor(self, "attn_mask_offsets_full", -1)
+
+                attn_mask_token_num = max_num_seqs * self.model_config.max_model_len
+                attn_mask_offsets_shape = [attn_mask_token_num * 2]
+
+                attn_mask_offsets = getattr(self, "attn_mask_offsets", None)
+                if attn_mask_offsets is None or list(attn_mask_offsets.shape) != attn_mask_offsets_shape:
+                    attn_mask_offsets_dtype = attn_mask_offsets.dtype if attn_mask_offsets is not None else "int32"
+                    self.attn_mask_offsets = paddle.full(
+                        shape=attn_mask_offsets_shape,
+                        fill_value=-1,
+                        dtype=attn_mask_offsets_dtype,
+                    )
+                else:
+                    fill_paddle_tensor(self, "attn_mask_offsets", -1)
+
+                attn_mask_offsets_full = getattr(self, "attn_mask_offsets_full", None)
+                if attn_mask_offsets_full is None or list(attn_mask_offsets_full.shape) != attn_mask_offsets_shape:
+                    attn_mask_offsets_full_dtype = (
+                        attn_mask_offsets_full.dtype if attn_mask_offsets_full is not None else "int32"
+                    )
+                    self.attn_mask_offsets_full = paddle.full(
+                        shape=attn_mask_offsets_shape,
+                        fill_value=-1,
+                        dtype=attn_mask_offsets_full_dtype,
+                    )
+                else:
+                    fill_paddle_tensor(self, "attn_mask_offsets_full", -1)
             else:
                 # Reset non-multimodal rope_emb
                 self.rope_emb = get_rope(
@@ -677,7 +711,11 @@ def reset_share_inputs(self):
                 )
                 if self.is_mm_model:
                     self.image_features = None
+                    self.image_grid_thws = None
                     self.image_features_list = None
+                    self.video_features = None
+                    self.video_grid_thws = None
+                    self.video_infinity_scales = None
 
             # Reset other miscellaneous tensors
             fill_paddle_tensor(self, "mask_rollback", 0)
@@ -895,6 +933,8 @@ def swap_data(tensor, idx1, idx2):
         swap_data(self.mask_rollback, i1, i2)
         swap_data(self.recompute_token_num, i1, i2)
         if self.enable_mm:
+            swap_data(self.decode_states, i1, i2)
+            swap_data(self.attn_mask_offsets, i1, i2)
             swap_data(self.attn_mask_offsets_full, i1, i2)
             swap_data(self.attn_mask_offsets_decoder, i1, i2)
-            swap_data(self.attn_mask_offsets, i1, i2)
-            swap_data(self.attn_mask_offsets_full, i1, i2)
-            swap_data(self.attn_mask_offsets_decoder, i1, i2)
+            # Attention mask offset buffers may be token-flattened derived state
+            # rather than batch-aligned storage. Swapping a single element by
+            # batch index can break consistency with the flattened token layout.
+            # Keep them untouched here and let the later preprocessing stage
+            # rebuild them from the current batch layout.
-            swap_data(self.attn_mask_offsets, i1, i2)
-            swap_data(self.attn_mask_offsets_full, i1, i2)
-            swap_data(self.attn_mask_offsets_decoder, i1, i2)
+            # Attention mask offset buffers may be token-flattened derived state
+            # rather than batch-aligned storage. Swapping a single element by
+            # batch index can break consistency with the flattened token layout.
+            # Keep them untouched here and let the later preprocessing stage
+            # rebuild them from the current batch layout.