unslothai · maximedb · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · danielhanchen
diff --git a/unsloth_zoo/dataset_utils.py b/unsloth_zoo/dataset_utils.py
@@ -184,16 +184,22 @@ def _find_common_token_ids(component, tokenizer, force_match = False):
 
 def train_on_responses_only(
     trainer,
-    instruction_part = None,
-    response_part    = None,
-    force_match      = True,  # Match newlines as well!
-    tokenizer        = None,  # Optional
-    return_function  = False, # Useful for iterating over lists
-    num_proc         = None,
+    instruction_part  = None,
+    response_part     = None,
+    force_match       = True,  # Match newlines as well!
+    tokenizer         = None,  # Optional
+    return_function   = False, # Useful for iterating over lists
+    num_proc          = None,
+    last_response_only = False, # Train only on the last assistant turn
 ):
     """
     Trains only on responses and not on the instruction by masking out
     the labels with -100 for the instruction part.
+
+    If last_response_only=True, only the final assistant turn has its
+    labels unmasked; all earlier assistant turns remain masked at -100
+    (they are never written, so they keep the initialized -100 values
+    and are not copied from old_labels either).
     """
     # All Unsloth Zoo code licensed under LGPLv3
     if tokenizer is None and trainer is not None:
@@ -249,13 +255,16 @@ def _train_on_responses_only(examples):
         for input_ids, old_labels in zip(input_ids_, labels_):
             n = len(input_ids)
             labels = [-100] * n
-            
+
             use_old_labels = False
             if old_labels is not None:
                 use_old_labels = True
                 assert(n == len(old_labels))
             n_minus_1 = n - 1
             j = 0
+
+            # Collect all (assistant_k, user_j) spans for this sample
+            spans = []
             while j < n:
                 # Find <assistant>
                 if (input_ids[j] == A_first) and \
@@ -308,20 +317,27 @@ def _train_on_responses_only(examples):
                                 k = n
                             pass
 
-                            if not use_old_labels:
-                                # Now copy input_ids to labels
-                                labels[assistant_k : user_j] = input_ids [assistant_k : user_j]
-                                # print(assistant_j, assistant_k, user_j, user_k)
-                            else:
-                                # Copy over from old labels!
-                                labels[assistant_k : user_j] = old_labels[assistant_k : user_j]
+                            spans.append((assistant_k, user_j))
                             break
                         pass
                         j += 1
                     pass
                 pass
                 j += 1
             pass
+
+            # Apply labels: only the last assistant turn when last_response_only=True.
+            # Note: spans[-1:] safely returns [] when spans is empty (no assistant turn
+            # was found), so a sample with no assistant turn stays fully masked at -100.
+            apply_spans = spans[-1:] if last_response_only else spans
+            for assistant_k, user_j in apply_spans:
+                if not use_old_labels:
+                    # Now copy input_ids to labels
+                    labels[assistant_k : user_j] = input_ids [assistant_k : user_j]
+                else:
+                    # Copy over from old labels!
+                    labels[assistant_k : user_j] = old_labels[assistant_k : user_j]
+
             all_labels.append(labels)
         pass
         return { "labels" : torch.tensor(all_labels, dtype = torch.int64) if use_tensors else all_labels }

diff --git a/unsloth_zoo/vision_utils.py b/unsloth_zoo/vision_utils.py
@@ -660,6 +660,7 @@ def __init__(
         pad_to_multiple_of = None,
         resize_dimension = 0, # can be 0, 1, 'max' or 'min' (max resizes based on the max of height width, min the min size, 0 the first dim, etc)
         snap_to_patch_size = False,
+        last_response_only = False, # Train only on the last assistant turn
     ):
         if not hasattr(processor, "image_processor"):
             raise TypeError("Unsloth: UnslothVisionDataCollator is only for image models!")
@@ -733,12 +734,13 @@ def __init__(
             assert(isinstance(instruction_part, str) and isinstance(response_part, str))
             self.train_on_responses_only = _train_on_responses_only(
                 None,
-                instruction_part = instruction_part,
-                response_part    = response_part,
-                force_match      = force_match,
-                tokenizer        = processor,
-                return_function  = True,
-                num_proc         = num_proc,
+                instruction_part   = instruction_part,
+                response_part      = response_part,
+                force_match        = force_match,
+                tokenizer          = processor,
+                return_function    = True,
+                num_proc           = num_proc,
+                last_response_only = last_response_only,
             )
         else:
             self.train_on_responses_only = None