
Commit 8d83a09

Merge branch 'main' into tcc/from_config

2 parents 21ec50a + 6059dfb

7 files changed: +120 −102 lines

setup.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ def get_default_dependencies():
             "torch>=2.6.0",
         ]
     elif platform == "npu":
-        return ["torch_npu==2.6.0", "triton-ascend"]
+        return ["torch_npu==2.7.1", "triton-ascend"]


 def get_optional_dependencies():

src/liger_kernel/chunked_loss/cosine_similarity_loss.py

Lines changed: 7 additions & 1 deletion
@@ -9,7 +9,13 @@

 class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase):
     @staticmethod
-    def distillation_loss_fn(student_logits, teacher_logits, beta=1.0):
+    def distillation_loss_fn(
+        student_logits,
+        teacher_logits,
+        target=None,
+        ignore_index=None,
+        beta=1.0,
+    ):
         """
         Compute Cosine loss (Cosine Similarity Loss).
         Args:
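
For orientation, here is a minimal standalone sketch of a cosine-similarity distillation loss with the widened signature. The L2 normalization mirrors the reference implementation in the test file further down; the masking via target/ignore_index is an assumption about how the new parameters are meant to be used, not the kernel's actual code:

    import torch
    import torch.nn.functional as F

    def cosine_distillation_loss(student_logits, teacher_logits, target=None, ignore_index=None, beta=1.0):
        # L2-normalize both logit vectors and penalize 1 - cosine similarity.
        student_norm = F.normalize(student_logits, p=2, dim=-1)
        teacher_norm = F.normalize(teacher_logits, p=2, dim=-1)
        loss = beta * (1.0 - (student_norm * teacher_norm).sum(dim=-1))
        if target is not None and ignore_index is not None:
            # Assumed semantics: drop positions whose label equals ignore_index.
            loss = loss[target != ignore_index]
        return loss.sum()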

src/liger_kernel/chunked_loss/fused_linear_distillation.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
     def distillation_loss_fn(
         student_logits,
         teacher_logits,
+        target=None,
+        ignore_index=None,
     ):
         """
         Compute distillation loss.
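
The two new keyword arguments widen the contract that every subclass's loss function implements, so chunk-level callers can pass label information through uniformly. A hedged sketch of a subclass conforming to the new signature (the subclass and its KL loss are hypothetical, for illustration only):

    import torch.nn.functional as F
    from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase

    class MyDistillationFunction(LigerFusedLinearDistillationBase):  # hypothetical subclass
        @staticmethod
        def distillation_loss_fn(student_logits, teacher_logits, target=None, ignore_index=None):
            # Soft-target KL divergence; target/ignore_index are accepted so the
            # base class can forward them, even if this particular loss ignores them.
            return F.kl_div(
                F.log_softmax(student_logits, dim=-1),
                F.softmax(teacher_logits, dim=-1),
                reduction="sum",
            )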

src/liger_kernel/transformers/monkey_patch.py

Lines changed: 38 additions & 33 deletions
@@ -430,7 +430,7 @@ def apply_liger_kernel_to_llava(
                     f"These parameters are not supported by {text_model_name}. Enter the remaining {list(text_kwargs.keys())} except for {list(remain_params)}\n"
                     f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
                 )
-            text_kwargs["model"] = model.language_model
+            text_kwargs["model"] = model.model.language_model
             text_liger_fn(**text_kwargs)
         elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
             logger.warning(f"{text_model_name} is not supported by Liger kernel.")
@@ -445,7 +445,7 @@ def apply_liger_kernel_to_llava(
                     f"These parameters are not supported by {vision_model_name}. Enter the remaining {list(vision_kwargs.keys())} except for {list(remain_params)}\n"
                     f"Parameters accepted by {vision_model_name}: {list(accept_params.keys())}"
                 )
-            vision_kwargs["model"] = model.vision_tower
+            vision_kwargs["model"] = model.model.vision_tower
             vision_liger_fn(**vision_kwargs)
         elif vision_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
             logger.warning(f"{vision_model_name} is not supported by Liger kernel.")
@@ -615,8 +615,8 @@ def apply_liger_kernel_to_mllama(
         # instance variables that reference already-instantiated modules

         if isinstance(model, MllamaForConditionalGeneration):
-            language_model: MllamaForCausalLM = model.language_model
-            vision_model: MllamaVisionModel = model.vision_model
+            language_model: MllamaForCausalLM = model.model.language_model
+            vision_model: MllamaVisionModel = model.model.vision_model
             if isinstance(language_model, MllamaForCausalLM):
                 text_model: MllamaTextModel = language_model.model
             else:
@@ -1118,8 +1118,8 @@ def apply_liger_kernel_to_gemma3(
         # instance variables that reference already-instantiated modules

         if isinstance(model, Gemma3ForConditionalGeneration):
-            if isinstance(model.vision_tower, SiglipVisionModel):
-                vision_tower = model.vision_tower
+            if isinstance(model.model.vision_tower, SiglipVisionModel):
+                vision_tower = model.model.vision_tower

                 _patch_layer_norm_module(vision_tower.vision_model.post_layernorm)

@@ -1132,15 +1132,15 @@ def apply_liger_kernel_to_gemma3(
                 raise TypeError("The vision tower must be SiglipVisionModel")

             if rms_norm:
-                _patch_rms_norm_module_for_gemma3(model.multi_modal_projector.mm_soft_emb_norm)
+                _patch_rms_norm_module_for_gemma3(model.model.multi_modal_projector.mm_soft_emb_norm)

             apply_liger_kernel_to_gemma3_text(
                 rope=rope,
                 cross_entropy=False,
                 fused_linear_cross_entropy=False,
                 rms_norm=rms_norm,
                 geglu=geglu,
-                model=model.language_model,
+                model=model.model.language_model,
             )

         else:
@@ -1228,7 +1228,7 @@ def apply_liger_kernel_to_paligemma(
     if not isinstance(model, PaliGemmaForConditionalGeneration):
         raise TypeError("model have to be of type PaliGemmaForConditionalGeneration")

-    vision_tower: SiglipVisionModel = model.vision_tower
+    vision_tower: SiglipVisionModel = model.model.vision_tower

     _patch_layer_norm_module(vision_tower.vision_model.post_layernorm)

@@ -1238,7 +1238,7 @@ def apply_liger_kernel_to_paligemma(
         _patch_layer_norm_module(layer.layer_norm1)
         _patch_layer_norm_module(layer.layer_norm2)

-    language_model = model.language_model
+    language_model = model.model.language_model

     if isinstance(language_model, (GemmaForCausalLM, GemmaModel)):
         apply_liger_kernel_to_gemma(
@@ -1593,11 +1593,10 @@ def apply_liger_kernel_to_qwen2_vl(
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules
-
-        if isinstance(model, (Qwen2VLForConditionalGeneration, Qwen2VLModel)):
-            # Note: language_model and visual properties can be accessed throught conditional class for BC.
-            # Not sure if it is subject to changes in the future.
-            # Reference: https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1698
+        if isinstance(model, Qwen2VLForConditionalGeneration):
+            text_model: Qwen2VLTextModel = model.model.language_model
+            vision_model: Qwen2VisionTransformerPretrainedModel = model.model.visual
+        elif isinstance(model, Qwen2VLModel):
             text_model: Qwen2VLTextModel = model.language_model
             vision_model: Qwen2VisionTransformerPretrainedModel = model.visual
         elif isinstance(model, Qwen2VLTextModel):
@@ -1684,11 +1683,10 @@ def apply_liger_kernel_to_qwen2_5_vl(
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules
-
-        if isinstance(model, (Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel)):
-            # Note: language_model and visual properties can be accessed throught conditional class for BC.
-            # Not sure if it is subject to changes in the future.
-            # Reference: https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1823
+        if isinstance(model, Qwen2_5_VLForConditionalGeneration):
+            text_model: Qwen2_5_VLTextModel = model.model.language_model
+            vision_model: Qwen2_5_VisionTransformerPretrainedModel = model.model.visual
+        elif isinstance(model, Qwen2_5_VLModel):
             text_model: Qwen2_5_VLTextModel = model.language_model
             vision_model: Qwen2_5_VisionTransformerPretrainedModel = model.visual
         elif isinstance(model, Qwen2_5_VLTextModel):
@@ -1702,7 +1700,7 @@ def apply_liger_kernel_to_qwen2_5_vl(

         if vision_model is not None:
             # Patch Qwen2_5_VisionTransformerPretrainedModel
-            for vision_block in model.visual.blocks:
+            for vision_block in vision_model.blocks:
                 if rms_norm:
                     _patch_rms_norm_module(vision_block.norm1)
                     _patch_rms_norm_module(vision_block.norm2)
@@ -1771,7 +1769,9 @@ def apply_liger_kernel_to_qwen3_vl(
         modeling_qwen3_vl.Qwen3VLForConditionalGeneration.forward = qwen3_vl_lce_forward

     if model is not None and rms_norm:
-        if isinstance(model, (Qwen3VLForConditionalGeneration, Qwen3VLModel)):
+        if isinstance(model, Qwen3VLForConditionalGeneration):
+            text_model: Qwen3VLTextModel = model.model.language_model
+        elif isinstance(model, Qwen3VLModel):
             text_model: Qwen3VLTextModel = model.language_model
         elif isinstance(model, Qwen3VLTextModel):
             text_model = model
@@ -1846,7 +1846,9 @@ def apply_liger_kernel_to_qwen3_vl_moe(
         modeling_qwen3_vl_moe.Qwen3VLMoeForConditionalGeneration.forward = qwen3_vl_moe_lce_forward

     if model is not None and rms_norm:
-        if isinstance(model, (Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeModel)):
+        if isinstance(model, Qwen3VLMoeForConditionalGeneration):
+            text_model: Qwen3VLMoeTextModel = model.model.language_model
+        elif isinstance(model, Qwen3VLMoeModel):
             text_model: Qwen3VLMoeTextModel = model.language_model
         elif isinstance(model, Qwen3VLMoeTextModel):
             text_model = model
@@ -2191,10 +2193,10 @@ def apply_liger_kernel_to_glm4v(
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules
-        if isinstance(model, (Glm4vForConditionalGeneration, Glm4vModel)):
-            # Note: language_model and visual properties can be accessed throught conditional class for BC.
-            # Not sure if it is subject to changes in the future.
-            # Reference: https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4v/modeling_glm4v.py#L1305
+        if isinstance(model, Glm4vForConditionalGeneration):
+            text_model: Glm4vTextModel = model.model.language_model
+            vision_model: Glm4vVisionModel = model.model.visual
+        elif isinstance(model, Glm4vModel):
             text_model: Glm4vTextModel = model.language_model
             vision_model: Glm4vVisionModel = model.visual
         elif isinstance(model, Glm4vTextModel):
@@ -2281,10 +2283,11 @@ def apply_liger_kernel_to_glm4v_moe(
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules
-        if isinstance(model, (Glm4vMoeForConditionalGeneration, Glm4vMoeModel)):
-            # Note: language_model and visual properties can be accessed throught conditional class for BC.
-            # Not sure if it is subject to changes in the future.
-            # Reference: https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py#L337
+        if isinstance(model, Glm4vMoeForConditionalGeneration):
+            text_model: Glm4vMoeTextModel = model.model.language_model
+            vision_model: Glm4vMoeVisionModel = model.model.visual
+            Glm4vMoeTextMoE = modeling_glm4v_moe.Glm4vMoeTextMoE
+        elif isinstance(model, Glm4vMoeModel):
             text_model: Glm4vMoeTextModel = model.language_model
             vision_model: Glm4vMoeVisionModel = model.visual
             Glm4vMoeTextMoE = modeling_glm4v_moe.Glm4vMoeTextMoE
@@ -2387,8 +2390,10 @@ def apply_liger_kernel_to_internvl(
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules
-        if isinstance(model, (InternVLForConditionalGeneration, InternVLModel)):
-            # NOTE: language_model and visual properties can be accessed throught conditional class.
+        if isinstance(model, InternVLForConditionalGeneration):
+            text_model = model.model.language_model
+            vision_model: InternVLVisionModel = model.model.vision_tower
+        elif isinstance(model, InternVLModel):
             text_model = model.language_model
             vision_model: InternVLVisionModel = model.vision_tower
         else:
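
All of these hunks apply the same fix: the *ForConditionalGeneration wrappers now reach their text and vision submodules through an inner .model attribute (which appears to reflect the submodule layout in newer transformers releases), while the bare *Model classes still expose language_model and visual/vision_tower directly. The combined isinstance check is therefore split into one branch per attribute path. A generic sketch of the pattern (the helper name is hypothetical; the real code inlines this per model family):

    def _resolve_submodules(model, wrapper_cls, base_cls, text_cls):
        # Hypothetical helper illustrating the per-class attribute resolution above.
        if isinstance(model, wrapper_cls):
            # Wrapper classes nest the multimodal model one level down.
            return model.model.language_model, model.model.visual
        elif isinstance(model, base_cls):
            return model.language_model, model.visual
        elif isinstance(model, text_cls):
            # Text-only model: no vision tower to patch.
            return model, None
        raise TypeError(f"Unsupported model type: {type(model)}")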

test/chunked_loss/test_cosine_loss.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def __init__(
             temperature=temperature,
         )

-    def distillation_loss(self, student_logits, teacher_logits, beta=1.0):
+    def distillation_loss(self, student_logits, teacher_logits, target=None, ignore_index=None, beta=1.0, **kwargs):
         # Compute normalized logits
         print(f"student_logits.shape: {student_logits.shape}")
         student_norm = F.normalize(student_logits, p=2, dim=-1)
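
Since the updated reference signature accepts the new keywords and swallows any future ones via **kwargs, existing call sites keep working unchanged. A minimal standalone sketch of that compatibility (the function body mirrors the normalization step above; the mean reduction and -100 ignore_index are assumptions borrowed from common conventions, not the test's actual values):

    import torch
    import torch.nn.functional as F

    def distillation_loss(student_logits, teacher_logits, target=None, ignore_index=None, beta=1.0, **kwargs):
        # Cosine distance between L2-normalized student and teacher logits.
        student_norm = F.normalize(student_logits, p=2, dim=-1)
        teacher_norm = F.normalize(teacher_logits, p=2, dim=-1)
        return beta * (1.0 - (student_norm * teacher_norm).sum(dim=-1)).mean()

    s, t = torch.randn(4, 128), torch.randn(4, 128)
    labels = torch.randint(0, 128, (4,))

    distillation_loss(s, t)                                    # old call style still works
    distillation_loss(s, t, target=labels, ignore_index=-100)  # new keywords are accepted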
