Skip to content

Commit 1c013e2

Browse files
[Feature] Use Liger's Relu_Squared kernel for Nemotron models (#1176)
## Summary <!--- This is a required section; please describe the main purpose of this proposed code change. ---> Use relu_squared in nemotron. This PR is generated using the liger-autopatch skill and tests the changes in #1177 . <!--- ## Details This is an optional section; is there anything specific that reviewers should be aware of? ---> Class patching and instance patching of relu_squared function. ## Testing Done <!--- This is a required section; please describe how this change was tested. ---> <!-- Replace BLANK with your device type. For example, A100-80G-PCIe Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them. --> - Hardware Type: H100 - [x] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [x] run `make test-convergence` to ensure convergence --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f16c9f7 commit 1c013e2

File tree

3 files changed: +22 −6 lines changed

3 files changed: +22 −6 lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ loss.backward()
253253
| Ministral | `liger_kernel.transformers.apply_liger_kernel_to_ministral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
254254
| Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
255255
| Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
256-
| Nemotron | `liger_kernel.transformers.apply_liger_kernel_to_nemotron` | CrossEntropyLoss, FusedLinearCrossEntropy |
256+
| Nemotron | `liger_kernel.transformers.apply_liger_kernel_to_nemotron` | ReLUSquared, CrossEntropyLoss, FusedLinearCrossEntropy |
257257
| Pixtral | `liger_kernel.transformers.apply_liger_kernel_to_pixtral` | RoPE, RMSNorm, SwiGLU|
258258
| Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
259259
| Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |

src/liger_kernel/transformers/monkey_patch.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from liger_kernel.transformers.model.qwen2 import lce_forward as qwen2_lce_forward
3030
from liger_kernel.transformers.model.smollm3 import lce_forward as smollm3_lce_forward
3131
from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb
32+
from liger_kernel.transformers.relu_squared import LigerReLUSquared
3233
from liger_kernel.transformers.rms_norm import LigerRMSNorm
3334
from liger_kernel.transformers.rope import liger_rotary_pos_emb
3435
from liger_kernel.transformers.rope import liger_rotary_pos_emb_vision
@@ -748,6 +749,7 @@ def apply_liger_kernel_to_mistral(
748749

749750

750751
def apply_liger_kernel_to_nemotron(
752+
relu_squared: bool = True,
751753
cross_entropy: bool = False,
752754
fused_linear_cross_entropy: bool = True,
753755
model: PreTrainedModel = None,
@@ -756,12 +758,12 @@ def apply_liger_kernel_to_nemotron(
756758
"""
757759
Apply Liger kernels to replace original implementation in HuggingFace Nemotron models.
758760
759-
Note: Nemotron uses a non-gated MLP (squared ReLU) and NemotronLayerNorm1P (LayerNorm with +1 offset),
760-
which are not currently supported by Liger kernels. RoPE is also not patched because Nemotron uses
761-
partial rotary embeddings (partial_rotary_factor=0.5) which the Liger RoPE kernel does not support.
762-
Only cross entropy optimizations are applied.
761+
Note: NemotronLayerNorm1P (LayerNorm with +1 offset) is not currently supported by Liger kernels.
762+
RoPE is also not patched because Nemotron uses partial rotary embeddings
763+
(partial_rotary_factor=0.5) which the Liger RoPE kernel does not support.
763764
764765
Args:
766+
relu_squared (bool): Whether to apply Liger's ReLU squared activation. Default is True.
765767
cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
766768
fused_linear_cross_entropy (bool):
767769
Whether to apply Liger's fused linear cross entropy loss. Default is True.
@@ -776,6 +778,9 @@ def apply_liger_kernel_to_nemotron(
776778

777779
from transformers.models.nemotron import modeling_nemotron
778780

781+
if relu_squared:
782+
modeling_nemotron.ACT2FN["relu2"] = LigerReLUSquared
783+
779784
if cross_entropy:
780785
modeling_nemotron.CrossEntropyLoss = LigerCrossEntropyLoss
781786
if fused_linear_cross_entropy:
@@ -784,6 +789,11 @@ def apply_liger_kernel_to_nemotron(
784789
else:
785790
modeling_nemotron.NemotronForCausalLM.forward = nemotron_lce_forward
786791

792+
if model is not None:
793+
for decoder_layer in model.model.layers:
794+
if relu_squared:
795+
decoder_layer.mlp.act_fn = LigerReLUSquared()
796+
787797

788798
def apply_liger_kernel_to_mixtral(
789799
rope: bool = True,

test/transformers/test_monkey_patch.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3275,6 +3275,7 @@ def test_apply_liger_kernel_to_instance_for_hunyuan_v1_dense():
32753275
@pytest.mark.skipif(not is_nemotron_available(), reason="nemotron not available")
32763276
def test_apply_liger_kernel_to_instance_for_nemotron():
32773277
from liger_kernel.transformers.model.nemotron import lce_forward as nemotron_lce_forward
3278+
from liger_kernel.transformers.relu_squared import LigerReLUSquared
32783279

32793280
# Ensure any monkey patching is cleaned up for subsequent tests
32803281
with patch("transformers.models.nemotron.modeling_nemotron"):
@@ -3292,14 +3293,19 @@ def test_apply_liger_kernel_to_instance_for_nemotron():
32923293

32933294
# Check that model instance variables are not yet patched with Liger modules
32943295
assert inspect.getsource(dummy_model_instance.forward) != inspect.getsource(nemotron_lce_forward)
3296+
for decoder_layer in dummy_model_instance.model.layers:
3297+
assert not isinstance(decoder_layer.mlp.act_fn, LigerReLUSquared)
32953298

32963299
# Test applying kernels to the model instance
3297-
# Nemotron only supports rope and fused_linear_cross_entropy patching
32983300
_apply_liger_kernel_to_instance(model=dummy_model_instance)
32993301

33003302
# Check that the model's forward was correctly patched
33013303
assert inspect.getsource(dummy_model_instance.forward) == inspect.getsource(nemotron_lce_forward)
33023304

3305+
# Check that the activation function was correctly patched
3306+
for decoder_layer in dummy_model_instance.model.layers:
3307+
assert isinstance(decoder_layer.mlp.act_fn, LigerReLUSquared)
3308+
33033309
try:
33043310
print(dummy_model_instance)
33053311
except Exception as e:

0 commit comments

Comments (0)