feat: harden weight tying against misconfigurations

le1nux · le1nux · commit 33c55a43d3f5 · 2026-06-15T11:50:30.000+02:00
diff --git a/src/modalities/config/config.py b/src/modalities/config/config.py
@@ -34,6 +34,7 @@
     PydanticTokenizerIFType,
 )
 from modalities.config.utils import parse_torch_device
+from modalities.models.weight_tying import has_tied_word_embeddings
 from modalities.running_env.env_utils import (
     FSDP2MixedPrecisionSettings,
     MixedPrecisionSettings,
@@ -342,6 +343,13 @@ def validate_tp_mesh_existence(self) -> "GPT2ModelTPConfig":
             raise ValueError("data_parallel_replicate_degree > 1 cannot be used with Tensor Parallelism.")
         return self
 
+    @model_validator(mode="after")
+    def validate_untied_word_embeddings(self) -> "GPT2ModelTPConfig":
+        models = self.model if isinstance(self.model, list) else [self.model]
+        if any(has_tied_word_embeddings(model) for model in models):
+            raise ValueError("Tied word embeddings are not supported with Tensor Parallelism.")
+        return self
+
 
 class CompiledModelConfig(BaseModel):
     model: PydanticPytorchModuleOrListType
diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py
@@ -938,6 +938,12 @@ def __init__(
                 self.transformer.lm_head.weight
             )  # https://paperswithcode.com/method/weight-tying
 
+    @property
+    def has_tied_word_embeddings(self) -> bool:
+        token_embedding_weight = getattr(self.transformer.wte, "weight", None)
+        lm_head_weight = getattr(self.transformer.lm_head, "weight", None)
+        return token_embedding_weight is not None and token_embedding_weight is lm_head_weight
+
     @overload
     def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
         """
diff --git a/src/modalities/models/model.py b/src/modalities/models/model.py
@@ -46,6 +46,11 @@ def weight_decay_groups(self) -> WeightDecayGroups:
         """
         return self._weight_decay_groups
 
+    @property
+    def has_tied_word_embeddings(self) -> bool:
+        """Whether the model currently uses tied token embedding and output weights."""
+        return False
+
     @abstractmethod
     def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
         """
diff --git a/src/modalities/models/parallelism/pipeline_parallelism_configs.py b/src/modalities/models/parallelism/pipeline_parallelism_configs.py
@@ -1,6 +1,6 @@
 from typing import Annotated
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 
 from modalities.config.pydantic_if_types import (
     PydanticDeviceMeshIFType,
@@ -11,6 +11,7 @@
     PydanticStagesGeneratorType,
 )
 from modalities.models.parallelism.pipeline_parallelism import PipelineSelectionTypes
+from modalities.models.weight_tying import has_tied_word_embeddings
 from modalities.utils.deprecated_alias import add_deprecated_alias
 
 
@@ -26,6 +27,12 @@ class StagedPipelineConfig(BaseModel):
     pp_schedule_name: str
     num_layers_per_stage: Annotated[int, Field(strict=True, ge=1)]
 
+    @model_validator(mode="after")
+    def validate_untied_word_embeddings(self) -> "StagedPipelineConfig":
+        if has_tied_word_embeddings(self.whole_model):
+            raise ValueError("Tied word embeddings are not supported with Pipeline Parallelism.")
+        return self
+
 
 class ScheduledPipelineConfig(BaseModel):
     loss_fn: PydanticLossIFType
diff --git a/src/modalities/models/weight_tying.py b/src/modalities/models/weight_tying.py
@@ -0,0 +1,11 @@
+import torch.nn as nn
+
+
+def has_tied_word_embeddings(model: nn.Module) -> bool:
+    model_has_tied_word_embeddings = getattr(model, "has_tied_word_embeddings", None)
+    if model_has_tied_word_embeddings is None:
+        raise TypeError(
+            f"{type(model).__name__} must define 'has_tied_word_embeddings' to be used with tied-embedding validation."
+        )
+
+    return bool(model_has_tied_word_embeddings)
diff --git a/tests/test_weight_tying.py b/tests/test_weight_tying.py
@@ -1,6 +1,9 @@
 import pytest
 import torch.nn as nn
+from pydantic import ValidationError
+from torch.distributed.device_mesh import DeviceMesh
 
+from modalities.config.config import GPT2ModelTPConfig
 from modalities.models.components.layer_norms import LayerNormConfig
 from modalities.models.gpt2.gpt2_model import (
     GPT2LLM,
@@ -11,6 +14,10 @@
     PositionTypes,
 )
 from modalities.models.model import ActivationType
+from modalities.models.parallelism.pipeline_parallelism_configs import StagedPipelineConfig
+from modalities.models.parallelism.stages_generator import GPT2LLMStagesGenerator
+from modalities.models.weight_tying import has_tied_word_embeddings
+from modalities.running_env.fsdp.device_mesh import ParallelismDegrees
 
 VOCAB_SIZE = 1000
 EMBEDDING_DIM = 64
@@ -79,9 +86,17 @@ def create_gpt2_model(use_weight_tying: bool) -> GPT2LLM:
     )
 
 
+def create_device_mesh_stub(*mesh_dim_names: str) -> DeviceMesh:
+    device_mesh = DeviceMesh.__new__(DeviceMesh)
+    device_mesh.mesh_dim_names = mesh_dim_names
+    return device_mesh
+
+
 @pytest.mark.parametrize("use_weight_tying", [True, False])
 def test_weight_tying_behavior(use_weight_tying):
     model = create_gpt2_model(use_weight_tying)
+    assert model.has_tied_word_embeddings is use_weight_tying
+
     if use_weight_tying:
         assert (
             model.transformer.wte.weight is model.transformer.lm_head.weight
@@ -118,3 +133,52 @@ def test_weight_tying_named_parameters(use_weight_tying):
         assert (
             "transformer.lm_head.weight" in named_params
         ), "transformer.lm_head.weight should appear in named_parameters when weight tying is not used."
+
+
+def test_has_tied_word_embeddings_requires_model_capability():
+    with pytest.raises(TypeError, match="must define 'has_tied_word_embeddings'"):
+        has_tied_word_embeddings(nn.Linear(1, 1))
+
+
+def test_tp_config_rejects_tied_word_embeddings():
+    model = create_gpt2_model(use_weight_tying=True)
+    device_mesh = create_device_mesh_stub(ParallelismDegrees.TP.value)
+
+    with pytest.raises(ValidationError, match="Tied word embeddings are not supported with Tensor Parallelism"):
+        GPT2ModelTPConfig(model=model, device_mesh=device_mesh)
+
+
+def test_tp_config_allows_untied_word_embeddings():
+    model = create_gpt2_model(use_weight_tying=False)
+    device_mesh = create_device_mesh_stub(ParallelismDegrees.TP.value)
+
+    GPT2ModelTPConfig(model=model, device_mesh=device_mesh)
+
+
+def test_pp_config_rejects_tied_word_embeddings():
+    model = create_gpt2_model(use_weight_tying=True)
+    device_mesh = create_device_mesh_stub(ParallelismDegrees.PP.value)
+
+    with pytest.raises(ValidationError, match="Tied word embeddings are not supported with Pipeline Parallelism"):
+        StagedPipelineConfig(
+            whole_model=model,
+            stages_generator=GPT2LLMStagesGenerator(num_model_layers=model.n_layer),
+            device_mesh=device_mesh,
+            local_rank=0,
+            pp_schedule_name="gpipe",
+            num_layers_per_stage=1,
+        )
+
+
+def test_pp_config_allows_untied_word_embeddings():
+    model = create_gpt2_model(use_weight_tying=False)
+    device_mesh = create_device_mesh_stub(ParallelismDegrees.PP.value)
+
+    StagedPipelineConfig(
+        whole_model=model,
+        stages_generator=GPT2LLMStagesGenerator(num_model_layers=model.n_layer),
+        device_mesh=device_mesh,
+        local_rank=0,
+        pp_schedule_name="gpipe",
+        num_layers_per_stage=1,
+    )