fix: mixed precision bug in EP layers (cast expert parameters to the configured param_dtype)

gbesposito · gbesposito · commit f638aecef376 · 2026-06-17T08:15:32.000+02:00
diff --git a/config_files/training/config_lorem_ipsum_long_moe_ep_fsdp2.yaml b/config_files/training/config_lorem_ipsum_long_moe_ep_fsdp2.yaml
@@ -238,6 +238,9 @@ ep_model:
     device_mesh:
       instance_key: device_mesh
       pass_type: BY_REFERENCE
+    mixed_precision_settings:
+      param_dtype: BF_16
+      reduce_dtype: BF_16
     block_names: [TransformerBlock]
 
 ac_model:
diff --git a/src/modalities/config/config.py b/src/modalities/config/config.py
@@ -338,6 +338,7 @@ class EPWrappedModelConfig(BaseModel):
     model: PydanticPytorchModuleOrListType
     block_names: list[str]
     device_mesh: PydanticDeviceMeshIFType
+    mixed_precision_settings: FSDP2MixedPrecisionSettings
 
 
 class DebuggingEnrichedModelConfig(BaseModel):
diff --git a/src/modalities/models/moe/model_factory.py b/src/modalities/models/moe/model_factory.py
@@ -1,59 +1,21 @@
 import warnings
 
-import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed._composable.fsdp import MixedPrecisionPolicy
 from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.tensor import DTensor
 
 from modalities.models.parallelism.expert_parallelism import ExpertParallel
+from modalities.running_env.env_utils import FSDP2MixedPrecisionSettings
 from modalities.running_env.fsdp.device_mesh import ParallelismDegrees, get_mesh_for_parallelism_method
 from modalities.util import get_module_class_from_name
 
 
-def _validate_moe_block_for_ep(module) -> None:
-    if not hasattr(module, "experts"):
-        raise ValueError(f"Module {type(module).__name__} has no 'experts' attribute")
-
-    experts = module.experts
-    required_attrs = ["w1", "w2"]
-    missing = [attr for attr in required_attrs if not hasattr(experts, attr)]
-    if missing:
-        raise ValueError(
-            f"Module {type(module).__name__}.experts is not grouped-experts compatible. Missing: {missing}"
-        )
-
-    if experts.w1.ndim != 3 or experts.w2.ndim != 3:
-        raise ValueError(
-            f"Expected grouped expert parameters with ndim=3. Got w1.ndim={experts.w1.ndim}, "
-            f"w2.ndim={experts.w2.ndim}"
-        )
-
-
-def _get_ep_target_module(module):
-    if hasattr(module, "experts"):
-        return module
-
-    ffn = getattr(module, "ffn", None)
-    if ffn is not None and hasattr(ffn, "experts"):
-        return ffn
-
-    return None
-
-
-def _attach_ep_metadata(module, ep_mesh) -> None:
-    setattr(module, "_ep_mesh", ep_mesh)
-    setattr(module, "_ep_group", ep_mesh.get_group())
-    setattr(module, "_ep_size", ep_mesh.size())
-    setattr(module, "_ep_rank", ep_mesh.get_local_rank())
-
-
 def get_ep_wrapped_model(
     model,
     block_names: list[str],
     device_mesh: DeviceMesh,
-    mp_param_dtype=torch.bfloat16,
-    mp_reduce_dtype=torch.bfloat16,
+    mixed_precision_settings: FSDP2MixedPrecisionSettings,
 ) -> nn.Module:
     block_types = []
     missing_block_names = []
@@ -76,34 +38,59 @@ def get_ep_wrapped_model(
         raise ValueError(f"None of the requested MoE block names were found: {block_names}")
 
     ep_mesh = get_mesh_for_parallelism_method(device_mesh, ParallelismDegrees.EP)
-    MixedPrecisionPolicy(param_dtype=mp_param_dtype, reduce_dtype=mp_reduce_dtype)
+    target_dtype = mixed_precision_settings.param_dtype.value
 
     wrapped_blocks = 0
     for module in model.modules():
         if isinstance(module, block_types):
-            ep_target_module = _get_ep_target_module(module)
-            if ep_target_module is None:
+            if hasattr(module, "experts"):
+                ep_target = module
+            elif (ffn := getattr(module, "ffn", None)) is not None and hasattr(ffn, "experts"):
+                ep_target = ffn
+            else:
                 raise ValueError(
                     f"Module {type(module).__name__} has no EP-compatible experts location. "
                     "Expected `experts` or `ffn.experts`."
                 )
 
-            if getattr(ep_target_module, "_ep_enabled", False):
+            if getattr(ep_target, "_ep_enabled", False):
                 continue
 
-            _validate_moe_block_for_ep(ep_target_module)
-            _attach_ep_metadata(ep_target_module, ep_mesh)
+            experts = ep_target.experts
+            missing = [a for a in ("w1", "w2") if not hasattr(experts, a)]
+            if missing:
+                raise ValueError(
+                    f"Module {type(ep_target).__name__}.experts is not grouped-experts compatible. Missing: {missing}"
+                )
+            if experts.w1.ndim != 3 or experts.w2.ndim != 3:
+                raise ValueError(
+                    f"Expected grouped expert parameters with ndim=3. Got w1.ndim={experts.w1.ndim}, "
+                    f"w2.ndim={experts.w2.ndim}"
+                )
+
+            ep_target._ep_mesh = ep_mesh
+            ep_target._ep_group = ep_mesh.get_group()
+            ep_target._ep_size = ep_mesh.size()
+            ep_target._ep_rank = ep_mesh.get_local_rank()
+
+            ep_target.experts = ExpertParallel()._apply(ep_target.experts, ep_mesh)
+            ep_target.experts._ep_enabled = True
 
-            ep_target_module.experts = ExpertParallel()._apply(ep_target_module.experts, ep_mesh)
-            setattr(ep_target_module.experts, "_ep_enabled", True)
+            for pname, p in list(ep_target.experts._parameters.items()):
+                if isinstance(p, DTensor) and p.dtype != target_dtype:
+                    local = p.to_local().to(target_dtype)
+                    ep_target.experts._parameters[pname] = nn.Parameter(
+                        DTensor.from_local(local, p.device_mesh, p.placements, run_check=False),
+                        requires_grad=p.requires_grad,
+                    )
 
             wrapped_blocks += 1
 
     if wrapped_blocks == 0:
         raise ValueError(f"No blocks matched the requested types: {[t.__name__ for t in block_types]}")
 
-    setattr(model, "_ep_wrapped", True)
-    setattr(model, "_ep_mesh", ep_mesh)
-    setattr(model, "_ep_num_wrapped_blocks", wrapped_blocks)
+    model._ep_wrapped = True
+    model._ep_mesh = ep_mesh
+    model._ep_num_wrapped_blocks = wrapped_blocks
 
     return model