test: Add test configs for context parallelism

rrutmann · rrutmann · commit 3cc6a72c7245 · 2026-05-12T17:57:24.000Z
diff --git a/tests/fsdp2_parallelization/cp_test_configs/cp_config.yaml b/tests/fsdp2_parallelization/cp_test_configs/cp_config.yaml
@@ -0,0 +1,98 @@
+model:
+  component_key: model
+  variant_key: fsdp2_wrapped
+  config:
+    model:
+      instance_key: gpt2_cp_model
+      pass_type: BY_REFERENCE
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    mixed_precision_settings:
+      param_dtype: FP_32 # must be FP32 in the test to prevent rounding issues
+      reduce_dtype: FP_32 # must be FP32 in the test to prevent rounding issues
+    block_names: [GPT2Block]
+
+gpt2_cp_model:
+  component_key: model
+  variant_key: gpt2_cp
+  config:
+    model:
+      instance_key: initialized_model
+      pass_type: BY_REFERENCE
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+
+
+initialized_model:
+  component_key: model
+  variant_key: model_initialized
+  config:
+    model:
+      instance_key: model_raw
+      pass_type: BY_REFERENCE
+    model_initializer:
+      component_key: model_initialization
+      variant_key: composed
+      config:
+        model_type: gpt2
+        weight_init_type: scaled
+        mean: 0.0
+        std: 0.02
+        num_layers: ${model_raw.config.n_layer}
+
+model_raw:
+  component_key: model
+  variant_key: gpt2
+  config:
+    use_meta_device: false
+    use_weight_tying: false
+    sample_key: input_ids
+    poe_type: NOPE
+    sequence_length: 1024
+    prediction_key: logits
+    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    n_layer: 5
+    n_head_q: 8
+    n_head_kv: 4
+    ffn_hidden: 256
+    n_embd: 256
+    dropout: 0.0
+    bias: false # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+    attention_config:
+      qkv_transforms:
+        - type_hint: RotaryTransform
+          config:
+            n_embd: ${model_raw.config.n_embd}
+            n_head: ${model_raw.config.n_head_q} #it has to be head_q here
+            seq_length_dim: -2
+            base_freq: 10000
+    attention_implementation: pytorch_flash
+    activation_type: swiglu
+    attention_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+    ffn_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+    lm_head_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+
+device_mesh:
+  component_key: device_mesh
+  variant_key: default
+  config:
+    device_type: cuda
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: -1
+    context_parallel_degree: 2
+    tensor_parallel_degree: 1
+    world_size: 4
diff --git a/tests/fsdp2_parallelization/cp_test_configs/fsdp2_config.yaml b/tests/fsdp2_parallelization/cp_test_configs/fsdp2_config.yaml
@@ -0,0 +1,87 @@
+model:
+  component_key: model
+  variant_key: fsdp2_wrapped
+  config:
+    model:
+      instance_key: initialized_model
+      pass_type: BY_REFERENCE
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    mixed_precision_settings:
+      param_dtype: FP_32    # must be FP32 in the test to prevent rounding issues
+      reduce_dtype: FP_32   # must be FP32 in the test to prevent rounding issues
+    block_names: [GPT2Block]
+
+
+initialized_model:
+  component_key: model
+  variant_key: model_initialized
+  config:
+    model:
+      instance_key: model_raw
+      pass_type: BY_REFERENCE
+    model_initializer:
+      component_key: model_initialization
+      variant_key: composed
+      config:
+        model_type: gpt2
+        weight_init_type: scaled
+        mean: 0.0
+        std: 0.02
+        num_layers: ${model_raw.config.n_layer}
+
+model_raw:
+  component_key: model
+  variant_key: gpt2
+  config:
+    use_meta_device: false
+    use_weight_tying: false
+    sample_key: input_ids
+    poe_type: NOPE
+    sequence_length: 1024
+    prediction_key: logits
+    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    n_layer: 5
+    n_head_q: 8
+    n_head_kv: 4
+    ffn_hidden: 256
+    n_embd: 256
+    dropout: 0.0
+    bias: false # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+    attention_config:
+      qkv_transforms:
+        - type_hint: RotaryTransform
+          config:
+            n_embd: ${model_raw.config.n_embd}
+            n_head: ${model_raw.config.n_head_q} #it has to be head_q here
+            seq_length_dim: -2
+            base_freq: 10000
+    attention_implementation: pytorch_flash
+    activation_type: swiglu
+    attention_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+    ffn_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+    lm_head_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+
+device_mesh:
+  component_key: device_mesh
+  variant_key: default
+  config:
+    device_type: cuda
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: -1
+    context_parallel_degree: 1
+    tensor_parallel_degree: 1
+    world_size: 4