dinov3 -> 36 errors

enpasos · enpasos · commit 7ff2e5e45d64 · 2026-03-23T15:47:27.000+01:00
diff --git a/jaxamples/mnist_config.py b/jaxamples/mnist_config.py
@@ -221,6 +221,8 @@ class MnistDinoV3ModelConfig(ConfigMixin):
     head_hidden_dim: int = 192
     head_dropout_rate: float = 0.1
     pool_features: str = "cls_mean"
+    use_conv_stem: bool = True
+    stem_hidden_dim: int = 32
 
     def validate(self) -> None:
         _require(self.img_size > 0, "img_size must be > 0.")
@@ -239,6 +241,7 @@ def validate(self) -> None:
         _require(self.num_classes >= 2, "num_classes must be >= 2.")
         _require(self.num_storage_tokens >= 0, "num_storage_tokens must be >= 0.")
         _require(self.head_hidden_dim > 0, "head_hidden_dim must be > 0.")
+        _require(self.stem_hidden_dim > 0, "stem_hidden_dim must be > 0.")
         _validate_dropout(self.head_dropout_rate, "head_dropout_rate")
         _require(
             self.pool_features in {"cls", "cls_mean"},
diff --git a/jaxamples/mnist_dinov3.py b/jaxamples/mnist_dinov3.py
@@ -56,6 +56,41 @@ def prepare_dinov3_inputs(
     return nchw_images
 
 
+class MnistDinoV3ConvStem(nnx.Module):
+    """Two-layer 3x3 conv stem (1 -> hidden_dim -> 3 channels) placed in front of the DINO backbone."""
+
+    def __init__(self, *, hidden_dim: int, rngs: nnx.Rngs):
+        params_key = rngs.params()
+        conv1_key, norm1_key, conv2_key, norm2_key = jax.random.split(params_key, 4)
+
+        self.conv1 = nnx.Conv(
+            in_features=1,
+            out_features=hidden_dim,
+            kernel_size=(3, 3),
+            padding="SAME",
+            use_bias=False,
+            rngs=nnx.Rngs(conv1_key),
+        )
+        self.norm1 = nnx.LayerNorm(hidden_dim, rngs=nnx.Rngs(norm1_key))
+        self.conv2 = nnx.Conv(
+            in_features=hidden_dim,
+            out_features=3,
+            kernel_size=(3, 3),
+            padding="SAME",
+            use_bias=False,
+            rngs=nnx.Rngs(conv2_key),
+        )
+        self.norm2 = nnx.LayerNorm(3, rngs=nnx.Rngs(norm2_key))
+
+    def __call__(self, images: jax.Array) -> jax.Array:
+        features = self.conv1(images)
+        features = self.norm1(features)
+        features = nnx.gelu(features, approximate=False)
+        features = self.conv2(features)
+        features = self.norm2(features)
+        return nnx.gelu(features, approximate=False)
+
+
 class MnistDinoV3Classifier(nnx.Module):
     """Small MNIST classifier using the DINOv3 ViT backbone from jax2onnx."""
 
@@ -72,16 +107,23 @@ def __init__(
         head_hidden_dim: int = 192,
         head_dropout_rate: float = 0.1,
         pool_features: str = "cls_mean",
+        use_conv_stem: bool = True,
+        stem_hidden_dim: int = 32,
         rngs: nnx.Rngs,
     ):
         params_key = rngs.params()
-        backbone_key, head_key = jax.random.split(params_key)
+        stem_key, backbone_key, head_key = jax.random.split(params_key, 3)
         head_norm_key, head_hidden_key, head_dropout_key, head_out_key = jax.random.split(
             head_key, 4
         )
 
         self.img_size = int(img_size)
         self.pool_features = pool_features
+        self.input_stem = (
+            MnistDinoV3ConvStem(hidden_dim=stem_hidden_dim, rngs=nnx.Rngs(stem_key))
+            if use_conv_stem
+            else None
+        )
         self.backbone = DinoVisionTransformer(
             img_size=img_size,
             patch_size=patch_size,
@@ -144,7 +186,10 @@ def _pool_head_features(self, tokens: jax.Array) -> jax.Array:
     def __call__(
         self, images: jax.Array, *, deterministic: bool = True
     ) -> jax.Array:
-        backbone_inputs = prepare_dinov3_inputs(images, expected_size=self.img_size)
+        stemmed_images = self.input_stem(images) if self.input_stem is not None else images
+        backbone_inputs = prepare_dinov3_inputs(
+            stemmed_images, expected_size=self.img_size
+        )
         tokens = self._encode_backbone(backbone_inputs)
         head_features = self._pool_head_features(tokens)
         head_features = self.head_norm(head_features)
@@ -160,30 +205,34 @@ def get_default_config() -> MnistExampleConfig:
     model_config = MnistDinoV3ModelConfig(
         img_size=28,
         patch_size=4,
-        embed_dim=192,
-        depth=4,
-        num_heads=6,
+        embed_dim=256,
+        depth=6,
+        num_heads=8,
         num_classes=10,
         num_storage_tokens=0,
-        head_hidden_dim=192,
+        head_hidden_dim=256,
         head_dropout_rate=0.1,
         pool_features="cls_mean",
+        use_conv_stem=True,
+        stem_hidden_dim=32,
     )
     checkpoint_name = (
         "dinov3_"
         f"p{model_config.patch_size}_"
         f"dim{model_config.embed_dim}_"
         f"d{model_config.depth}_"
         f"h{model_config.num_heads}_"
-        f"{model_config.pool_features}_checkpoints"
+        f"{model_config.pool_features}_"
+        f"{'stem' + str(model_config.stem_hidden_dim) if model_config.use_conv_stem else 'nostem'}_checkpoints"
     )
     return MnistExampleConfig(
         seed=5678,
         training=shared_mnist_training_config(
             checkpoint_dir=os.path.abspath(os.path.join("./data", checkpoint_name)),
             output_dir=default_output_dir,
         ),
-        # Match the ViT example more closely on token count and parameter budget.
+        # Add a local conv front-end and a slightly larger backbone to close the
+        # gap to the better-performing MNIST ViT baseline.
         model=model_config,
         onnx=OnnxConfig(
             model_name="mnist_dinov3_model",
diff --git a/onnx/mnist_dinov3_model.onnx b/onnx/mnist_dinov3_model.onnx
diff --git a/onnx/mnist_dinov3_model_config.json b/onnx/mnist_dinov3_model_config.json
@@ -1,15 +1,17 @@
 {
   "model": {
-    "depth": 4,
-    "embed_dim": 192,
+    "depth": 6,
+    "embed_dim": 256,
     "head_dropout_rate": 0.1,
-    "head_hidden_dim": 192,
+    "head_hidden_dim": 256,
     "img_size": 28,
     "num_classes": 10,
-    "num_heads": 6,
+    "num_heads": 8,
     "num_storage_tokens": 0,
     "patch_size": 4,
-    "pool_features": "cls_mean"
+    "pool_features": "cls_mean",
+    "stem_hidden_dim": 32,
+    "use_conv_stem": true
   },
   "onnx": {
     "input_params": {
@@ -30,15 +32,15 @@
   "training": {
     "augmentation": {
       "elastic_alpha": 1.2,
-      "elastic_probability": 0.35,
-      "elastic_sigma": 0.9,
+      "elastic_probability": 0.3,
+      "elastic_sigma": 1.0,
       "enable_elastic": true,
       "enable_rect_erasing": false,
       "enable_rotation": true,
       "enable_scaling": true,
       "enable_translation": true,
-      "max_rotation": 10.0,
-      "max_translation": 2.5,
+      "max_rotation": 12.0,
+      "max_translation": 4.0,
       "rect_erase_height": 2,
       "rect_erase_width": 20,
       "rect_erasing_probability": 0.0,
@@ -47,15 +49,15 @@
       "scale_max_y": 1.1,
       "scale_min_x": 0.9,
       "scale_min_y": 0.9,
-      "scaling_probability": 0.7,
+      "scaling_probability": 0.6,
       "translation_probability": 0.8
     },
     "base_learning_rate": 0.0001,
     "batch_size": 64,
-    "checkpoint_dir": "/home/enpasos/projects/jaxamples/data/dinov3_p4_dim192_d4_h6_cls_mean_checkpoints",
+    "checkpoint_dir": "/home/enpasos/projects/jaxamples/data/dinov3_p4_dim256_d6_h8_cls_mean_stem32_checkpoints",
     "data_dir": "./data",
     "enable_training": true,
-    "num_epochs_to_train_now": 500,
+    "num_epochs_to_train_now": 700,
     "output_dir": "/home/enpasos/projects/jaxamples/output",
     "start_epoch": 0,
     "warmup_epochs": 5,
diff --git a/tests/test_mnist_dinov3.py b/tests/test_mnist_dinov3.py
@@ -104,15 +104,17 @@ def test_mnist_dinov3_default_config_uses_fairer_budget():
     assert config.training.num_epochs_to_train_now == 700
     assert config.training.weight_decay == pytest.approx(1e-4)
     assert config.training.checkpoint_dir.endswith(
-        "dinov3_p4_dim192_d4_h6_cls_mean_checkpoints"
+        "dinov3_p4_dim256_d6_h8_cls_mean_stem32_checkpoints"
     )
     assert config.model.patch_size == 4
-    assert config.model.embed_dim == 192
-    assert config.model.depth == 4
-    assert config.model.num_heads == 6
-    assert config.model.head_hidden_dim == 192
+    assert config.model.embed_dim == 256
+    assert config.model.depth == 6
+    assert config.model.num_heads == 8
+    assert config.model.head_hidden_dim == 256
     assert config.model.head_dropout_rate == pytest.approx(0.1)
     assert config.model.pool_features == "cls_mean"
+    assert config.model.use_conv_stem is True
+    assert config.model.stem_hidden_dim == 32
 
 
 def test_lr_schedule_applies_warmup_before_cosine_decay():