add KLShampoo

ClashLuke · ClashLuke · commit b35bd9336c10 · 2026-04-27T01:13:06.000+02:00
diff --git a/heavyball/__init__.py b/heavyball/__init__.py
@@ -737,6 +737,60 @@ def __init__(
         self._build_soap_defaults(locals(), fns=(C.scale_by_kl_soap,))
 
 
+class KLShampoo(SOAPBase):
+    """
+    KL-Shampoo
+
+    Shampoo with KL-corrected Kronecker factor accumulation, applied directly as
+    ⊗_i Q[i] diag(d_i^{-1/2}) Q[i].T to a momentum-EMA gradient. Unlike KL-SOAP,
+    no Adam runs in the projected space, and the eigenvalues d_i = diag(Q[i].T @ GG[i] @ Q[i])
+    are the preconditioner. Each GG[i] is seeded with init_factor * I so the first preconditioner is
+    uniform (1/sqrt(init_factor) * I per side) instead of exploding along a rank-1 seed's null space.
+
+    Sources:
+        KL-Shampoo:
+            Understanding and Improving Shampoo and SOAP via Kullback-Leibler Minimization
+            Wu Lin, Scott C. Lowe, Felix Dangel, Runa Eschenhagen, Zikun Xu, Roger B. Grosse
+            https://arxiv.org/abs/2509.03378
+    """
+
+    def __init__(
+        self,
+        params,
+        lr: float = 3e-3,
+        betas=(0.9, 0.95),
+        shampoo_beta: float = 0.95,
+        eps: float = 1e-8,
+        weight_decay: float = 0.01,
+        precondition_frequency: int = 2,
+        max_precond_dim: int = 2048,
+        merge_dims: bool = True,
+        precondition_1d: bool = False,
+        warmup_steps: int = 0,
+        split: bool = False,
+        multi_tensor: bool = True,
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        palm: bool = C.use_default,
+        precond_scheduler=(1 / 3, 9),
+        beta2_scale: float = 0.8,
+        use_precond_schedule: bool = C.use_default,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        storage_dtype: str = "float32",
+        precond_grad_accum: bool = False,
+        compile_step: bool = C.use_default,
+        promote: bool = C.use_default,
+        ecc: str | None = None,
+        param_ecc: str | None = None,
+        orig_shapes: ShapeMap | None = None,
+        init_factor: float = 0.1,  # GG seed scale: each factor starts as init_factor * I
+        **kwargs,
+    ):
+        self._build_soap_defaults(locals(), fns=(C.scale_by_kl_shampoo,))
+
+
 class SOAPNAdam(SOAPBase):
     def __init__(
         self,
diff --git a/heavyball/chainable.py b/heavyball/chainable.py
@@ -1073,7 +1073,9 @@ def scion_auto_norm(group, update, grad, param, scion_state):
 
 
 def _init_soap(state, group, update, grad, param):
-    utils.init_preconditioner(grad, state, group["max_precond_dim"], group["precondition_1d"])
+    utils.init_preconditioner(  # init_factor falls back to 0.0 for groups that do not define it
+        grad, state, group["max_precond_dim"], group["precondition_1d"], group.get("init_factor", 0.0)
+    )
 
 
 def _apply_soap_preconditioner(group, update, Q, GG, *references, use_kl: bool = False, eps=1e-8):
@@ -1132,6 +1134,17 @@ def scale_by_kl_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG):
     return precond
 
 
+@needs_full_param
+@zero_guard("exp_avg")
+@general_guard("Q", "GG", init_fn=_init_soap)
+@no_state
+def scale_by_kl_shampoo(group, update, grad, param, exp_avg, Q, GG):
+    utils.stochastic_lerp_(exp_avg, update, 1 - utils.get_beta1(group))  # presumably EMA of update into exp_avg
+    precond = [utils.kl_shampoo_precondition(e, q, gg, group["eps"]) for e, q, gg in zip(exp_avg, Q, GG)]
+    _apply_soap_preconditioner(group, update, Q, GG, use_kl=True, eps=group["eps"])  # after precond; no state refs
+    return precond
+
+
 @needs_full_param
 @zero_guard("exp_avg", "exp_avg_sq")
 @general_guard("mu_product", init_fn=_init_mu_product, skip_first=False)
diff --git a/heavyball/utils.py b/heavyball/utils.py
@@ -744,18 +744,16 @@ def get_orthogonal_matrix_QR(GG: List[Tensor], Q: List[Tensor], *exp_avg: Tensor
     :param GG: List of accumulated gradient outer products.
     :param Q: List of current eigenbases (updated in-place to Q_new).
     :param exp_avg: Exponential moving average in the old eigenspace (updated in-place if provided).
+        Pass nothing (or only `None` entries) to refresh Q without rotating any state.
     """
-    if not exp_avg:
+    if isinstance(Q, list) and not Q:
         return
 
-    ref = exp_avg[0]
-    if ref.dim() == 0:  # preconditioning doesn't make sense here
+    ref = exp_avg[0] if exp_avg else None
+    if ref is not None and ref.dim() == 0:  # preconditioning doesn't make sense here
         Q.clear()
         return
 
-    if isinstance(Q, list) and not Q:
-        return
-
     if ref is not None and ref.dim() != len(Q):
         raise ValueError(f"ref dim {ref.dim()} does not match Q length {len(Q)}")
 
@@ -778,7 +776,8 @@ def get_orthogonal_matrix_QR(GG: List[Tensor], Q: List[Tensor], *exp_avg: Tensor
 
     if ref is None:
         for q, q_new in zip(Q, new_qs):
-            copy_stochastic_(q, q_new)
+            if q is not None:
+                copy_stochastic_(q, q_new)
         return
 
     assert ref.ndim < 13, "ref.ndim must be less than 13"
@@ -1145,6 +1144,28 @@ def update_ggt_kl(grad, GG, Q, max_precond_dim, precondition_1d, beta, eps):
         stochastic_lerp_(m, outer, 1 - beta)
 
 
+@decorator_knowngood
+def _kl_shampoo_kron_scale(grad: Tensor, Q: List[Optional[Tensor]], GG: List[Optional[Tensor]], eps: float):
+    out = promote(grad)
+    for idx, (q, m) in enumerate(zip(Q, GG)):
+        if q is None or m is None:  # no factor for this dim: leave it unscaled
+            continue
+        q32, m32 = promote(q), promote(m)
+        d = ((q32.T @ m32) * q32.T).sum(dim=1).clamp_min(eps).rsqrt()  # diag(q.T @ m @ q), floored at eps, ** -0.5
+        shape = [1] * out.ndim
+        shape[idx] = -1  # broadcast d along dim idx only
+        out = out * d.view(shape)
+    return out.to(grad.dtype)  # cast back to the dtype of the incoming grad
+
+
+def kl_shampoo_precondition(grad, Q, GG, eps):
+    """KL-Shampoo Kronecker preconditioner (arXiv:2509.03378).
+
+    Applies ⊗_i Q[i] diag(d_i^{-1/2}) Q[i].T to grad, with d_i = max(diag(Q[i].T @ GG[i] @ Q[i]), eps).
+    """
+    return project(_kl_shampoo_kron_scale(project(grad, Q, back=False), Q, GG, eps), Q, back=True)
+
+
 def tree_apply(fn: Callable[[Any], Any]) -> Callable[[Any], Any]:
     def _fn(*args):
         return tree_map(fn, *args)
@@ -1241,22 +1262,29 @@ def update_preconditioner(grad, Q, GG, exp_avg, max_precond_dim, precondition_1d
         get_orthogonal_matrix_QR(GG, Q, *exp_avg)
 
 
-def init_preconditioner(grad, state, max_precond_dim, precondition_1d):
+def init_preconditioner(grad, state, max_precond_dim, precondition_1d, init_factor: float = 0.0):
     """
     Initializes the preconditioner matrices (L and R in the paper).
+
+    If init_factor > 0, each non-None GG entry starts as init_factor * I and the update_ggt seeding is
+    skipped (uniform-eigval seed used by KL-Shampoo to avoid the rank-1 explosion: 1/sqrt(eps) along null
+    directions). Otherwise GG starts at zero and update_ggt seeds it from grad (standard SOAP behavior).
     """
     state["GG"] = []  # Will hold all the preconditioner matrices (L and R in the paper).
     if grad.numel() > 1 and (grad.ndim > 1 or precondition_1d):
         for sh in grad.shape:
             if sh > max_precond_dim or sh == 1:
                 # via @francois-rozet: https://github.com/HomebrewML/HeavyBall/commit/8b86be04967e2d095136d5603724f488f2d46592#diff-a430393dd0a6ee393944a9ed16416115c175de2414cf4a96e647197697f265e9R621
                 state["GG"].append(None)
+            elif init_factor > 0:
+                state["GG"].append(torch.eye(sh, device=grad.device, dtype=grad.dtype) * init_factor)
             else:
                 state["GG"].append(torch.zeros(sh, sh, device=grad.device, dtype=grad.dtype))
     else:
         state["GG"].append(None)
 
-    update_ggt(grad, state["GG"], max_precond_dim, precondition_1d, 0)
+    if init_factor <= 0:  # with init_factor > 0 the identity seed is left untouched
+        update_ggt(grad, state["GG"], max_precond_dim, precondition_1d, 0)
     state["Q"] = get_orthogonal_matrix(state["GG"])
 
 
diff --git a/test/test_chainable_cpu.py b/test/test_chainable_cpu.py
@@ -84,6 +84,8 @@ def state_fn(_x):
 # Optimizers whose chains use shape-dependent or global-reduction ops must need gather
 _EXPECT_GATHER = {
     "SOAP",
+    "KLSOAP",
+    "KLShampoo",
     "SOAPNAdam",
     "SOAPAdEMAMix",
     "SOLP",