tenstorrent · dgolubovicTT · Jun 24, 2026 · nvukobratTT · Jul 1, 2026
@@ -44,6 +44,12 @@ struct CompileOptions {
   // Enables experimental KV cache dtype override.
   std::optional<std::string> experimental_kv_cache_dtype = std::nullopt;
 
+  // Enable activation dtype lowering around CCL ops (matmul -> reduce_scatter
+  // / all_gather -> consumer). Pattern-matches Llama-style sub-graphs
+  // (O-proj+residual, MLP). Default off; flip on
+  // per-model after validating model accuracy doesn't degrade.
+  bool enable_activation_dtype_lowering = false;
+
   // Override math fidelity for all ttnn operations exposing compute kernel
   // config. Valid values: "lofi", "hifi2", "hifi3", "hifi4", "ttnn_default".
   // "ttnn_default" - means that we don't override math_fidelity in compiler,

@@ -25,6 +25,10 @@ CompileOptions CompileOptions::parse(
           .value_or(options.experimental_weight_dtype);
   options.experimental_kv_cache_dtype = internal::parseStringOption(
       compile_options, "experimental-kv-cache-dtype");
+  options.enable_activation_dtype_lowering =
+      internal::parseBoolOption(compile_options,
+                                "enable_activation_dtype_lowering")
+          .value_or(options.enable_activation_dtype_lowering);
   options.math_fidelity =
       internal::parseStringOption(compile_options, "math_fidelity");
 

@@ -984,6 +984,8 @@ tt_pjrt_status ModuleBuilder::convertFromTTIRToTTNN(
     }
     options.experimentalKVCacheDtype = dtype.value();
   }
+  options.enableActivationDtypeLowering =
+      compile_options.enable_activation_dtype_lowering;
   options.enableTrace = compile_options.enable_trace;
   options.systemDescPath = system_descriptor_path.data();
   options.enableConstEval = compile_options.enable_const_eval;

@@ -267,6 +267,7 @@ def benchmark_llm_torch_xla(
     use_indexer_cache: bool = False,
     enable_create_d2m_subgraphs: bool = False,
     experts_implementation: Optional[str] = None,
+    enable_activation_dtype_lowering: bool = False,
 ):
     """
     Benchmark an LLM (Large Language Model) using PyTorch and torch-xla.
@@ -477,6 +478,8 @@ def benchmark_llm_torch_xla(
         options["experimental-kv-cache-dtype"] = experimental_kv_cache_dtype
     if enable_create_d2m_subgraphs:
         options["enable_create_d2m_subgraphs"] = enable_create_d2m_subgraphs
+    if enable_activation_dtype_lowering:
+        options["enable_activation_dtype_lowering"] = "true"
 
     torch_xla.set_custom_compile_options(options)
 

@@ -30,6 +30,7 @@
 DEFAULT_EXPERIMENTAL_KV_CACHE_DTYPE = "bfp_bf8"
 DEFAULT_EXPERIMENTAL_ENABLE_PERMUTE_MATMUL_FUSION = False
 DEFAULT_REQUIRED_PCC = 0.94
+DEFAULT_ENABLE_ACTIVATION_DTYPE_LOWERING = False
 
 
 def default_read_logits_fn(output):
@@ -69,6 +70,7 @@ def test_llm(
     use_indexer_cache: bool = False,
     enable_create_d2m_subgraphs: bool = False,
     experts_implementation: Optional[str] = None,
+    enable_activation_dtype_lowering: bool = DEFAULT_ENABLE_ACTIVATION_DTYPE_LOWERING,
 ):
     """Test LLM model with the given variant and optional configuration overrides.
 
@@ -170,6 +172,7 @@ def test_llm(
         use_indexer_cache=use_indexer_cache,
         enable_create_d2m_subgraphs=enable_create_d2m_subgraphs,
         experts_implementation=experts_implementation,
+        enable_activation_dtype_lowering=enable_activation_dtype_lowering,
     )
 
     if output_file:
@@ -1894,6 +1897,10 @@ def test_llama_3_1_70b_tp_galaxy(
         max_output_tokens=max_output_tokens,
         decode_only=decode_only,
         optimization_level=1,
+        # Lower activations to bfp8 around the MLP/O-proj CCL ops to cut the bytes
+        # the collectives move. Validated on the full 80-layer model: TOP1 mean
+        # 95.95% vs 95.80% baseline, TOP5 100% in both, so accuracy is preserved.
+        enable_activation_dtype_lowering=True,
     )
 
 

@@ -34,6 +34,12 @@ class CompilerConfig:
     # Enables experimental KV cache dtype override in MLIR optimizer passes.
     experimental_kv_cache_dtype: Optional[str] = None
 
+    # Enable activation dtype lowering around CCL ops (matmul -> reduce_scatter
+    # / all_gather -> consumer). Pattern-matches Llama-style sub-graphs
+    # (O-proj+residual, MLP). Default off; flip on
+    # per-model after validating model accuracy doesn't degrade.
+    enable_activation_dtype_lowering: bool = False
+
     # Override math fidelity for all ttnn operations exposing compute kernel
     # config. Valid values: "lofi", "hifi2", "hifi3", "hifi4", "ttnn_default".
     # "ttnn_default" - means that we don't override math_fidelity in compiler,
@@ -103,6 +109,9 @@ def to_jax_compiler_options(self) -> Dict[str, str]:
         if self.experimental_kv_cache_dtype is not None:
             options["experimental-kv-cache-dtype"] = self.experimental_kv_cache_dtype
 
+        if self.enable_activation_dtype_lowering:
+            options["enable_activation_dtype_lowering"] = "true"
+
         if self.math_fidelity is not None:
             options["math_fidelity"] = self.math_fidelity