Fix tf.function retracing in TensorFlow benchmark (#27665)

Rishi-Dave · web-flow · commit 727256f8d7c6 · 2026-03-22T22:23:49.000Z
## Summary

- Move `tf.function`-decorated forward functions out of the inner benchmark loop to prevent unnecessary graph retracing on every `(batch_size, sequence_length)` iteration
- Update deprecated `experimental_compile` to `jit_compile` (available since TF 2.4)
- Hoist `import random` out of the inner loop

Fixes #14953

## Motivation

When `run_with_tf_optimizations` is used as a decorator inside the innermost `(batch_size, sequence_length)` loop, each iteration creates a new Python function object. Since `tf.function` keys its trace cache on function identity, a new object means a forced retrace every iteration — the cached graph is never reused. This defeats the purpose of `tf.function` and adds significant overhead from repeated graph construction and optimization passes.

The [TensorFlow documentation on tracing](https://www.tensorflow.org/guide/function#rules_of_tracing) explicitly warns against defining `tf.function`-decorated functions inside loops.

## Changes

**`onnxruntime/python/tools/transformers/benchmark.py`** (1 file, ~35 insertions / ~31 deletions):

1. **Hoisted forward function definitions** (`encoder_forward`, `encoder_decoder_forward`, `lxmert_forward`) from the inner `batch_size × sequence_length` loop to the per-model scope. They are now defined once per model, and the `@run_with_tf_optimizations` decorator (which applies `@tf.function`) is only invoked once per model.
2. **Changed forward functions to accept `input_ids` as a parameter** instead of closing over the loop variable. This lets `tf.function` trace based on the tensor's `(dtype, shape)` spec and reuse cached concrete functions when shapes repeat across iterations.
3. **Updated `experimental_compile=use_xla`** to **`jit_compile=use_xla`**. The `experimental_compile` parameter was deprecated in TF 2.4 (Dec 2020) and removed in TF 2.12.
4. **Moved `import random`** from the innermost loop body to the module-level imports at the top of the file — the module only needs to be imported once.
5. **Moved inference function selection** (`if config.is_encoder_decoder ... elif isinstance(config, LxmertConfig) ...`) outside the batch/sequence loops since it depends only on the model config, not on batch size or sequence length. The original priority order (`is_encoder_decoder` checked before `LxmertConfig`) is preserved.

## Test Plan

- [x] `lintrunner -a` passes cleanly (no RUFF or RUFF-FORMAT violations)
- [x] `python -m py_compile benchmark.py` — syntax verified
- [x] Change is purely structural — function behavior (inputs, outputs, control flow) is identical
- [ ] Manual verification with TensorFlow installed (TF is an optional dependency not present in the standard CI matrix; this code path is exercised via `python benchmark.py -e tensorflow`)
diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py
@@ -43,6 +43,7 @@
 import argparse
 import logging
 import os
+import random
 import timeit
 from datetime import datetime
 
@@ -431,7 +432,7 @@ def run_in_eager_mode(*args, **kwargs):
             return func(*args, **kwargs)
 
         @wraps(func)
-        @tf.function(experimental_compile=use_xla)
+        @tf.function(jit_compile=use_xla)
         def run_in_graph_mode(*args, **kwargs):
             return func(*args, **kwargs)
 
@@ -500,6 +501,36 @@ def run_tensorflow(
 
         max_input_size = tokenizer.model_max_length
 
+        # Define tf.function-decorated forward functions once per model, outside the
+        # batch_size/sequence_length loops. Passing input_ids as an argument (instead
+        # of closing over it) allows tf.function to cache traced graphs by input shape
+        # rather than retracing on every loop iteration. See issue #14953.
+        @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
+        def encoder_forward(input_ids):
+            return model(input_ids, training=False)  # noqa: B023
+
+        @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
+        def encoder_decoder_forward(input_ids):
+            return model(input_ids, decoder_input_ids=input_ids, training=False)  # noqa: B023
+
+        @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
+        def lxmert_forward(input_ids):
+            feats = tf.random.normal([1, 1, config.visual_feat_dim])  # noqa: B023
+            pos = tf.random.normal([1, 1, config.visual_pos_dim])  # noqa: B023
+            return model(  # noqa: B023
+                input_ids,
+                visual_feats=feats,
+                visual_pos=pos,
+                training=False,
+            )
+
+        if config.is_encoder_decoder:
+            inference = encoder_decoder_forward
+        elif isinstance(config, LxmertConfig):
+            inference = lxmert_forward
+        else:
+            inference = encoder_forward
+
         for batch_size in batch_sizes:
             if batch_size <= 0:
                 continue
@@ -510,42 +541,14 @@ def run_tensorflow(
 
                 logger.info(f"Run Tensorflow on {model_name} with input shape {[batch_size, sequence_length]}")
 
-                import random  # noqa: PLC0415
-
                 rng = random.Random()
                 values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)]
                 input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
 
                 try:
-                    # Disable both for better inference perf
-                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
-                    def encoder_forward():
-                        return model(input_ids, training=False)  # noqa: B023
-
-                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
-                    def encoder_decoder_forward():
-                        return model(input_ids, decoder_input_ids=input_ids, training=False)  # noqa: B023
-
-                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
-                    def lxmert_forward():
-                        feats = tf.random.normal([1, 1, config.visual_feat_dim])  # noqa: B023
-                        pos = tf.random.normal([1, 1, config.visual_pos_dim])  # noqa: B023
-                        return model(  # noqa: B023
-                            input_ids,  # noqa: B023
-                            visual_feats=feats,
-                            visual_pos=pos,
-                            training=False,
-                        )
-
-                    inference = encoder_forward
-                    if config.is_encoder_decoder:
-                        inference = encoder_decoder_forward
-                    elif isinstance(config, LxmertConfig):
-                        inference = lxmert_forward
-
-                    inference()
+                    inference(input_ids)
 
-                    runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1)  # noqa: B023
+                    runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1)  # noqa: B023
 
                     result = {
                         "engine": "tensorflow",