GesuBackups · pull · Apr 2, 2026 · Mar 13, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/README.md b/README.md
@@ -52,20 +52,20 @@ To install the current release, which includes support for
 Windows)*:
 
 ```
-$ pip install tensorflow
+pip install tensorflow
 ```
 
 Other devices (DirectX and MacOS-metal) are supported using
 [Device Plugins](https://www.tensorflow.org/install/gpu_plugins#available_devices).
 
-A smaller CPU-only package is also available:
+A smaller CPU-only TensorFlow package is also available:
 
 ```
-$ pip install tensorflow-cpu
+pip install tensorflow-cpu
 ```
 
-To update TensorFlow to the latest version, add `--upgrade` flag to the above
-commands.
+To update TensorFlow to the latest version, add the `--upgrade` flag to the
+commands above.
 
 *Nightly binaries are available for testing using the
 [tf-nightly](https://pypi.python.org/pypi/tf-nightly) and

diff --git a/third_party/xla/.github/workflows/benchmark_postsubmit.yml b/third_party/xla/.github/workflows/benchmark_postsubmit.yml
@@ -101,7 +101,7 @@ jobs:
           GITHUB_EVENT_PULL_REQUEST_HEAD_REF: ${{ github.event.pull_request.head.ref }}
           GITHUB_EVENT_PULL_REQUEST_BASE_REF: ${{ github.event.pull_request.base.ref }}
       # Find the current PR number, if any, because github context doesn't have it for push events.
-      - uses: jwalton/gh-find-current-pr@89ee5799558265a1e0e31fab792ebb4ee91c016b # ratchet:jwalton/gh-find-current-pr@v1.3.3
+      - uses: jwalton/gh-find-current-pr@f3d61b485d2801773f7a07b2aaa3306bd8f8e653 # ratchet:jwalton/gh-find-current-pr@v1.3.5
         id: find_pr
         with:
           # Can be "open", "closed", or "all".  Defaults to "open".

diff --git a/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml b/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml
@@ -117,7 +117,7 @@ jobs:
           GITHUB_EVENT_PULL_REQUEST_HEAD_REF: ${{ github.event.pull_request.head.ref }}
           GITHUB_EVENT_PULL_REQUEST_BASE_REF: ${{ github.event.pull_request.base.ref }}
       # Find the current PR number, if any, because github context doesn't have it for push events.
-      - uses: jwalton/gh-find-current-pr@89ee5799558265a1e0e31fab792ebb4ee91c016b # ratchet:jwalton/gh-find-current-pr@v1.3.3
+      - uses: jwalton/gh-find-current-pr@f3d61b485d2801773f7a07b2aaa3306bd8f8e653 # ratchet:jwalton/gh-find-current-pr@v1.3.5
         id: findPr
         with:
           # Can be "open", "closed", or "all".  Defaults to "open".

diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD b/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
@@ -48,6 +48,8 @@ cc_library(
         "//xla/codegen/emitters/ir:xla",
         "//xla/codegen/emitters/ir:xla_attrs_inc_gen",
         "//xla/hlo/analysis:indexing_analysis",
+        "//xla/hlo/analysis:interval",
+        "//xla/hlo/analysis:symbolic_map",
         "//xla/hlo/ir:hlo",
         "//xla/mlir/tools/mlir_replay/public:compiler_trace_proto_cc",
         "//xla/mlir_hlo",

diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
@@ -65,6 +64,9 @@ limitations under the License.
 #include "xla/codegen/emitters/type_util.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
+#include "xla/hlo/analysis/interval.h"
+#include "xla/hlo/analysis/symbolic_expr.h"
+#include "xla/hlo/analysis/symbolic_map.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
@@ -129,8 +131,6 @@ bool Needs64BitIndices(const HloComputation* computation) {
 
 }  // namespace
 
-using mlir::AffineExpr;
-
 IndexingMap GetDefaultIndexingMap(
     absl::Span<const int64_t> thread_tile_sizes,
     absl::Span<const int64_t> shape,
@@ -144,15 +144,16 @@ IndexingMap GetDefaultIndexingMap(
     thread_tile_counts.push_back(CeilDiv(dim_size, tile_size));
   }
   // Delinearize thread_expr w.r.t. number of thread tiles per dimension.
-  auto thread_expr = mlir::getAffineDimExpr(0, mlir_context);
-  SmallVector<AffineExpr, 4> thread_ids =
+  auto thread_expr = CreateDimExpr(0, mlir_context);
+  SmallVector<SymbolicExpr, 4> thread_ids =
       DelinearizeInBoundsIndex(thread_expr, thread_tile_counts);
-  SmallVector<AffineExpr, 4> result;
+  SmallVector<SymbolicExpr> result;
   result.reserve(thread_ids.size());
-  auto linear_index = mlir::getAffineSymbolExpr(0, mlir_context);
-  SmallVector<AffineExpr, 4> indices_in_tile =
+  auto linear_index =
+      CreateSymbolExpr(/*symbol_id=*/0, /*num_dims=*/1, mlir_context);
+  SmallVector<SymbolicExpr, 4> indices_in_tile =
       DelinearizeInBoundsIndex(linear_index, thread_tile_sizes);
-  SmallVector<std::pair<AffineExpr, Interval>, 4> constraints;
+  SmallVector<std::pair<SymbolicExpr, Interval>, 4> constraints;
   constraints.reserve(thread_ids.size());
   for (auto [tile_size, thread_id, index_in_tile, dim] :
        llvm::zip(thread_tile_sizes, thread_ids, indices_in_tile, shape)) {
@@ -162,10 +163,10 @@ IndexingMap GetDefaultIndexingMap(
   int64_t num_threads = Product(thread_tile_counts);
   int64_t num_tile_elements = Product(thread_tile_sizes);
 
-  auto affine_map = mlir::AffineMap::get(/*num_dims=*/1, /*num_symbols=*/1,
-                                         result, mlir_context);
+  auto symbolic_map = SymbolicMap::Get(mlir_context, /*num_dimensions=*/1,
+                                       /*num_symbols=*/1, result);
   return IndexingMap(
-      affine_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
+      symbolic_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
       {IndexingMap::Variable({0, num_tile_elements - 1, "linear_index"})}, {},
       constraints);
 }

diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -62,6 +61,9 @@ limitations under the License.
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
+#include "xla/hlo/analysis/interval.h"
+#include "xla/hlo/analysis/symbolic_expr.h"
+#include "xla/hlo/analysis/symbolic_map.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -210,13 +212,13 @@ CpuScatterFusion::CpuScatterFusion(const BufferAssignment& buffer_assignment,
 IndexingMap GetScatterIndexingMap(
     absl::Span<const int64_t> updates_operand_shape, int64_t num_threads,
     int64_t vector_size, mlir::MLIRContext* context) {
-  using mlir::AffineExpr;
-
   // Delinearize thread_expr w.r.t. number of thread tiles per dimension.
-  auto thread_expr = mlir::getAffineDimExpr(0, context);
-  auto index_id = mlir::getAffineSymbolExpr(0, context);
-  auto slice_linear_index = mlir::getAffineSymbolExpr(1, context);
-  auto vector_element_id = mlir::getAffineSymbolExpr(2, context);
+  auto thread_expr = CreateDimExpr(0, context);
+  auto index_id = CreateSymbolExpr(/*symbol_id=*/0, /*num_dims=*/1, context);
+  auto slice_linear_index =
+      CreateSymbolExpr(/*symbol_id=*/1, /*num_dims=*/1, context);
+  auto vector_element_id =
+      CreateSymbolExpr(/*symbol_id=*/2, /*num_dims=*/1, context);
 
   int64_t num_updates = updates_operand_shape.front();
   int64_t num_updates_per_thread = CeilOfRatio(num_updates, num_threads);
@@ -226,22 +228,23 @@ IndexingMap GetScatterIndexingMap(
   int64_t num_vectors_per_slice = CeilOfRatio(num_slice_elements, vector_size);
 
   // Loop w.r.t. indices.
-  AffineExpr updates_id_expr = thread_expr * num_updates_per_thread + index_id;
-  AffineExpr slice_linear_index_expr =
+  SymbolicExpr updates_id_expr =
+      thread_expr * num_updates_per_thread + index_id;
+  SymbolicExpr slice_linear_index_expr =
       slice_linear_index * vector_size + vector_element_id;
-  llvm::SmallVector<AffineExpr, 4> indices_in_tile =
+  SmallVector<SymbolicExpr, 4> indices_in_tile =
       DelinearizeInBoundsIndex(slice_linear_index_expr, slice_shape);
-  llvm::SmallVector<AffineExpr, 4> result{updates_id_expr};
+  llvm::SmallVector<SymbolicExpr> result{updates_id_expr};
   result.append(indices_in_tile.begin(), indices_in_tile.end());
 
-  SmallVector<std::pair<AffineExpr, Interval>, 4> constraints{
-      {updates_id_expr, {0, num_updates}},
+  SmallVector<std::pair<SymbolicExpr, Interval>, 4> constraints{
+      {updates_id_expr, {0, num_updates - 1}},  // Bounds are inclusive.
       {slice_linear_index_expr, {0, num_slice_elements - 1}}};
 
-  auto affine_map =
-      mlir::AffineMap::get(/*num_dims=*/1, /*num_symbols=*/3, result, context);
+  auto symbolic_map = SymbolicMap::Get(context, /*num_dimensions=*/1,
+                                       /*num_symbols=*/3, result);
   return IndexingMap(
-      affine_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
+      symbolic_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
       {IndexingMap::Variable({0, num_updates_per_thread - 1, "index_id"}),
        IndexingMap::Variable({0, num_vectors_per_slice - 1, "vector_id"}),
        IndexingMap::Variable({0, vector_size - 1, "vector_element_id"})},

diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -1052,12 +1052,12 @@ cc_library(
         "//xla/service/gpu:gpu_conv_runner",
         "//xla/service/gpu:stream_executor_util",
         "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:engine_options",
         "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
-        "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
@@ -1227,6 +1227,7 @@ xla_test(
         "//xla/stream_executor:device_description_proto_cc",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/stream_executor/rocm:rocm_platform_id",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
@@ -1260,6 +1261,7 @@ xla_test(
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/stream_executor/platform:platform_object_registry",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/strings",

diff --git a/third_party/xla/xla/backends/gpu/autotuner/autotuner_main.cc b/third_party/xla/xla/backends/gpu/autotuner/autotuner_main.cc
@@ -103,16 +103,16 @@ absl::Status Autotune(HloModule& module) {
   DebugOptions debug_options = GetDebugOptionsFromFlags();
   Compiler::GpuTargetConfig target_config(stream_executor);
 
+  std::unique_ptr<se::DeviceAddressAllocator> allocator =
+      std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
+          stream_executor);
+
   mlir::MLIRContext mlir_context;
   xla::RegisterSymbolicExprStorage(&mlir_context);
   TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<CodegenBackend>> backends,
                       gpu_compiler->GetAutotunerBackends(
-                          stream_executor, &target_config, alias_info.get(),
-                          debug_options, &mlir_context));
-
-  std::unique_ptr<se::DeviceAddressAllocator> allocator =
-      std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
-          stream_executor);
+                          stream_executor, allocator.get(), &target_config,
+                          alias_info.get(), debug_options, &mlir_context));
 
   tsl::thread::ThreadPool thread_pool(tsl::Env::Default(), "autotuner",
                                       tsl::port::MaxParallelism());

diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory.h b/third_party/xla/xla/backends/gpu/autotuner/factory.h
@@ -38,7 +38,8 @@ namespace gpu {
 // returned.
 struct GetCodegenBackends {
   using Type = std::function<std::vector<std::unique_ptr<CodegenBackend>>(
-      stream_executor::StreamExecutor*, const DebugOptions*, Compiler*,
+      stream_executor::StreamExecutor*,
+      stream_executor::DeviceAddressAllocator*, const DebugOptions*, Compiler*,
       const Compiler::GpuTargetConfig*, const AliasInfo* alias_info,
       mlir::MLIRContext* mlir_context,
       absl::Span<const autotuner::Backend> backend_allowlist)>;

diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc b/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
@@ -83,6 +83,7 @@ std::unique_ptr<HloPassPipeline> GetCustomKernelRewriterPipeline(
 
 std::vector<std::unique_ptr<CodegenBackend>> GetCodegenBackendsForCuda(
     stream_executor::StreamExecutor* stream_executor,
+    stream_executor::DeviceAddressAllocator* device_allocator,
     const DebugOptions* debug_options, Compiler* compiler,
     const Compiler::GpuTargetConfig* target_config, const AliasInfo* alias_info,
     MLIRContext* mlir_context,

diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc b/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
@@ -78,15 +78,17 @@ std::unique_ptr<HloPassPipeline> GetGemmRewriterPipeline(
 
 std::vector<std::unique_ptr<CodegenBackend>> GetCodegenBackendsForROCm(
     stream_executor::StreamExecutor* stream_executor,
+    stream_executor::DeviceAddressAllocator* device_allocator,
     const DebugOptions* debug_options, Compiler* compiler,
     const Compiler::GpuTargetConfig* target_config, const AliasInfo* alias_info,
     MLIRContext* mlir_context,
     absl::Span<const autotuner::Backend> backend_allowlist) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::make_unique<TritonBackend>(
       debug_options, compiler, target_config, alias_info, mlir_context));
-  backends.push_back(std::make_unique<MIOpenBackend>(
-      stream_executor, debug_options, compiler, target_config));
+  backends.push_back(
+      std::make_unique<MIOpenBackend>(stream_executor, debug_options, compiler,
+                                      target_config, device_allocator));
   backends.push_back(std::make_unique<RocblasBackend>(
       stream_executor, debug_options, compiler, target_config));
   backends.push_back(std::make_unique<HipblasLtBackend>(

diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory_test.cc b/third_party/xla/xla/backends/gpu/autotuner/factory_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/stream_executor/platform/platform_object_registry.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/stream_executor/stream_executor_memory_allocator.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/xla.pb.h"
 
@@ -56,6 +57,7 @@ class FactoryTest : public xla::HloHardwareIndependentTestBase,
   se::StreamExecutor* stream_executor_;
   Compiler::GpuTargetConfig target_config_;
   DebugOptions debug_options_;
+  se::StreamExecutorMemoryAllocator allocator_;
 
   FactoryTest()
       : platform_(se::PlatformManager::PlatformWithName(
@@ -64,7 +66,8 @@ class FactoryTest : public xla::HloHardwareIndependentTestBase,
                       .value()),
         compiler_(xla::Compiler::GetForPlatform(platform_->id()).value()),
         stream_executor_(platform_->ExecutorForDevice(0).value()),
-        target_config_(stream_executor_) {}
+        target_config_(stream_executor_),
+        allocator_(stream_executor_) {}
 };
 
 TEST_P(FactoryTest, GetCodegenBackends) {
@@ -82,9 +85,9 @@ TEST_P(FactoryTest, GetCodegenBackends) {
     AliasInfo alias_info;
     xla::RegisterSymbolicExprStorage(&mlir_context);
     std::vector<std::unique_ptr<CodegenBackend>> backends =
-        get_codegen_backends(stream_executor_, &debug_options_, compiler_.get(),
-                             &target_config_, &alias_info, &mlir_context,
-                             GetParam().names);
+        get_codegen_backends(stream_executor_, &allocator_, &debug_options_,
+                             compiler_.get(), &target_config_, &alias_info,
+                             &mlir_context, GetParam().names);
     EXPECT_EQ(backends.size(), GetParam().expected_num_backends);
   } else {
     GTEST_SKIP() << "Skipping test for platform " << platform_->id();