Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
76f9f18
Fix grammar and formatting in README installation section
vamshikiran065-jpg Mar 13, 2026
432612d
Always pass TargetMachineOptions to FFI calls from CustomCallThunk.
khasanovaa Apr 2, 2026
01aa15c
[XLA:GPU] Add symmetric memory to collective memory cache to prevent …
PatriosTheGreat Apr 2, 2026
f19ff47
PR #39622: [ROCm] Use BFCAllocator for scratch allocations needed for…
draganmladjenovic Apr 2, 2026
b608e9b
PR #39843: Bump jwalton/gh-find-current-pr from 1.3.3 to 1.3.5
dependabot[bot] Apr 2, 2026
386b3e5
Improve parentheses in SymbolicExpr serialization
tensorflower-gardener Apr 2, 2026
a88ebff
PR #39951: [xla:gpu] Add missing scheduling for all-gather-start
ezhulenev Apr 2, 2026
e6f2c70
Roll-forward Migrate XLA GPU emitters and HWIR to use SymbolicMap.
tensorflower-gardener Apr 2, 2026
a209ca3
[XLA:GPU] Normalize two batch dimensions (which may appear as a resul…
mooskagh Apr 2, 2026
07be721
Merge pull request #112288 from vamshikiran065-jpg:master
tensorflower-gardener Apr 2, 2026
80671cb
Migrate XLA codegen attributes and operations to the new SymbolicMap
tensorflower-gardener Apr 2, 2026
88bb528
Make CpuTargetMachineOptions always available through GpuTopology
beckerhe Apr 2, 2026
dfbfe3e
[NFC] Annotate forwarding headers with `INLINER_FORWARD_TO`.
allanrenucci Apr 2, 2026
a1f7a81
PR #40252: [xla:gpu] Use Global HangWatchdog in se_gpu_pjrt_client
ezhulenev Apr 2, 2026
e2c9f36
Avoid crashing in HloVerifier for unexpected shapes in AllGather.
akuegel Apr 2, 2026
86788d3
[XLA:GPU] Introduce EmitterContext to group the params.
pifon2a Apr 2, 2026
e837795
[XLA:GPU] Accumulate TiledHloInstructions for the runtime variables d…
pifon2a Apr 2, 2026
69b74f6
Remove test timeout long from prng_test.
akuegel Apr 2, 2026
d11cc38
Remove unused `options` parameter from `CompileTargetBinary`.
beckerhe Apr 2, 2026
ab0c715
Migrate indexing analysis from MLIR Affine to SymbolicMap
tensorflower-gardener Apr 2, 2026
e73179c
PR #40232: Add missing kAllGatherStart/kAllGatherDone verification to…
kredd2506 Apr 2, 2026
6185942
Automated Code Change
tensorflower-gardener Apr 2, 2026
45f557d
PR #39373: [ROCm] Fix issue with unsupported types combinations for h…
zoranjovanovic-ns Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,20 @@ To install the current release, which includes support for
Windows)*:

```
$ pip install tensorflow
pip install tensorflow
```

Other devices (DirectX and macOS Metal) are supported using
[Device Plugins](https://www.tensorflow.org/install/gpu_plugins#available_devices).

A smaller CPU-only package is also available:
A smaller CPU-only TensorFlow package is also available:

```
$ pip install tensorflow-cpu
pip install tensorflow-cpu
```

To update TensorFlow to the latest version, add `--upgrade` flag to the above
commands.
To update TensorFlow to the latest version, add the `--upgrade` flag to the
commands above.

*Nightly binaries are available for testing using the
[tf-nightly](https://pypi.python.org/pypi/tf-nightly) and
Expand Down
2 changes: 1 addition & 1 deletion third_party/xla/.github/workflows/benchmark_postsubmit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ jobs:
GITHUB_EVENT_PULL_REQUEST_HEAD_REF: ${{ github.event.pull_request.head.ref }}
GITHUB_EVENT_PULL_REQUEST_BASE_REF: ${{ github.event.pull_request.base.ref }}
# Find the current PR number, if any, because the `github` context doesn't provide it for push events.
- uses: jwalton/gh-find-current-pr@89ee5799558265a1e0e31fab792ebb4ee91c016b # ratchet:jwalton/gh-find-current-pr@v1.3.3
- uses: jwalton/gh-find-current-pr@f3d61b485d2801773f7a07b2aaa3306bd8f8e653 # ratchet:jwalton/gh-find-current-pr@v1.3.5
id: find_pr
with:
# Can be "open", "closed", or "all". Defaults to "open".
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ jobs:
GITHUB_EVENT_PULL_REQUEST_HEAD_REF: ${{ github.event.pull_request.head.ref }}
GITHUB_EVENT_PULL_REQUEST_BASE_REF: ${{ github.event.pull_request.base.ref }}
# Find the current PR number, if any, because the `github` context doesn't provide it for push events.
- uses: jwalton/gh-find-current-pr@89ee5799558265a1e0e31fab792ebb4ee91c016b # ratchet:jwalton/gh-find-current-pr@v1.3.3
- uses: jwalton/gh-find-current-pr@f3d61b485d2801773f7a07b2aaa3306bd8f8e653 # ratchet:jwalton/gh-find-current-pr@v1.3.5
id: findPr
with:
# Can be "open", "closed", or "all". Defaults to "open".
Expand Down
2 changes: 2 additions & 0 deletions third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ cc_library(
"//xla/codegen/emitters/ir:xla",
"//xla/codegen/emitters/ir:xla_attrs_inc_gen",
"//xla/hlo/analysis:indexing_analysis",
"//xla/hlo/analysis:interval",
"//xla/hlo/analysis:symbolic_map",
"//xla/hlo/ir:hlo",
"//xla/mlir/tools/mlir_replay/public:compiler_trace_proto_cc",
"//xla/mlir_hlo",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ limitations under the License.
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
Expand All @@ -65,6 +64,9 @@ limitations under the License.
#include "xla/codegen/emitters/type_util.h"
#include "xla/hlo/analysis/indexing_analysis.h"
#include "xla/hlo/analysis/indexing_map.h"
#include "xla/hlo/analysis/interval.h"
#include "xla/hlo/analysis/symbolic_expr.h"
#include "xla/hlo/analysis/symbolic_map.h"
#include "xla/hlo/ir/hlo_instruction.h"
#include "xla/hlo/ir/hlo_instructions.h"
#include "xla/hlo/ir/hlo_opcode.h"
Expand Down Expand Up @@ -129,8 +131,6 @@ bool Needs64BitIndices(const HloComputation* computation) {

} // namespace

using mlir::AffineExpr;

IndexingMap GetDefaultIndexingMap(
absl::Span<const int64_t> thread_tile_sizes,
absl::Span<const int64_t> shape,
Expand All @@ -144,15 +144,16 @@ IndexingMap GetDefaultIndexingMap(
thread_tile_counts.push_back(CeilDiv(dim_size, tile_size));
}
// Delinearize thread_expr w.r.t. number of thread tiles per dimension.
auto thread_expr = mlir::getAffineDimExpr(0, mlir_context);
SmallVector<AffineExpr, 4> thread_ids =
auto thread_expr = CreateDimExpr(0, mlir_context);
SmallVector<SymbolicExpr, 4> thread_ids =
DelinearizeInBoundsIndex(thread_expr, thread_tile_counts);
SmallVector<AffineExpr, 4> result;
SmallVector<SymbolicExpr> result;
result.reserve(thread_ids.size());
auto linear_index = mlir::getAffineSymbolExpr(0, mlir_context);
SmallVector<AffineExpr, 4> indices_in_tile =
auto linear_index =
CreateSymbolExpr(/*symbol_id=*/0, /*num_dims=*/1, mlir_context);
SmallVector<SymbolicExpr, 4> indices_in_tile =
DelinearizeInBoundsIndex(linear_index, thread_tile_sizes);
SmallVector<std::pair<AffineExpr, Interval>, 4> constraints;
SmallVector<std::pair<SymbolicExpr, Interval>, 4> constraints;
constraints.reserve(thread_ids.size());
for (auto [tile_size, thread_id, index_in_tile, dim] :
llvm::zip(thread_tile_sizes, thread_ids, indices_in_tile, shape)) {
Expand All @@ -162,10 +163,10 @@ IndexingMap GetDefaultIndexingMap(
int64_t num_threads = Product(thread_tile_counts);
int64_t num_tile_elements = Product(thread_tile_sizes);

auto affine_map = mlir::AffineMap::get(/*num_dims=*/1, /*num_symbols=*/1,
result, mlir_context);
auto symbolic_map = SymbolicMap::Get(mlir_context, /*num_dimensions=*/1,
/*num_symbols=*/1, result);
return IndexingMap(
affine_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
symbolic_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
{IndexingMap::Variable({0, num_tile_elements - 1, "linear_index"})}, {},
constraints);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ limitations under the License.
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
Expand All @@ -62,6 +61,9 @@ limitations under the License.
#include "xla/codegen/mlir_kernel_source.h"
#include "xla/hlo/analysis/indexing_analysis.h"
#include "xla/hlo/analysis/indexing_map.h"
#include "xla/hlo/analysis/interval.h"
#include "xla/hlo/analysis/symbolic_expr.h"
#include "xla/hlo/analysis/symbolic_map.h"
#include "xla/hlo/ir/hlo_casting_utils.h"
#include "xla/hlo/ir/hlo_computation.h"
#include "xla/hlo/ir/hlo_instruction.h"
Expand Down Expand Up @@ -210,13 +212,13 @@ CpuScatterFusion::CpuScatterFusion(const BufferAssignment& buffer_assignment,
IndexingMap GetScatterIndexingMap(
absl::Span<const int64_t> updates_operand_shape, int64_t num_threads,
int64_t vector_size, mlir::MLIRContext* context) {
using mlir::AffineExpr;

// Delinearize thread_expr w.r.t. number of thread tiles per dimension.
auto thread_expr = mlir::getAffineDimExpr(0, context);
auto index_id = mlir::getAffineSymbolExpr(0, context);
auto slice_linear_index = mlir::getAffineSymbolExpr(1, context);
auto vector_element_id = mlir::getAffineSymbolExpr(2, context);
auto thread_expr = CreateDimExpr(0, context);
auto index_id = CreateSymbolExpr(/*symbol_id=*/0, /*num_dims=*/1, context);
auto slice_linear_index =
CreateSymbolExpr(/*symbol_id=*/1, /*num_dims=*/1, context);
auto vector_element_id =
CreateSymbolExpr(/*symbol_id=*/2, /*num_dims=*/1, context);

int64_t num_updates = updates_operand_shape.front();
int64_t num_updates_per_thread = CeilOfRatio(num_updates, num_threads);
Expand All @@ -226,22 +228,23 @@ IndexingMap GetScatterIndexingMap(
int64_t num_vectors_per_slice = CeilOfRatio(num_slice_elements, vector_size);

// Loop w.r.t. indices.
AffineExpr updates_id_expr = thread_expr * num_updates_per_thread + index_id;
AffineExpr slice_linear_index_expr =
SymbolicExpr updates_id_expr =
thread_expr * num_updates_per_thread + index_id;
SymbolicExpr slice_linear_index_expr =
slice_linear_index * vector_size + vector_element_id;
llvm::SmallVector<AffineExpr, 4> indices_in_tile =
SmallVector<SymbolicExpr, 4> indices_in_tile =
DelinearizeInBoundsIndex(slice_linear_index_expr, slice_shape);
llvm::SmallVector<AffineExpr, 4> result{updates_id_expr};
llvm::SmallVector<SymbolicExpr> result{updates_id_expr};
result.append(indices_in_tile.begin(), indices_in_tile.end());

SmallVector<std::pair<AffineExpr, Interval>, 4> constraints{
SmallVector<std::pair<SymbolicExpr, Interval>, 4> constraints{
{updates_id_expr, {0, num_updates}},
{slice_linear_index_expr, {0, num_slice_elements - 1}}};

auto affine_map =
mlir::AffineMap::get(/*num_dims=*/1, /*num_symbols=*/3, result, context);
auto symbolic_map = SymbolicMap::Get(context, /*num_dimensions=*/1,
/*num_symbols=*/3, result);
return IndexingMap(
affine_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
symbolic_map, {IndexingMap::Variable({0, num_threads - 1, "thread_id"})},
{IndexingMap::Variable({0, num_updates_per_thread - 1, "index_id"}),
IndexingMap::Variable({0, num_vectors_per_slice - 1, "vector_id"}),
IndexingMap::Variable({0, vector_size - 1, "vector_element_id"})},
Expand Down
4 changes: 3 additions & 1 deletion third_party/xla/xla/backends/gpu/autotuner/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -1052,12 +1052,12 @@ cc_library(
"//xla/service/gpu:gpu_conv_runner",
"//xla/service/gpu:stream_executor_util",
"//xla/stream_executor:device_address",
"//xla/stream_executor:device_address_allocator",
"//xla/stream_executor:dnn",
"//xla/stream_executor:engine_options",
"//xla/stream_executor:scratch_allocator",
"//xla/stream_executor:stream",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/tsl/platform:errors",
"//xla/tsl/platform:status_macros",
"//xla/tsl/platform:statusor",
Expand Down Expand Up @@ -1227,6 +1227,7 @@ xla_test(
"//xla/stream_executor:device_description_proto_cc",
"//xla/stream_executor:platform",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/stream_executor/rocm:rocm_platform_id",
"//xla/tsl/lib/core:status_test_util",
"//xla/tsl/platform:statusor",
Expand Down Expand Up @@ -1260,6 +1261,7 @@ xla_test(
"//xla/stream_executor:platform",
"//xla/stream_executor:platform_manager",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/stream_executor/platform:platform_object_registry",
"//xla/tsl/platform:statusor",
"@com_google_absl//absl/strings",
Expand Down
12 changes: 6 additions & 6 deletions third_party/xla/xla/backends/gpu/autotuner/autotuner_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,16 @@ absl::Status Autotune(HloModule& module) {
DebugOptions debug_options = GetDebugOptionsFromFlags();
Compiler::GpuTargetConfig target_config(stream_executor);

std::unique_ptr<se::DeviceAddressAllocator> allocator =
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
stream_executor);

mlir::MLIRContext mlir_context;
xla::RegisterSymbolicExprStorage(&mlir_context);
TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<CodegenBackend>> backends,
gpu_compiler->GetAutotunerBackends(
stream_executor, &target_config, alias_info.get(),
debug_options, &mlir_context));

std::unique_ptr<se::DeviceAddressAllocator> allocator =
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
stream_executor);
stream_executor, allocator.get(), &target_config,
alias_info.get(), debug_options, &mlir_context));

tsl::thread::ThreadPool thread_pool(tsl::Env::Default(), "autotuner",
tsl::port::MaxParallelism());
Expand Down
3 changes: 2 additions & 1 deletion third_party/xla/xla/backends/gpu/autotuner/factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ namespace gpu {
// returned.
struct GetCodegenBackends {
using Type = std::function<std::vector<std::unique_ptr<CodegenBackend>>(
stream_executor::StreamExecutor*, const DebugOptions*, Compiler*,
stream_executor::StreamExecutor*,
stream_executor::DeviceAddressAllocator*, const DebugOptions*, Compiler*,
const Compiler::GpuTargetConfig*, const AliasInfo* alias_info,
mlir::MLIRContext* mlir_context,
absl::Span<const autotuner::Backend> backend_allowlist)>;
Expand Down
1 change: 1 addition & 0 deletions third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ std::unique_ptr<HloPassPipeline> GetCustomKernelRewriterPipeline(

std::vector<std::unique_ptr<CodegenBackend>> GetCodegenBackendsForCuda(
stream_executor::StreamExecutor* stream_executor,
stream_executor::DeviceAddressAllocator* device_allocator,
const DebugOptions* debug_options, Compiler* compiler,
const Compiler::GpuTargetConfig* target_config, const AliasInfo* alias_info,
MLIRContext* mlir_context,
Expand Down
6 changes: 4 additions & 2 deletions third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,17 @@ std::unique_ptr<HloPassPipeline> GetGemmRewriterPipeline(

std::vector<std::unique_ptr<CodegenBackend>> GetCodegenBackendsForROCm(
stream_executor::StreamExecutor* stream_executor,
stream_executor::DeviceAddressAllocator* device_allocator,
const DebugOptions* debug_options, Compiler* compiler,
const Compiler::GpuTargetConfig* target_config, const AliasInfo* alias_info,
MLIRContext* mlir_context,
absl::Span<const autotuner::Backend> backend_allowlist) {
std::vector<std::unique_ptr<CodegenBackend>> backends;
backends.push_back(std::make_unique<TritonBackend>(
debug_options, compiler, target_config, alias_info, mlir_context));
backends.push_back(std::make_unique<MIOpenBackend>(
stream_executor, debug_options, compiler, target_config));
backends.push_back(
std::make_unique<MIOpenBackend>(stream_executor, debug_options, compiler,
target_config, device_allocator));
backends.push_back(std::make_unique<RocblasBackend>(
stream_executor, debug_options, compiler, target_config));
backends.push_back(std::make_unique<HipblasLtBackend>(
Expand Down
11 changes: 7 additions & 4 deletions third_party/xla/xla/backends/gpu/autotuner/factory_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ limitations under the License.
#include "xla/stream_executor/platform/platform_object_registry.h"
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_memory_allocator.h"
#include "xla/tsl/platform/statusor.h"
#include "xla/xla.pb.h"

Expand All @@ -56,6 +57,7 @@ class FactoryTest : public xla::HloHardwareIndependentTestBase,
se::StreamExecutor* stream_executor_;
Compiler::GpuTargetConfig target_config_;
DebugOptions debug_options_;
se::StreamExecutorMemoryAllocator allocator_;

FactoryTest()
: platform_(se::PlatformManager::PlatformWithName(
Expand All @@ -64,7 +66,8 @@ class FactoryTest : public xla::HloHardwareIndependentTestBase,
.value()),
compiler_(xla::Compiler::GetForPlatform(platform_->id()).value()),
stream_executor_(platform_->ExecutorForDevice(0).value()),
target_config_(stream_executor_) {}
target_config_(stream_executor_),
allocator_(stream_executor_) {}
};

TEST_P(FactoryTest, GetCodegenBackends) {
Expand All @@ -82,9 +85,9 @@ TEST_P(FactoryTest, GetCodegenBackends) {
AliasInfo alias_info;
xla::RegisterSymbolicExprStorage(&mlir_context);
std::vector<std::unique_ptr<CodegenBackend>> backends =
get_codegen_backends(stream_executor_, &debug_options_, compiler_.get(),
&target_config_, &alias_info, &mlir_context,
GetParam().names);
get_codegen_backends(stream_executor_, &allocator_, &debug_options_,
compiler_.get(), &target_config_, &alias_info,
&mlir_context, GetParam().names);
EXPECT_EQ(backends.size(), GetParam().expected_num_backends);
} else {
GTEST_SKIP() << "Skipping test for platform " << platform_->id();
Expand Down
Loading
Loading