Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
a0a9fef
Allow dots with multiple batch dimensions through GemmFusion - the ne…
vwbaker Mar 31, 2026
504ba72
Automated Code Change
tensorflower-gardener Mar 31, 2026
e594b2b
[XLA Presubmit] Add SM 12.0 to the list of known SM architecture
beckerhe Mar 31, 2026
85b0e68
PR #39693: Fix tracked_gpu_device_buffer_test in OSS by adding cuda_p…
and-ivanov Mar 31, 2026
22f615f
PR #39393: [xla:gpu] Add VA remapping for command buffer thunks
shawnwang18 Mar 31, 2026
3b29068
PR #39695: Fix xla aot compile gpu test
and-ivanov Mar 31, 2026
00af54d
Rollback of Migrate XLA GPU emitters and HWIR to use SymbolicMap.
tensorflower-gardener Mar 31, 2026
78346d7
[xla:cpu] Make small_while_loop_hoisting_pass_test use the same byte …
penpornk Mar 31, 2026
a31205c
PR #40141: [xla:gpu] Fix a bug when emitting degenerate collective pe…
ezhulenev Mar 31, 2026
b803e00
PR #40108: Update protobuf dependency version to 32.1
ezhulenev Mar 31, 2026
fb29b4d
PR #39871: [ROCm] Fix bf16 upcast handling for libdevice calls.
zoranjovanovic-ns Mar 31, 2026
6e8c95e
PR #39309: [ROCm] Add scope_range_id support to ROCm profiler
magaonka-amd Mar 31, 2026
5d6dc92
PR #39991: Add test coverage for BatchNormExpander untested code paths
kredd2506 Mar 31, 2026
bab67d9
PR #40111: Bump pygments from 2.18.0 to 2.20.0 in /xla/backends/cpu/b…
dependabot[bot] Mar 31, 2026
e41f77f
PR #40118: Fix tensor memory size check to use tcgen05 capability ins…
and-ivanov Mar 31, 2026
550c1ec
[XLA:GPU] Move triton emitter c++ tests closer to the lit tests.
pifon2a Mar 31, 2026
75ac789
[XLA:GPU] Add slice and iota emission via the new tiling.
pifon2a Mar 31, 2026
92ed57c
[XLA:GPU] Simplify xla_aot_compile_gpu_test by using HloPjRtTestBase.
akuegel Mar 31, 2026
2bb2829
Fully serialize stream_executor::DeviceDescription to/from proto.
beckerhe Mar 31, 2026
3176ece
[xla:cpu] Roll-back small_while_loop_hoisting_pass threshold refactor.
penpornk Mar 31, 2026
0dc3515
Remove gen_gpu_hlo_compile_tests bazel rule.
akuegel Mar 31, 2026
a516a5e
PR #39744: [ROCm] Porting CUB sort FFI handler consolidation to ROCm.
tsrw2048 Mar 31, 2026
aacef90
Automated Code Change
tensorflower-gardener Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion third_party/xla/MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ bazel_dep(name = "gutil", version = "20250502.0", repo_name = "com_google_gutil"
bazel_dep(name = "jsoncpp", version = "1.9.6", repo_name = "jsoncpp_git")
bazel_dep(name = "or-tools", version = "9.12", repo_name = "com_google_ortools")
bazel_dep(name = "platforms", version = "1.0.0")
bazel_dep(name = "protobuf", version = "31.1", repo_name = "com_google_protobuf")
bazel_dep(name = "protobuf", version = "32.1", repo_name = "com_google_protobuf")
bazel_dep(name = "pybind11_abseil", version = "202402.0")
bazel_dep(name = "pybind11_bazel", version = "2.13.6")
bazel_dep(name = "pybind11_protobuf", version = "0.0.0-20250210-f02a2b7")
Expand Down
2 changes: 1 addition & 1 deletion third_party/xla/build_tools/ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def commands(self) -> List[List[str]]:
return cmds


_CUDA_COMPUTE_CAPABILITIES = (60, 70, 80, 90, 100, 103)
_CUDA_COMPUTE_CAPABILITIES = (60, 70, 80, 90, 100, 103, 120)


def _tag_filters_only_for_compute_capability(
Expand Down
32 changes: 16 additions & 16 deletions third_party/xla/build_tools/ci/golden_commands.txt

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ opt_einsum==3.4.0
optax==0.2.4
orbax-checkpoint==0.10.2
protobuf==6.33.5
Pygments==2.18.0
Pygments==2.20.0
PyYAML==6.0.2
rich==13.9.4
scipy==1.14.1
Expand Down
1 change: 0 additions & 1 deletion third_party/xla/xla/backends/gpu/codegen/emitters/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,6 @@ cc_library(
"//xla/codegen/emitters:utils",
"//xla/codegen/emitters/ir:xla",
"//xla/hlo/analysis:indexing_analysis",
"//xla/hlo/analysis:symbolic_map",
"//xla/hlo/ir:hlo",
"//xla/service:scatter_simplifier",
"//xla/service/gpu:gpu_fusible",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ SmallVector<Value> ReductionFusion::EmitterState::WriteToSharedMemory(
const HloValueMap& values, std::optional<int> padding) {
SmallVector<int64_t> shape;
auto map = owner.GetSharedMemoryWriteMap(mlir_context);
for (auto result : map.GetSymbolicMap().GetResults()) {
for (auto result : map.GetAffineMap().getResults()) {
shape.push_back(
map.GetRangeEvaluator().ComputeExpressionRange(result).upper + 1);
}
Expand Down Expand Up @@ -567,9 +567,9 @@ std::optional<IndexingMap> ReductionFusion::ComputeThreadIdToOutputIndexing(
auto projected_indexing = ComputeReductionOutputIndexing(mlir_context);
auto output_shape = reduction_dimensions_.GetOutputShape();
CHECK_EQ(output_shape.size(),
projected_indexing.GetSymbolicMap().GetNumResults());
projected_indexing.GetAffineMap().getNumResults());
for (auto [result, dim_size] : llvm::zip(
projected_indexing.GetSymbolicMap().GetResults(), output_shape)) {
projected_indexing.GetAffineMap().getResults(), output_shape)) {
projected_indexing.AddConstraint(result, {0, dim_size - 1});
}
AddGroupIdConstraint(projected_indexing, root_index, groups_);
Expand Down Expand Up @@ -796,7 +796,7 @@ IndexingMap SmallColumnReductionFusion::ComputeReductionInputIndexing(
mlir_context);

for (auto [result, dim_size] :
llvm::zip(map.GetSymbolicMap().GetResults(), input_shape_)) {
llvm::zip(map.GetAffineMap().getResults(), input_shape_)) {
map.AddConstraint(result, {0, dim_size - 1});
}
return map;
Expand Down
19 changes: 9 additions & 10 deletions third_party/xla/xla/backends/gpu/codegen/emitters/scatter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ limitations under the License.
#include "xla/codegen/emitters/utils.h"
#include "xla/hlo/analysis/indexing_analysis.h"
#include "xla/hlo/analysis/indexing_map.h"
#include "xla/hlo/analysis/symbolic_map.h"
#include "xla/hlo/ir/hlo_casting_utils.h"
#include "xla/hlo/ir/hlo_instruction.h"
#include "xla/hlo/ir/hlo_instructions.h"
Expand Down Expand Up @@ -103,7 +102,6 @@ using mlir::func::FuncOp;
using mlir::func::ReturnOp;
using primitive_util::IsUnsignedIntegralType;

constexpr int64_t kGpuGridDims = 6;
constexpr int64_t kNumWarpsPerBlock = 4;
constexpr int64_t kMaxVectorizedBits = 128;
constexpr int64_t kScatterOperandIndex = 0;
Expand Down Expand Up @@ -431,9 +429,10 @@ void ScatterWithDistributedUpdates::ComputeIndexing(
if (indices_map) {
// Create a map from scatter update to scatter indices.
*indices_map = IndexingMap{
SymbolicMap::Get(mlir_context, kGpuGridDims, /*num_symbols=*/1,
{scatter_update_map.GetSymbolicMap().GetResult(0),
CreateSymbolExpr(0, kGpuGridDims, mlir_context)}),
AffineMap::get(6, 1,
{scatter_update_map.GetAffineMap().getResult(0),
getAffineSymbolExpr(0, mlir_context)},
mlir_context),
DimVarsFromGPUGrid({num_warps_ * warp_size_, 1, 1, num_blocks_, 1, 1}),
RangeVarsFromTensorSizes({description_.index_vector_length}),
/*rt_vars=*/{}};
Expand Down Expand Up @@ -480,8 +479,8 @@ void EmitNaiveImplementation(ImplicitLocOpBuilder& b,
.scatter_dims_to_operand_dims();
MLIRContext* mlir_context = b.getContext();
auto thread_id_to_update_id_map = IndexingMap(
SymbolicMap::Get(mlir_context, kGpuGridDims, /*num_symbols=*/0,
{updates_map.GetSymbolicMap().GetResult(0)}),
AffineMap::get(6, 0, {updates_map.GetAffineMap().getResult(0)},
mlir_context),
updates_map.GetDimVars(),
/*range_vars = */ {}, /*rt vars = */ {});
Value thread_id_to_index_id_value =
Expand Down Expand Up @@ -676,13 +675,13 @@ absl::Status ScatterWithDistributedIndices::EmitEntryFunctionImpl(
MLIRContext* mlir_context = b.getContext();

auto thread_id_to_update_id_map = IndexingMap(
SymbolicMap::Get(mlir_context, kGpuGridDims, /*num_symbols=*/2,
{indices_map.GetSymbolicMap().GetResult(0)}),
AffineMap::get(6, 2, {indices_map.GetAffineMap().getResult(0)},
mlir_context),
indices_map.GetDimVars(),
/*range_vars = */
{indices_map.GetRangeVars().begin(),
indices_map.GetRangeVars().begin() + 2},
/*rt vars = */ {}, indices_map.GetSymbolicConstraints());
/*rt vars = */ {}, indices_map.GetConstraints());

// Convert index_id_loop and index_vector_id to dimension variables.
IndexingMap slice_indexing =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ IndexingMap TransposeFusion::GetIndexing(bool input, const xla::Shape& shape,
input ? ShapeUtil::MakeShape(shape.element_type(), input_shape_)
: ShapeUtil::MakeShape(shape.element_type(), transpose_.dimensions);
for (auto [size, dim] : llvm::zip(normalized_shape.dimensions(),
result.GetSymbolicMap().GetResults())) {
result.GetAffineMap().getResults())) {
result.AddConstraint(dim, {0, size - 1});
}
result = ComposeIndexingMaps(
Expand Down
203 changes: 0 additions & 203 deletions third_party/xla/xla/backends/gpu/codegen/triton/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -296,36 +296,6 @@ cc_library(
],
)

xla_cc_test(
name = "fusion_emitter_deviceless_test",
srcs = ["fusion_emitter_deviceless_test.cc"],
tags = ["no_oss"], # Doesn't pass in OSS when building with the `fusion_emitter_stub`.
deps = [
":xtile_compiler",
"//xla:xla_proto_cc",
"//xla/hlo/analysis:symbolic_map",
"//xla/hlo/ir:hlo",
"//xla/hlo/testlib:filecheck",
"//xla/hlo/testlib:hlo_hardware_independent_test_base",
"//xla/hlo/testlib:verified_hlo_module",
"//xla/service/gpu:backend_configs_cc",
"//xla/service/gpu:gpu_device_info_for_tests",
"//xla/service/gpu:target_constants",
"//xla/service/gpu/model:block_level_parameters",
"//xla/stream_executor:device_description",
"//xla/stream_executor/cuda:cuda_compute_capability",
"//xla/tests:xla_internal_test_main",
"//xla/tsl/platform:statusor",
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:status_matchers",
"@com_google_absl//absl/strings:string_view",
"@com_google_googletest//:gtest",
"@llvm-project//llvm:TargetParser",
"@llvm-project//llvm:ir_headers",
"@llvm-project//mlir:IR",
],
)

xla_test(
name = "triton_gemm_fusion_test",
srcs = ["triton_gemm_fusion_test.cc"],
Expand Down Expand Up @@ -379,35 +349,6 @@ xla_test(
],
)

xla_test(
name = "fusion_emitter_int4_device_test",
srcs = ["fusion_emitter_int4_device_test.cc"],
backends = [
"a100",
"h100",
"b200",
"amdgpu_any",
],
shard_count = 10,
tags = ["no_mac"],
deps = [
"//xla:autotuning_proto_cc",
"//xla:error_spec",
"//xla:xla_proto_cc",
"//xla/backends/gpu/tests:hlo_pjrt_gpu_test_base",
"//xla/hlo/ir:hlo",
"//xla/hlo/testlib:filecheck",
"//xla/service/gpu:backend_configs_cc",
"//xla/stream_executor:device_description",
"//xla/tests:hlo_pjrt_interpreter_reference_mixin",
"//xla/tests:xla_internal_test_main", # fixdeps: keep
"//xla/tsl/platform:statusor",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest",
"@tsl//tsl/platform:path",
],
)

xla_test(
name = "dot_algorithms_test",
srcs = ["dot_algorithms_test.cc"],
Expand Down Expand Up @@ -472,67 +413,6 @@ xla_test(
],
)

xla_test(
name = "fusion_emitter_device_test",
srcs = ["fusion_emitter_device_test.cc"],
backends = [
"a100",
"h100",
"b200",
"amdgpu_any",
],
shard_count = 10,
tags = ["no_mac"],
deps = [
":support",
":test_utils",
":xtile_compiler",
":xtile_test_base",
"//xla:autotuning_proto_cc",
"//xla:error_spec",
"//xla:literal",
"//xla:literal_util",
"//xla:shape_util",
"//xla:types",
"//xla:util",
"//xla:xla_data_proto_cc",
"//xla:xla_proto_cc",
"//xla/backends/gpu/tests:gpu_pjrt_codegen_test",
"//xla/hlo/analysis:symbolic_map",
"//xla/hlo/ir:hlo",
"//xla/hlo/testlib:filecheck",
"//xla/hlo/testlib:hlo_hardware_independent_test_base",
"//xla/hlo/testlib:verified_hlo_module",
"//xla/service:algorithm_util",
"//xla/service/gpu:backend_configs_cc",
"//xla/service/gpu:gpu_device_info_for_tests",
"//xla/service/gpu:target_constants",
"//xla/service/gpu/model:block_level_parameters",
"//xla/stream_executor:device_description",
"//xla/stream_executor/cuda:cuda_compute_capability",
"//xla/stream_executor/rocm:rocm_compute_capability",
"//xla/tests:hlo_pjrt_interpreter_reference_mixin",
"//xla/tests:test_utils",
"//xla/tests:xla_internal_test_main", # fixdeps: keep
"//xla/tsl/lib/core:status_test_util",
"//xla/tsl/platform:env",
"//xla/tsl/platform:errors",
"//xla/tsl/platform:statusor",
"//xla/tsl/platform:test",
"@com_google_absl//absl/log",
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:status_matchers",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest",
"@eigen_archive//:eigen3",
"@llvm-project//llvm:TargetParser",
"@llvm-project//llvm:ir_headers",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@tsl//tsl/platform:path",
],
)

cc_library(
name = "test_utils",
testonly = True,
Expand Down Expand Up @@ -632,89 +512,6 @@ cc_library(
],
)

xla_test(
name = "fusion_emitter_large_test",
srcs = ["fusion_emitter_large_test.cc"],
backend_tags = {
"h100": [
# Needs full H100, otherwise we run OOM.
"full",
],
},
backends = [
"a100",
"h100",
"b200",
"amdgpu_any",
],
shard_count = 3,
tags = [
"no_mac",
"no_oss", # requires-mem:16g tag doesn't work in open source
"nozapfhahn", # Times out under coverage
] + if_google([
"requires-mem:16g",
]),
deps = [
"//xla:error_spec",
"//xla:xla_proto_cc",
"//xla/tests:hlo_pjrt_interpreter_reference_mixin",
"//xla/tests:hlo_pjrt_test_base",
"//xla/tests:xla_internal_test_main", # fixdeps: keep
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings:string_view",
"@com_google_googletest//:gtest",
],
)

xla_test(
name = "fusion_emitter_parametrized_test",
srcs = ["fusion_emitter_parametrized_test.cc"],
backends = [
"a100",
"h100",
"b200",
"amdgpu_any",
],
tags = ["no_mac"],
use_legacy_runtime = True,
deps = [
":support",
":test_utils",
"//xla:comparison_util",
"//xla:error_spec",
"//xla:xla_data_proto_cc",
"//xla:xla_proto_cc",
"//xla/backends/gpu/tests:gpu_codegen_test",
"//xla/hlo/ir:hlo",
"//xla/stream_executor:device_description",
"//xla/tests:xla_internal_test_main", # fixdeps: keep
"//xla/tsl/lib/core:status_test_util",
"//xla/tsl/platform:statusor",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest",
],
)

xla_cc_test(
name = "fusion_emitter_shared_dialect_test",
srcs = ["fusion_emitter_shared_dialect_test.cc"],
tags = ["no_mac"],
deps = [
":xtile_test_base",
"//xla/hlo/ir:hlo",
"//xla/hlo/parser:hlo_parser",
"//xla/hlo/testlib:hlo_hardware_independent_test_base",
"//xla/service/gpu/model:block_level_parameters",
"//xla/tests:xla_internal_test_main", # fixdeps: keep
"//xla/tsl/lib/core:status_test_util",
"//xla/tsl/platform:statusor",
"@com_google_absl//absl/strings:string_view",
"@com_google_googletest//:gtest",
],
)

cc_library(
name = "support",
srcs = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,13 +339,6 @@ CodegenDecision CanTritonHandleGEMM(
return decision;
}

const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers();

// TODO(b/269580541): support multiple batch dimensions.
if (dim_numbers.lhs_batch_dimensions().size() > 1) {
return CodegenDecision::Forbid("Multiple batch dimensions.");
}

return CodegenDecision::Allow();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -448,10 +448,8 @@ ENTRY e {
kHloTest, /*data_type=*/{}, HloOpcode::kDot));
const se::DeviceDescription dev_info =
TestGpuDeviceInfo::RTXA6000DeviceInfo(GetComputeCapability());
EXPECT_THAT(legacy_triton::IsTritonSupportedInstruction(
ti.Instruction(), GetComputeCapability())
.Explain(),
::testing::HasSubstr("Multiple batch dimensions"));
EXPECT_TRUE(legacy_triton::IsTritonSupportedInstruction(
ti.Instruction(), GetComputeCapability()));
auto block_level_parameters =
BlockLevelParameters::FromBlockLevelFusionConfig(
ti.TritonFusion()
Expand Down
Loading
Loading