Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions tensorflow/lite/core/signature_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,13 @@ class SignatureRunner {
const char* output_name, const TfLiteCustomAllocation& allocation,
int64_t flags = kTfLiteCustomAllocationFlagsNone);

/// \brief Removes every custom memory allocation held by this runner's
/// subgraph by forwarding to `subgraph_->ClearCustomAllocations()`.
/// \return `kTfLiteOk` unconditionally.
/// \warning This is an experimental API and subject to change. \n
TfLiteStatus ClearCustomAllocations() {
  // The forwarded call's result is not consulted, so there is no failure path
  // to report from here.
  subgraph_->ClearCustomAllocations();
  return kTfLiteOk;
}

/// \brief Set if buffer handle output is allowed.
///
/// When using hardware delegation, Interpreter will make the data of output
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/lite/core/subgraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,11 @@ class Subgraph {
int tensor_index, const TfLiteCustomAllocation& allocation,
int64_t flags = kTfLiteCustomAllocationFlagsNone);

// WARNING: Experimental interface; it may change without notice.
// Drops all custom memory allocations recorded for the tensors in this
// subgraph by emptying `custom_allocations_`.
// Callers are expected to invoke this before resizing input tensors.
void ClearCustomAllocations() {
  custom_allocations_.clear();
}

void SetName(const char* name);
const std::string& GetName() const;

Expand Down
4 changes: 2 additions & 2 deletions third_party/xla/third_party/stablehlo/workspace.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")

def repo():
# LINT.IfChange
STABLEHLO_COMMIT = "fef90093ed233adae94ca41f07abde51d962d1c0"
STABLEHLO_SHA256 = "ffb10b078c2f59c6c498d54bcf017993975daa51f6dee4e24d862193a75a42a0"
STABLEHLO_COMMIT = "05fdca09eefdcacaa32a27823e258aeb935b56d0"
STABLEHLO_SHA256 = "3d119dad5288d1b87f9c467b386b26d4906f9117a2a924407f53adcd3b1b5c2a"
# LINT.ThenChange(Google-internal path)

tf_http_archive(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ TEST(NanoIfrtClientTest, BigResult) {

auto a_array = client->MakeArrayFromHostBuffer(
&a, dtype, shape, std::nullopt, client->default_sharding(),
ifrt::Client::HostBufferSemantics::kImmutableZeroCopy,
/*layout=*/nullptr, ifrt::Client::HostBufferSemantics::kImmutableZeroCopy,
/*on_done_with_host_buffer=*/nullptr);
CHECK_OK(a_array);

Expand Down
55 changes: 54 additions & 1 deletion third_party/xla/xla/backends/cpu/tests/ynn_fusion_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
#include <vector>

#include <gtest/gtest.h>
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/string_view.h"
Expand Down Expand Up @@ -49,7 +50,7 @@ class YnnFusionTest
absl::StrReplaceAll(hlo_template, {{"$dtype", params.in_dtype},
{"$in_dtype", params.in_dtype},
{"$out_dtype", params.out_dtype}});
bool bf16_compute = params.in_dtype == "bf16" || params.out_dtype == "bf16";
bool bf16_compute = absl::StrContains(hlo_text, "bf16");
double tolerance = bf16_compute ? 1e-2 : 1e-7;
EXPECT_TRUE(RunAndCompareNoHloPasses(
hlo_text, ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
Expand Down Expand Up @@ -141,6 +142,58 @@ TEST_P(YnnFusionReduceWindowTest, ReduceWindowAndReduce) {
RunTest(kModuleStr);
}

// Runs a `__ynn_fusion` whose body reduces a [64, 2] input along dimension 1
// with an `add` computation and then converts the [64] result to bf16.
// `$dtype` is a placeholder inside the HLO text; presumably RunTest replaces
// it from the test parameters -- confirm against the fixture.
TEST_P(YnnFusionReduceWindowTest, ReduceConvert) {
  constexpr absl::string_view kHloTemplate = R"(
HloModule reduce_convert

%add {
%lhs = $dtype[] parameter(0)
%rhs = $dtype[] parameter(1)
ROOT %add = $dtype[] add(%lhs, %rhs)
}

ynn_fusion {
%input = $dtype[64, 2] parameter(0)
%zero = $dtype[] constant(0)
%reduced = $dtype[64] reduce(%input, %zero), dimensions={1}, to_apply=%add
ROOT %convert = bf16[64] convert(%reduced)
}

ENTRY entry {
%p0 = $dtype[64, 2] parameter(0)
ROOT %fusion = bf16[64] fusion(%p0), kind=kCustom, calls=ynn_fusion,
backend_config={"fusion_config": {kind: "__ynn_fusion"}}
})";

  RunTest(kHloTemplate);
}

// Runs a `__ynn_fusion` whose body converts a bf16 [64, 2] input to `$dtype`
// and then reduces it along dimension 1 with an `add` computation.
// `$dtype` is a placeholder inside the HLO text; presumably RunTest replaces
// it from the test parameters -- confirm against the fixture.
TEST_P(YnnFusionReduceWindowTest, ConvertReduce) {
  constexpr absl::string_view kHloTemplate = R"(
HloModule convert_reduce

%add {
%lhs = $dtype[] parameter(0)
%rhs = $dtype[] parameter(1)
ROOT %add = $dtype[] add(%lhs, %rhs)
}

ynn_fusion {
%input = bf16[64, 2] parameter(0)
%zero = $dtype[] constant(0)
%converted = $dtype[64, 2] convert(%input)
ROOT %reduce = $dtype[64] reduce(%converted, %zero), dimensions={1}, to_apply=%add
}

ENTRY entry {
%p0 = bf16[64, 2] parameter(0)
ROOT %fusion = $dtype[64] fusion(%p0), kind=kCustom, calls=ynn_fusion,
backend_config={"fusion_config": {kind: "__ynn_fusion"}}
})";

  RunTest(kHloTemplate);
}

INSTANTIATE_TEST_SUITE_P(YnnFusionReduceWindowTestInstantiation,
YnnFusionReduceWindowTest,
::testing::Values(YnnFusionTestParams{"f32", "f32"}),
Expand Down
4 changes: 3 additions & 1 deletion third_party/xla/xla/backends/cpu/transforms/ynn_matcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ class YnnMatcher : public LibraryMatcher {
if (instr->opcode() == HloOpcode::kBitcast) {
return IsBitcastOpSupportedByYnn(instr);
}

if (instr->opcode() == HloOpcode::kConvert) {
return IsElementwiseOpSupportedByYnn(instr);
}
return false;
}
if (instr->IsElementwise()) {
Expand Down
54 changes: 54 additions & 0 deletions third_party/xla/xla/backends/cpu/transforms/ynn_matcher_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,59 @@ TEST_F(YnnReduceTest, ReshapeReduce) {
)");
}

// Input module: an f32[512,512] parameter is sum-reduced along dimension 1 and
// the f32[512] result is converted to bf16.
// The expected pattern looks for a reduce feeding the ROOT convert, plus a
// kCustom fusion tagged "__ynn_fusion" after ENTRY in the optimized HLO.
TEST_F(YnnReduceTest, ReduceConvert) {
  const char* hlo_module = R"(
HloModule reduce_convert

add {
lhs = f32[] parameter(0)
rhs = f32[] parameter(1)
ROOT add = f32[] add(lhs, rhs)
}

ENTRY main {
input = f32[512,512] parameter(0)
init = f32[] constant(0)
reduced = f32[512] reduce(input, init), dimensions={1}, to_apply=add
ROOT result = bf16[512] convert(reduced)
}
)";

  const char* expected_pattern = R"(
CHECK: %[[reduce:.+]] = {{.+}} reduce({{.+}})
CHECK: ROOT {{.+}} = {{.+}} convert(%[[reduce]])
CHECK: ENTRY
CHECK: kind=kCustom
CHECK: "kind":"__ynn_fusion"
)";

  MatchOptimizedHlo(hlo_module, expected_pattern);
}

// Input module: a bf16[512,512] parameter is converted to f32 and then
// sum-reduced over both dimensions to an f32 scalar.
// The expected pattern looks for a convert feeding a ROOT reduce-window, plus
// a kCustom fusion tagged "__ynn_fusion" after ENTRY in the optimized HLO.
TEST_F(YnnReduceTest, ConvertReduce) {
  const char* hlo_module = R"(
HloModule convert_reduce

add {
lhs = f32[] parameter(0)
rhs = f32[] parameter(1)
ROOT add = f32[] add(lhs, rhs)
}

ENTRY main {
input = bf16[512,512] parameter(0)
init = f32[] constant(0)
converted = f32[512,512] convert(input)
ROOT result = f32[] reduce(converted, init), dimensions={0,1}, to_apply=add
}
)";

  const char* expected_pattern = R"(
CHECK: %[[convert:.+]] = {{.+}} convert({{.+}})
CHECK: ROOT {{.+}} = {{.+}} reduce-window(%[[convert]], {{.+}})
CHECK: ENTRY
CHECK: kind=kCustom
CHECK: "kind":"__ynn_fusion"
)";

  MatchOptimizedHlo(hlo_module, expected_pattern);
}

} // namespace
} // namespace xla::cpu
1 change: 0 additions & 1 deletion third_party/xla/xla/backends/cpu/ynn_emitter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ class Literals {
}
};


absl::StatusOr<uint32_t> DefineConstant(ynn_subgraph_t subgraph,
Literals& literals,
const HloInstruction* instr) {
Expand Down
12 changes: 10 additions & 2 deletions third_party/xla/xla/backends/cpu/ynn_support.cc
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ bool IsReduceLikeOpSupportedByYnn(const HloInstruction* hlo) {
HloInstruction* init = reduce_like_op->init_values().front();
const PrimitiveType type = init->shape().element_type();
// TODO(ashaposhnikov): The list of supported types can be extended.
return type == F32 && type == reduce_like_op->shape().element_type();
return type == reduce_like_op->shape().element_type() &&
(type == F32 || type == BF16);
};

if (hlo->opcode() == HloOpcode::kReduce) {
Expand Down Expand Up @@ -363,12 +364,19 @@ bool IsReduceLikeOpOffloadedToYnn(const HloInstruction* hlo) {
return false;
}
switch (input->opcode()) {
// We may consider allowing the ops below as input in the future.
// For now they are excluded because the codegen for the fusion with reduce
// can be faster.
case HloOpcode::kMultiply:
case HloOpcode::kBroadcast:
case HloOpcode::kSlice:
case HloOpcode::kConcatenate:
case HloOpcode::kConvert:
return false;
case HloOpcode::kConvert: {
PrimitiveType from = input->operand(0)->shape().element_type();
PrimitiveType to = input->shape().element_type();
return (from == BF16 && to == F32) || (from == S8 && to == S32);
}
default: {
return true;
}
Expand Down
14 changes: 8 additions & 6 deletions third_party/xla/xla/backends/gpu/runtime/async_execution.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,12 @@ absl::StatusOr<AsyncExecution::ExecutionGuard> AsyncExecution::Start(
ExecutionState * es,
GetExecutionState(state, start_thunk_->thunk_info().thunk_id));

// TODO(ezhulenev): We should harden async executions and do not allow
// multiple async executions in flight, but today send/recv pipelining might
// emit a thunk sequence with multiple starts back to back.
++es->counter;
if (++es->counter > 1) {
return Internal(
"Async execution for `%s` already started (counter=%d). Async "
"execution must be completed by Done before it can be started again.",
start_thunk_->profile_annotation(), es->counter - 1);
}

se::Event* event = es->event->get();

Expand All @@ -154,9 +156,9 @@ absl::Status AsyncExecution::Done(Thunk::ExecutionScopedState* state,
ExecutionState * es,
GetExecutionState(state, start_thunk_->thunk_info().thunk_id));

if (es->counter-- == 0) {
if (--es->counter < 0) {
return Internal("Async execution for `%s` not started (counter=%d)",
start_thunk_->profile_annotation(), es->counter);
start_thunk_->profile_annotation(), es->counter + 1);
}

// Wait for the async operation to complete by waiting for the event that was
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ absl::Status RunOneShotRaggedAllToAllWithNccl(
int64_t num_total_updates, int64_t num_input_rows, int64_t num_row_elements,
absl::Span<DeviceBufferPair const> buffers) {
int device_ordinal = stream.parent()->device_ordinal();
const int64_t num_ranks = clique_key.num_local_participants();
const int64_t num_ranks = clique_key.num_devices();

XLA_VLOG_DEVICE(3, device_ordinal)
<< "Performing one-shot ragged-all-to-all with NCCL barrier rank: "
Expand Down
Loading
Loading