-
Notifications
You must be signed in to change notification settings - Fork 417
Expand file tree
/
Copy pathSmallLoopTripCount.cpp
More file actions
102 lines (85 loc) · 3.65 KB
/
SmallLoopTripCount.cpp
File metadata and controls
102 lines (85 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// This program tests the performance impact of vectorization in loops with
// small trip counts. These cases exercise the LoopVectorize path that accepts
// trip counts one larger than the vectorization factor.
#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>
#include "benchmark/benchmark.h"
// Marks a kernel as non-inlinable so it stays a separate, out-of-line function
// rather than being folded into the benchmark driver.
#define NOINLINE __attribute__((noinline))
// Loop hint: request vectorization and disable unrolling. No interleave clause
// is given here, so the interleave choice is left to the compiler.
#define LOOP_VECTORIZE_ENABLE \
_Pragma("clang loop vectorize(enable) unroll(disable)")
// Loop hint for the scalar baseline: vectorization, interleaving, and unrolling
// are all disabled.
#define LOOP_VECTORIZE_DISABLE \
_Pragma("clang loop vectorize(disable) interleave(disable) unroll(disable)")
// Loop hint: request vectorization with an interleave count of 2 and disable
// unrolling.
#define LOOP_INTERLEAVE_COUNT_2 \
_Pragma("clang loop vectorize(enable) interleave_count(2) unroll(disable)")
// File-local sink for benchmark results: each benchmark run XORs the checksum
// of its output buffer into this value and then passes it to
// benchmark::DoNotOptimize, so the kernel output is consumed after timing.
static uint64_t g_small_loop_trip_count_sum = 0;
// Kernel under test (vectorization requested): writes B[I] = A[I] + 1 for the
// first five elements.
//   A - input buffer; at least 5 elements are read.
//   B - output buffer; at least 5 elements are written.
// The pointers are declared __restrict, i.e. the caller promises that A and B
// do not overlap. The loop shape and the pragma placement are what this
// benchmark measures, so keep them exactly as written.
template <typename Ty>
NOINLINE void loopTc5Vector(const Ty *__restrict A, Ty *__restrict B) {
// Hint applies to the loop that immediately follows.
LOOP_VECTORIZE_ENABLE
// Constant trip count of 5 (presumably one more than the expected
// vectorization factor, per the file header comment).
for (uint64_t I = 0; I != 5; ++I)
B[I] = A[I] + static_cast<Ty>(1);
}
// Scalar baseline kernel: same computation as the vectorized variant
// (B[I] = A[I] + 1 for the first five elements), but the loop is annotated to
// disable vectorization, interleaving, and unrolling.
//   A - input buffer; at least 5 elements are read.
//   B - output buffer; at least 5 elements are written.
// The pointers are declared __restrict, i.e. the caller promises that A and B
// do not overlap.
template <typename Ty>
NOINLINE void loopTc5Scalar(const Ty *__restrict A, Ty *__restrict B) {
// Hint applies to the loop that immediately follows.
LOOP_VECTORIZE_DISABLE
// Loop body is kept textually identical to the vector kernel so that only the
// loop hint differs between the two.
for (uint64_t I = 0; I != 5; ++I)
B[I] = A[I] + static_cast<Ty>(1);
}
// uint64_t-only kernel: writes B[I] = A[I] + 1 for the first five elements,
// with the loop annotated to request vectorization at an interleave count of 2
// (unrolling disabled).
//   A - input buffer; at least 5 elements are read.
//   B - output buffer; at least 5 elements are written.
// The pointers are declared __restrict, i.e. the caller promises that A and B
// do not overlap.
NOINLINE void loopTc5I64InterleaveCount2Vector(const uint64_t *__restrict A,
uint64_t *__restrict B) {
// Hint applies to the loop that immediately follows.
LOOP_INTERLEAVE_COUNT_2
for (uint64_t I = 0; I != 5; ++I)
B[I] = A[I] + 1;
}
// Function-pointer type shared by all kernels in this file: takes a read-only
// input pointer and a writable output pointer, returns nothing. The benchmark
// driver receives the kernel to time through this type.
template <typename Ty> using KernelFn = void (*)(const Ty *, Ty *);
// Fills A with a deterministic pattern: element k receives the 64-bit value
// 0x0102030405060708 + k converted to Ty (so narrower unsigned types keep only
// the low-order bytes of the pattern).
template <typename Ty> static void initData(std::array<Ty, 16> &A) {
  uint64_t Pattern = 0x0102030405060708ULL;
  for (Ty &Slot : A) {
    Slot = static_cast<Ty>(Pattern);
    ++Pattern;
  }
}
// Hashes the first five elements of A into a 64-bit value. Each element is
// widened to 64 bits and consumed one byte at a time, least-significant byte
// first, for sizeof(Ty) bytes; every byte is folded in as Hash * 131 + byte
// (unsigned 64-bit arithmetic, wrapping on overflow). Elements at index 5 and
// beyond do not contribute.
template <typename Ty> static uint64_t checksum(const std::array<Ty, 16> &A) {
  constexpr uint64_t ByteMask = std::numeric_limits<uint8_t>::max();
  uint64_t Hash = 0;
  const auto Last = A.begin() + 5;
  for (auto It = A.begin(); It != Last; ++It) {
    uint64_t Bits = static_cast<uint64_t>(*It);
    // Peel off sizeof(Ty) bytes, lowest byte first.
    for (size_t Remaining = sizeof(Ty); Remaining != 0; --Remaining) {
      Hash = Hash * 131 + (Bits & ByteMask);
      Bits >>= 8;
    }
  }
  return Hash;
}
// Shared benchmark driver. Prepares a 16-element input filled by initData and a
// zero-initialized 16-element output, calls Fn(input, output) once per
// benchmark iteration, then folds a checksum of the output into the global
// sink and reports 5 processed items per iteration.
//   State - Google Benchmark state object that drives the timing loop.
//   Fn    - kernel to time.
template <typename Ty>
static void runBenchForSmallLoopTripCount(benchmark::State &State,
                                          KernelFn<Ty> Fn) {
  std::array<Ty, 16> Src;
  initData(Src);
  std::array<Ty, 16> Dst = {};

  for (auto _ : State) {
    // Pass both buffer addresses through DoNotOptimize before the call and
    // clobber memory after it, so each iteration's call and stores are kept.
    benchmark::DoNotOptimize(Src.data());
    benchmark::DoNotOptimize(Dst.data());
    Fn(Src.data(), Dst.data());
    benchmark::ClobberMemory();
  }

  // Consume the kernel output outside the timed loop.
  const uint64_t OutputHash = checksum(Dst);
  g_small_loop_trip_count_sum ^= OutputHash;
  benchmark::DoNotOptimize(g_small_loop_trip_count_sum);

  // Every kernel call handles five elements.
  const auto ItemsDone = State.iterations() * 5;
  State.SetItemsProcessed(ItemsDone);
}
// Benchmark entry point for element type Ty: times the kernel whose loop is
// annotated with the vectorize(enable) hint.
template <typename Ty> void benchTc5Vector(benchmark::State &State) {
  const KernelFn<Ty> Kernel = &loopTc5Vector<Ty>;
  runBenchForSmallLoopTripCount<Ty>(State, Kernel);
}
// Benchmark entry point for element type Ty: times the scalar baseline kernel
// whose loop is annotated with the vectorize(disable) hint.
template <typename Ty> void benchTc5Scalar(benchmark::State &State) {
  const KernelFn<Ty> Kernel = &loopTc5Scalar<Ty>;
  runBenchForSmallLoopTripCount<Ty>(State, Kernel);
}
// Benchmark entry point (uint64_t only): times the kernel whose loop requests
// vectorization with an interleave count of 2.
void benchTc5I64InterleaveCount2Vector(benchmark::State &State) {
  const KernelFn<uint64_t> Kernel = &loopTc5I64InterleaveCount2Vector;
  runBenchForSmallLoopTripCount<uint64_t>(State, Kernel);
}
// Benchmark registrations. Each unsigned element width (8/16/32/64 bits) gets a
// vector/scalar pair; reported names have the form tc5/<element type>/<variant>.
BENCHMARK_TEMPLATE(benchTc5Vector, uint8_t)->Name("tc5/i8/vector");
BENCHMARK_TEMPLATE(benchTc5Scalar, uint8_t)->Name("tc5/i8/scalar");
BENCHMARK_TEMPLATE(benchTc5Vector, uint16_t)->Name("tc5/i16/vector");
BENCHMARK_TEMPLATE(benchTc5Scalar, uint16_t)->Name("tc5/i16/scalar");
BENCHMARK_TEMPLATE(benchTc5Vector, uint32_t)->Name("tc5/i32/vector");
BENCHMARK_TEMPLATE(benchTc5Scalar, uint32_t)->Name("tc5/i32/scalar");
BENCHMARK_TEMPLATE(benchTc5Vector, uint64_t)->Name("tc5/i64/vector");
BENCHMARK_TEMPLATE(benchTc5Scalar, uint64_t)->Name("tc5/i64/scalar");
// Extra 64-bit variant that requests an interleave count of 2.
BENCHMARK(benchTc5I64InterleaveCount2Vector)->Name("tc5/i64/ic2/vector");