1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/raddexpminusmax.h>
20 #include <xnnpack/rmax.h>
21
22
f32_raddexpminusmax(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void f32_raddexpminusmax(
24 benchmark::State& state,
25 xnn_f32_rmax_ukernel_function rmax,
26 xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
27 benchmark::utils::IsaCheckFunction isa_check = nullptr)
28 {
29 if (isa_check && !isa_check(state)) {
30 return;
31 }
32
33 const size_t elements = state.range(0);
34 const size_t cache_line_size_max = 128;
35 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
36
37 std::random_device random_device;
38 auto rng = std::mt19937(random_device());
39 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
40
41 const size_t num_buffers = 1 +
42 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
43 std::vector<float, AlignedAllocator<float, 64>> x(elements);
44
45 std::generate(x.begin(), x.end(), std::ref(f32rng));
46
47 benchmark::utils::DisableDenormals();
48
49 size_t buffer_index = 0;
50 for (auto _ : state) {
51 state.PauseTiming();
52 float x_max = nanf("");
53 rmax(elements * sizeof(float), x.data(), &x_max);
54 if (++buffer_index == num_buffers) {
55 buffer_index = 0;
56 }
57 state.ResumeTiming();
58
59 float y_sum = nanf("");
60 raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
61 }
62
63 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
64 if (cpu_frequency != 0) {
65 state.counters["cpufreq"] = cpu_frequency;
66 }
67
68 const size_t elements_per_iteration = elements;
69 state.counters["elements"] =
70 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
71
72 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
73 state.counters["bytes"] =
74 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
75 }
76
CharacteristicArguments(benchmark::internal::Benchmark * b)77 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
78 b->ArgName("N");
79 for (int32_t n = 10000; n <= 100000000; n *= 10) {
80 b->Arg(n);
81 }
82 }
83
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Every registration below runs f32_raddexpminusmax over the sizes set up by
  // CharacteristicArguments and reports real (wall-clock) time. All of them,
  // the AVX512F ones included, use xnn_f32_rmax_ukernel__avx for the untimed
  // rmax step.

  // ---- AVX2, p5, x64 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x64,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x64_acc2,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc2,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x64_acc4,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc4,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // ---- AVX2, p5, x72 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x72,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x72_acc3,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x72_acc3,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // ---- AVX2, p5, x80 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x80,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x80_acc2,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x80_acc5,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc5,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // ---- AVX2, p5, x96 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x96,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x96_acc2,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96_acc2,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x96_acc3,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96_acc3,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx2_p5_x96_acc6,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96_acc6,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // ---- AVX512F, p5 scalef, x128 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x128,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x128_acc2,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x128_acc4,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // ---- AVX512F, p5 scalef, x144 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x144,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x144,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x144_acc3,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x144_acc3,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // ---- AVX512F, p5 scalef, x160 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x160,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x160,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x160_acc2,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x160_acc2,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x160_acc5,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x160_acc5,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();

  // ---- AVX512F, p5 scalef, x192 ----
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x192,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x192,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x192_acc2,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x192_acc2,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x192_acc3,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x192_acc3,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_raddexpminusmax, avx512f_p5_scalef_x192_acc6,
      xnn_f32_rmax_ukernel__avx, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x192_acc6,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
189
190 #ifndef XNNPACK_BENCHMARK_NO_MAIN
191 BENCHMARK_MAIN();
192 #endif
193