1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8
9 #include <benchmark/benchmark.h>
10 #include "bench/utils.h"
11
12 #include <xnnpack.h>
13 #include <xnnpack/aligned-allocator.h>
14 #include <xnnpack/common.h>
15 #include <xnnpack/microfnptr.h>
16 #include <xnnpack/rmax.h>
17 #include <xnnpack/raddexpminusmax.h>
18 #include <xnnpack/vscaleexpminusmax.h>
19
20
f32_vscaleexpminusmax(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)21 static void f32_vscaleexpminusmax(
22 benchmark::State& state,
23 xnn_f32_rmax_ukernel_function rmax,
24 xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
25 xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
26 benchmark::utils::IsaCheckFunction isa_check = nullptr)
27 {
28 if (isa_check && !isa_check(state)) {
29 return;
30 }
31
32 const size_t elements = state.range(0);
33 const size_t cache_line_size_max = 128;
34 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
39
40 const size_t num_buffers = 1 +
41 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
42 std::vector<float, AlignedAllocator<float, 64>> x(elements);
43 std::vector<float, AlignedAllocator<float, 64>> y(packed_elements * num_buffers);
44
45 std::generate(x.begin(), x.end(), std::ref(f32rng));
46
47 benchmark::utils::DisableDenormals();
48
49 size_t buffer_index = 0;
50 for (auto _ : state) {
51 state.PauseTiming();
52 float x_max = nanf("");
53 rmax(elements * sizeof(float), x.data(), &x_max);
54 float y_sum = nanf("");
55 raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
56 if (++buffer_index == num_buffers) {
57 buffer_index = 0;
58 }
59 state.ResumeTiming();
60
61 vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
62 }
63
64 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
65 if (cpu_frequency != 0) {
66 state.counters["cpufreq"] = cpu_frequency;
67 }
68
69 const size_t elements_per_iteration = elements;
70 state.counters["elements"] =
71 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
72
73 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
74 state.counters["bytes"] =
75 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
76 }
77
CharacteristicArguments(benchmark::internal::Benchmark * b)78 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
79 b->ArgName("N");
80 for (int32_t n = 10000; n <= 100000000; n *= 10) {
81 b->Arg(n);
82 }
83 }
84
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers the AVX512F vscaleexpminusmax variant that processes N elements per step.
  // All AVX512F variants share the same rmax and raddexpminusmax kernels and are gated
  // by benchmark::utils::CheckAVX512F. The benchmark name is
  // "f32_vscaleexpminusmax/avx512f_p5_scalef_x<N>".
  #define XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(N)                      \
    BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x##N,      \
      xnn_f32_rmax_ukernel__avx512f,                                      \
      xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,       \
      xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x##N,          \
      benchmark::utils::CheckAVX512F)                                     \
      ->Apply(CharacteristicArguments)->UseRealTime()

  // Registers the AVX2 vscaleexpminusmax variant that processes N elements per step.
  // All AVX2 variants share the same rmax and raddexpminusmax kernels and are gated
  // by benchmark::utils::CheckAVX2. The benchmark name is
  // "f32_vscaleexpminusmax/avx2_p5_x<N>".
  #define XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(N)                         \
    BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x##N,                \
      xnn_f32_rmax_ukernel__avx,                                          \
      xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,                  \
      xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x##N,                    \
      benchmark::utils::CheckAVX2)                                        \
      ->Apply(CharacteristicArguments)->UseRealTime()

  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(16);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(32);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(48);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(64);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(80);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(96);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(112);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(128);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(144);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(160);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(176);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F(192);

  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(8);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(16);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(24);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(32);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(40);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(48);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(56);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(64);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(72);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(80);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(88);
  XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2(96);

  // The helpers are only meaningful for the registrations above.
  #undef XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX2
  #undef XNN_BENCH_F32_VSCALEEXPMINUSMAX_AVX512F
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
208
209 #ifndef XNNPACK_BENCHMARK_NO_MAIN
210 BENCHMARK_MAIN();
211 #endif
212