1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8
9 #include <benchmark/benchmark.h>
10 #include "bench/utils.h"
11
12 #include <xnnpack.h>
13 #include <xnnpack/aligned-allocator.h>
14 #include <xnnpack/common.h>
15 #include <xnnpack/microfnptr.h>
16 #include <xnnpack/raddextexp.h>
17 #include <xnnpack/vscaleextexp.h>
18
19
f32_vscaleextexp(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,xnn_f32_vscaleextexp_ukernel_function vscaleextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)20 static void f32_vscaleextexp(
21 benchmark::State& state,
22 xnn_f32_raddextexp_ukernel_function raddextexp,
23 xnn_f32_vscaleextexp_ukernel_function vscaleextexp,
24 benchmark::utils::IsaCheckFunction isa_check = nullptr)
25 {
26 if (isa_check && !isa_check(state)) {
27 return;
28 }
29
30 const size_t elements = state.range(0);
31 const size_t cache_line_size_max = 128;
32 const size_t packed_n = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
33
34 std::random_device random_device;
35 auto rng = std::mt19937(random_device());
36 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
37
38 const size_t num_buffers = 1 +
39 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
40 std::vector<float, AlignedAllocator<float, 64>> x(elements);
41 std::vector<float, AlignedAllocator<float, 64>> y(packed_n * num_buffers);
42
43 std::generate(x.begin(), x.end(), std::ref(f32rng));
44
45 benchmark::utils::DisableDenormals();
46
47 size_t buffer_index = 0;
48 for (auto _ : state) {
49 state.PauseTiming();
50 float scale[2];
51 raddextexp(elements * sizeof(float), x.data(), scale);
52 const float ext_mantissa = 1.0f / scale[0];
53 const float ext_exponent = -scale[1];
54 if (++buffer_index == num_buffers) {
55 buffer_index = 0;
56 }
57 state.ResumeTiming();
58
59 vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_n * buffer_index, ext_mantissa, ext_exponent);
60 }
61
62 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
63 if (cpu_frequency != 0) {
64 state.counters["cpufreq"] = cpu_frequency;
65 }
66
67 const size_t elements_per_iteration = elements;
68 state.counters["elements"] =
69 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
70
71 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
72 state.counters["bytes"] =
73 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
74 }
75
CharacteristicArguments(benchmark::internal::Benchmark * b)76 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
77 b->ArgName("N");
78 for (int32_t n = 10000; n <= 100000000; n *= 10) {
79 b->Arg(n);
80 }
81 }
82
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX512F kernels. Every registration pairs the same raddextexp kernel
  // (avx512f_p5_scalef_x128_acc2) with a different vscaleextexp variant, is
  // gated on CheckAVX512F, sweeps the sizes from CharacteristicArguments and
  // is configured with UseRealTime().
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x16,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x32,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x32,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x48,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x48,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x64,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x64,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x80,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x80,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x96,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x96,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x112,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x112,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x128,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x128,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x144,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x144,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x160,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x160,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x176,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x176,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x192,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x192,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();

  // AVX2 kernels. Every registration pairs the same raddextexp kernel
  // (avx2_p5_x80_acc2) with a different vscaleextexp variant, is gated on
  // CheckAVX2, sweeps the sizes from CharacteristicArguments and is configured
  // with UseRealTime().
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x8,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x8,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x16,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x16,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x24,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x24,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x32,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x32,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x40,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x48,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x48,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x56,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x56,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x64,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x64,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x72,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x72,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x80,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x80,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x88,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x88,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x96,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x96,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
182
183 #ifndef XNNPACK_BENCHMARK_NO_MAIN
184 BENCHMARK_MAIN();
185 #endif
186