xref: /aosp_15_r20/external/XNNPACK/bench/f32-vscaleextexp.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8 
9 #include <benchmark/benchmark.h>
10 #include "bench/utils.h"
11 
12 #include <xnnpack.h>
13 #include <xnnpack/aligned-allocator.h>
14 #include <xnnpack/common.h>
15 #include <xnnpack/microfnptr.h>
16 #include <xnnpack/raddextexp.h>
17 #include <xnnpack/vscaleextexp.h>
18 
19 
f32_vscaleextexp(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,xnn_f32_vscaleextexp_ukernel_function vscaleextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)20 static void f32_vscaleextexp(
21   benchmark::State& state,
22   xnn_f32_raddextexp_ukernel_function raddextexp,
23   xnn_f32_vscaleextexp_ukernel_function vscaleextexp,
24   benchmark::utils::IsaCheckFunction isa_check = nullptr)
25 {
26   if (isa_check && !isa_check(state)) {
27     return;
28   }
29 
30   const size_t elements = state.range(0);
31   const size_t cache_line_size_max = 128;
32   const size_t packed_n = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
33 
34   std::random_device random_device;
35   auto rng = std::mt19937(random_device());
36   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
37 
38   const size_t num_buffers = 1 +
39     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
40   std::vector<float, AlignedAllocator<float, 64>> x(elements);
41   std::vector<float, AlignedAllocator<float, 64>> y(packed_n * num_buffers);
42 
43   std::generate(x.begin(), x.end(), std::ref(f32rng));
44 
45   benchmark::utils::DisableDenormals();
46 
47   size_t buffer_index = 0;
48   for (auto _ : state) {
49     state.PauseTiming();
50     float scale[2];
51     raddextexp(elements * sizeof(float), x.data(), scale);
52     const float ext_mantissa = 1.0f / scale[0];
53     const float ext_exponent = -scale[1];
54     if (++buffer_index == num_buffers) {
55       buffer_index = 0;
56     }
57     state.ResumeTiming();
58 
59     vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_n * buffer_index, ext_mantissa, ext_exponent);
60   }
61 
62   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
63   if (cpu_frequency != 0) {
64     state.counters["cpufreq"] = cpu_frequency;
65   }
66 
67   const size_t elements_per_iteration = elements;
68   state.counters["elements"] =
69     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
70 
71   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
72   state.counters["bytes"] =
73     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
74 }
75 
CharacteristicArguments(benchmark::internal::Benchmark * b)76 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
77   b->ArgName("N");
78   for (int32_t n = 10000; n <= 100000000; n *= 10) {
79     b->Arg(n);
80   }
81 }
82 
// Benchmark registrations. They are compiled only when XNN_ARCH_X86 or
// XNN_ARCH_X86_64 is set. Every registration calls f32_vscaleextexp with a
// (raddextexp kernel, vscaleextexp kernel, ISA check) triple, takes its
// arguments from CharacteristicArguments, and requests UseRealTime().
83 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX512F variants: all of them use the same x128_acc2 raddextexp kernel and
  // differ only in the vscaleextexp kernel suffix (x16 through x192); each is
  // gated by benchmark::utils::CheckAVX512F.
84   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x16,
85     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
86     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
87     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
88   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x32,
89     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
90     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x32,
91     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
92   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x48,
93     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
94     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x48,
95     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
96   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x64,
97     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
98     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x64,
99     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
100   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x80,
101     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
102     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x80,
103     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
104   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x96,
105     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
106     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x96,
107     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
108   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x112,
109     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
110     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x112,
111     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
112   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x128,
113     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
114     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x128,
115     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
116   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x144,
117     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
118     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x144,
119     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
120   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x160,
121     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
122     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x160,
123     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
124   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x176,
125     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
126     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x176,
127     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
128   BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_x192,
129     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x128_acc2,
130     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x192,
131     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
132 
  // AVX2 variants: all of them use the same x80_acc2 raddextexp kernel and
  // differ only in the vscaleextexp kernel suffix (x8 through x96); each is
  // gated by benchmark::utils::CheckAVX2.
133   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x8,
134     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
135     xnn_f32_vscaleextexp_ukernel__avx2_p5_x8,
136     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
137   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x16,
138     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
139     xnn_f32_vscaleextexp_ukernel__avx2_p5_x16,
140     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
141   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x24,
142     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
143     xnn_f32_vscaleextexp_ukernel__avx2_p5_x24,
144     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
145   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x32,
146     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
147     xnn_f32_vscaleextexp_ukernel__avx2_p5_x32,
148     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
149   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x40,
150     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
151     xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
152     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
153   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x48,
154     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
155     xnn_f32_vscaleextexp_ukernel__avx2_p5_x48,
156     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
157   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x56,
158     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
159     xnn_f32_vscaleextexp_ukernel__avx2_p5_x56,
160     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
161   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x64,
162     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
163     xnn_f32_vscaleextexp_ukernel__avx2_p5_x64,
164     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
165   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x72,
166     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
167     xnn_f32_vscaleextexp_ukernel__avx2_p5_x72,
168     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
169   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x80,
170     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
171     xnn_f32_vscaleextexp_ukernel__avx2_p5_x80,
172     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
173   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x88,
174     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
175     xnn_f32_vscaleextexp_ukernel__avx2_p5_x88,
176     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
177   BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_x96,
178     xnn_f32_raddextexp_ukernel__avx2_p5_x80_acc2,
179     xnn_f32_vscaleextexp_ukernel__avx2_p5_x96,
180     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
181 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
182 
// Expand Google Benchmark's BENCHMARK_MAIN() unless XNNPACK_BENCHMARK_NO_MAIN
// is defined.
// NOTE(review): presumably the opt-out exists so this file can be linked into
// a binary that supplies its own main() - confirm against the build rules.
183 #ifndef XNNPACK_BENCHMARK_NO_MAIN
184 BENCHMARK_MAIN();
185 #endif
186