xref: /aosp_15_r20/external/XNNPACK/bench/f32-raddexpminusmax.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/raddexpminusmax.h>
20 #include <xnnpack/rmax.h>
21 
22 
f32_raddexpminusmax(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void f32_raddexpminusmax(
24   benchmark::State& state,
25   xnn_f32_rmax_ukernel_function rmax,
26   xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
27   benchmark::utils::IsaCheckFunction isa_check = nullptr)
28 {
29   if (isa_check && !isa_check(state)) {
30     return;
31   }
32 
33   const size_t elements = state.range(0);
34   const size_t cache_line_size_max = 128;
35   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
36 
37   std::random_device random_device;
38   auto rng = std::mt19937(random_device());
39   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
40 
41   const size_t num_buffers = 1 +
42     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
43   std::vector<float, AlignedAllocator<float, 64>> x(elements);
44 
45   std::generate(x.begin(), x.end(), std::ref(f32rng));
46 
47   benchmark::utils::DisableDenormals();
48 
49   size_t buffer_index = 0;
50   for (auto _ : state) {
51     state.PauseTiming();
52     float x_max = nanf("");
53     rmax(elements * sizeof(float), x.data(), &x_max);
54     if (++buffer_index == num_buffers) {
55       buffer_index = 0;
56     }
57     state.ResumeTiming();
58 
59     float y_sum = nanf("");
60     raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
61   }
62 
63   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
64   if (cpu_frequency != 0) {
65     state.counters["cpufreq"] = cpu_frequency;
66   }
67 
68   const size_t elements_per_iteration = elements;
69   state.counters["elements"] =
70     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
71 
72   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
73   state.counters["bytes"] =
74     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
75 }
76 
CharacteristicArguments(benchmark::internal::Benchmark * b)77 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
78   b->ArgName("N");
79   for (int32_t n = 10000; n <= 100000000; n *= 10) {
80     b->Arg(n);
81   }
82 }
83 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers one f32_raddexpminusmax benchmark for the kernel
  // xnn_f32_raddexpminusmax_ukernel__<variant>. Every registration uses
  // xnn_f32_rmax_ukernel__avx as the rmax kernel, is gated by `isa_check`,
  // sweeps CharacteristicArguments and reports real (wall-clock) time.
  #define BENCHMARK_F32_RADDEXPMINUSMAX(variant, isa_check)                 \
    BENCHMARK_CAPTURE(f32_raddexpminusmax, variant,                         \
      xnn_f32_rmax_ukernel__avx,                                            \
      xnn_f32_raddexpminusmax_ukernel__##variant,                           \
      isa_check)->Apply(CharacteristicArguments)->UseRealTime()

  // AVX2 kernels.
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x64, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x64_acc2, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x64_acc4, benchmark::utils::CheckAVX2);

  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x72, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x72_acc3, benchmark::utils::CheckAVX2);

  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x80, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x80_acc2, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x80_acc5, benchmark::utils::CheckAVX2);

  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x96, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x96_acc2, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x96_acc3, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx2_p5_x96_acc6, benchmark::utils::CheckAVX2);

  // AVX512F kernels.
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x128, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x128_acc2, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x128_acc4, benchmark::utils::CheckAVX512F);

  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x144, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x144_acc3, benchmark::utils::CheckAVX512F);

  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x160, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x160_acc2, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x160_acc5, benchmark::utils::CheckAVX512F);

  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x192, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x192_acc2, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x192_acc3, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX(avx512f_p5_scalef_x192_acc6, benchmark::utils::CheckAVX512F);

  #undef BENCHMARK_F32_RADDEXPMINUSMAX
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
189 
190 #ifndef XNNPACK_BENCHMARK_NO_MAIN
191 BENCHMARK_MAIN();
192 #endif
193