1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15
16 #include <cpuinfo.h>
17
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20
21 #include <xnnpack/aligned-allocator.h>
22 #include <xnnpack/common.h>
23 #include <xnnpack/requantization-stubs.h>
24
25
26 class Requantization : public benchmark::Fixture {
27 public:
Requantization()28 inline Requantization()
29 {
30 cpuinfo_initialize();
31 const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
32 const size_t l1d_reserve = 1024;
33 n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(uint8_t));
34 n_ = n_ / 16 * 16;
35 }
36
SetUp(benchmark::State & state)37 virtual void SetUp(benchmark::State& state) override
38 {
39 std::random_device random_device;
40 auto rng = std::mt19937(random_device());
41 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
42
43 input_.resize(n());
44 std::generate(input_.begin(), input_.end(), std::ref(i32rng));
45 output_.resize(n());
46 std::fill(output_.begin(), output_.end(), 0xA5);
47
48 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49 if (cpu_frequency != 0) {
50 state.counters["cpufreq"] = cpu_frequency;
51 }
52 }
53
TearDown(benchmark::State & state)54 virtual void TearDown(benchmark::State& state) override
55 {
56 state.SetItemsProcessed(uint64_t(state.iterations()) * n());
57 state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(uint8_t)));
58 input_.clear();
59 output_.clear();
60 }
61
input() const62 inline const int32_t* input() const
63 {
64 return input_.data();
65 }
66
output()67 inline uint8_t* output()
68 {
69 return output_.data();
70 }
71
n() const72 inline size_t n() const
73 {
74 return n_;
75 }
76
77 protected:
78 std::vector<int32_t, AlignedAllocator<int32_t, 64>> input_;
79 std::vector<uint8_t> output_;
80 size_t n_;
81 };
82
83
84 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Times xnn_qu8_requantize_fp32__neon over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__neon(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
91
// Times xnn_qu8_requantize_gemmlowp__neon over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__neon(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
98
// Times xnn_qu8_requantize_rndna__neon over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__neon)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__neon(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
105 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
106
107
108 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Times xnn_qu8_requantize_fp32__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__sse2(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
115
// Times xnn_qu8_requantize_gemmlowp__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__sse2(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
122
// Times xnn_qu8_requantize_gemmlowp__ssse3 over the fixture's n() elements.
// NOTE(review): no runtime CPU-feature check precedes the kernel call here;
// presumably the kernel needs SSSE3 — confirm the host supports it.
BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__ssse3(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
129
// Times xnn_qu8_requantize_gemmlowp__sse4 over the fixture's n() elements.
// NOTE(review): no runtime CPU-feature check precedes the kernel call here;
// presumably the kernel needs SSE4 — confirm the host supports it.
BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__sse4(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
136
// Times xnn_qu8_requantize_rndna__sse2 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__sse2)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__sse2(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
143
// Times xnn_qu8_requantize_rndna__ssse3 over the fixture's n() elements.
// NOTE(review): no runtime CPU-feature check precedes the kernel call here;
// presumably the kernel needs SSSE3 — confirm the host supports it.
BENCHMARK_F(Requantization, rndna__ssse3)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__ssse3(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
150
// Times xnn_qu8_requantize_rndna__sse4 over the fixture's n() elements.
// NOTE(review): no runtime CPU-feature check precedes the kernel call here;
// presumably the kernel needs SSE4 — confirm the host supports it.
BENCHMARK_F(Requantization, rndna__sse4)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__sse4(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
157 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
158
159
160 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Times xnn_qu8_requantize_fp32__wasmsimd over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__wasmsimd(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
167
// Times xnn_qu8_requantize_gemmlowp__wasmsimd over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__wasmsimd)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__wasmsimd(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
174 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
175
176
// Times xnn_qu8_requantize_fp32__scalar_lrintf over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__scalar_lrintf(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
183
// Times xnn_qu8_requantize_fp32__scalar_fmagic over the fixture's n() elements.
BENCHMARK_F(Requantization, fp32__scalar_fmagic)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__scalar_fmagic(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
190
// Times xnn_qu8_requantize_gemmlowp__scalar over the fixture's n() elements.
BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__scalar(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
197
// Times xnn_qu8_requantize_rndna__scalar_signed64 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__scalar_signed64)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__scalar_signed64(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
204
// Times xnn_qu8_requantize_rndna__scalar_unsigned32 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__scalar_unsigned32)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__scalar_unsigned32(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
211
// Times xnn_qu8_requantize_rndna__scalar_unsigned64 over the fixture's n() elements.
BENCHMARK_F(Requantization, rndna__scalar_unsigned64)(benchmark::State& state) {
  // Requantization parameters, identical on every iteration.
  constexpr float kScale = 0x1.0p-12f;
  constexpr int kZeroPoint = 128;
  constexpr int kQMin = 1;
  constexpr int kQMax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__scalar_unsigned64(n(), input(), kScale, kZeroPoint, kQMin, kQMax, output());
  }
}
218
219
220 #ifndef XNNPACK_BENCHMARK_NO_MAIN
221 BENCHMARK_MAIN();
222 #endif
223