1 #include <benchmark/benchmark.h>
2
3 #include <fp16.h>
4 #ifndef EMSCRIPTEN
5 #include <fp16/psimd.h>
6 #endif
7
8 #include <vector>
9 #include <random>
10 #include <chrono>
11 #include <functional>
12 #include <algorithm>
13
14 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
15 #include <immintrin.h>
16 #endif
17
18 #if defined(__ARM_NEON__) || defined(__aarch64__)
19 #include <arm_neon.h>
20 #endif
21
22 #ifdef FP16_COMPARATIVE_BENCHMARKS
23 #include <third-party/THHalf.h>
24 #include <third-party/npy-halffloat.h>
25 #include <third-party/eigen-half.h>
26 #include <third-party/float16-compressor.h>
27 #include <third-party/half.hpp>
28 #endif
29
30
fp16_ieee_from_fp32_value(benchmark::State & state)31 static void fp16_ieee_from_fp32_value(benchmark::State& state) {
32 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
33 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
34
35 std::vector<float> fp32(state.range(0));
36 std::vector<uint16_t> fp16(state.range(0));
37 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
38
39 while (state.KeepRunning()) {
40 float* input = fp32.data();
41 benchmark::DoNotOptimize(input);
42
43 uint16_t* output = fp16.data();
44 const size_t n = state.range(0);
45 for (size_t i = 0; i < n; i++) {
46 output[i] = fp16_ieee_from_fp32_value(input[i]);
47 }
48
49 benchmark::DoNotOptimize(output);
50 }
51 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
52 }
53 BENCHMARK(fp16_ieee_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
54
55 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
hardware_mm_cvtps_ph(benchmark::State & state)56 static void hardware_mm_cvtps_ph(benchmark::State& state) {
57 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
58 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
59
60 std::vector<float> fp32(state.range(0));
61 std::vector<uint16_t> fp16(state.range(0));
62 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
63
64 while (state.KeepRunning()) {
65 float* input = fp32.data();
66 benchmark::DoNotOptimize(input);
67
68 uint16_t* output = fp16.data();
69 const size_t n = state.range(0);
70 for (size_t i = 0; i < n; i += 4) {
71 _mm_storel_epi64(
72 static_cast<__m128i*>(static_cast<void*>(&output[i])),
73 _mm_cvtps_ph(_mm_loadu_ps(&input[i]), _MM_FROUND_CUR_DIRECTION));
74 }
75
76 benchmark::DoNotOptimize(output);
77 }
78 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
79 }
80 BENCHMARK(hardware_mm_cvtps_ph)->RangeMultiplier(2)->Range(1<<10, 64<<20);
81
hardware_mm256_cvtps_ph(benchmark::State & state)82 static void hardware_mm256_cvtps_ph(benchmark::State& state) {
83 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
84 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
85
86 std::vector<float> fp32(state.range(0));
87 std::vector<uint16_t> fp16(state.range(0));
88 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
89
90 while (state.KeepRunning()) {
91 float* input = fp32.data();
92 benchmark::DoNotOptimize(input);
93
94 uint16_t* output = fp16.data();
95 const size_t n = state.range(0);
96 for (size_t i = 0; i < n; i += 8) {
97 _mm_storeu_si128(
98 static_cast<__m128i*>(static_cast<void*>(&output[i])),
99 _mm256_cvtps_ph(_mm256_loadu_ps(&input[i]), _MM_FROUND_CUR_DIRECTION));
100 }
101
102 benchmark::DoNotOptimize(output);
103 }
104 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
105 }
106 BENCHMARK(hardware_mm256_cvtps_ph)->RangeMultiplier(2)->Range(1<<10, 64<<20);
107 #endif
108
109 #if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__)
hardware_vcvt_f16_f32(benchmark::State & state)110 static void hardware_vcvt_f16_f32(benchmark::State& state) {
111 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
112 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
113
114 std::vector<float> fp32(state.range(0));
115 std::vector<uint16_t> fp16(state.range(0));
116 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
117
118 while (state.KeepRunning()) {
119 float* input = fp32.data();
120 benchmark::DoNotOptimize(input);
121
122 uint16_t* output = fp16.data();
123 const size_t n = state.range(0);
124 #if defined(__aarch64__)
125 const unsigned int fpcr = __builtin_aarch64_get_fpcr();
126 /* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
127 __builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu);
128 #else
129 unsigned int fpscr;
130 __asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
131 /* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
132 __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :
133 : [fpscr] "r" (fpscr & 0xF6FFFFFFu));
134 #endif
135 for (size_t i = 0; i < n; i += 4) {
136 vst1_u16(&output[i],
137 (uint16x4_t) vcvt_f16_f32(
138 vld1q_f32(&input[i])));
139 }
140 #if defined(__aarch64__)
141 __builtin_aarch64_set_fpcr(fpcr);
142 #else
143 __asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
144 #endif
145
146 benchmark::DoNotOptimize(output);
147 }
148 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
149 }
150 BENCHMARK(hardware_vcvt_f16_f32)->RangeMultiplier(2)->Range(1<<10, 64<<20);
151 #endif
152
153 #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_float2halfbits(benchmark::State & state)154 static void TH_float2halfbits(benchmark::State& state) {
155 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
156 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
157
158 std::vector<float> fp32(state.range(0));
159 std::vector<uint16_t> fp16(state.range(0));
160 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
161
162 while (state.KeepRunning()) {
163 float* input = fp32.data();
164 benchmark::DoNotOptimize(input);
165
166 uint16_t* output = fp16.data();
167 const size_t n = state.range(0);
168 for (size_t i = 0; i < n; i++) {
169 TH_float2halfbits(&input[i], &output[i]);
170 }
171
172 benchmark::DoNotOptimize(output);
173 }
174 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
175 }
176 BENCHMARK(TH_float2halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
177
npy_floatbits_to_halfbits(benchmark::State & state)178 static void npy_floatbits_to_halfbits(benchmark::State& state) {
179 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
180 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
181
182 std::vector<float> fp32(state.range(0));
183 std::vector<uint16_t> fp16(state.range(0));
184 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
185
186 while (state.KeepRunning()) {
187 float* input = fp32.data();
188 benchmark::DoNotOptimize(input);
189
190 uint16_t* output = fp16.data();
191 const size_t n = state.range(0);
192 for (size_t i = 0; i < n; i++) {
193 output[i] = npy_floatbits_to_halfbits(fp32_to_bits(input[i]));
194 }
195
196 benchmark::DoNotOptimize(output);
197 }
198 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
199 }
200 BENCHMARK(npy_floatbits_to_halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
201
Eigen_float_to_half_rtne(benchmark::State & state)202 static void Eigen_float_to_half_rtne(benchmark::State& state) {
203 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
204 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
205
206 std::vector<float> fp32(state.range(0));
207 std::vector<uint16_t> fp16(state.range(0));
208 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
209
210 while (state.KeepRunning()) {
211 float* input = fp32.data();
212 benchmark::DoNotOptimize(input);
213
214 uint16_t* output = fp16.data();
215 const size_t n = state.range(0);
216 for (size_t i = 0; i < n; i++) {
217 output[i] = Eigen::half_impl::float_to_half_rtne(input[i]).x;
218 }
219
220 benchmark::DoNotOptimize(output);
221 }
222 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
223 }
224 BENCHMARK(Eigen_float_to_half_rtne)->RangeMultiplier(2)->Range(1<<10, 64<<20);
225
Float16Compressor_compress(benchmark::State & state)226 static void Float16Compressor_compress(benchmark::State& state) {
227 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
228 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
229
230 std::vector<float> fp32(state.range(0));
231 std::vector<uint16_t> fp16(state.range(0));
232 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
233
234 while (state.KeepRunning()) {
235 float* input = fp32.data();
236 benchmark::DoNotOptimize(input);
237
238 uint16_t* output = fp16.data();
239 const size_t n = state.range(0);
240 for (size_t i = 0; i < n; i++) {
241 output[i] = Float16Compressor::compress(input[i]);
242 }
243
244 benchmark::DoNotOptimize(output);
245 }
246 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
247 }
248 BENCHMARK(Float16Compressor_compress)->RangeMultiplier(2)->Range(1<<10, 64<<20);
249
half_float_detail_float2half_table(benchmark::State & state)250 static void half_float_detail_float2half_table(benchmark::State& state) {
251 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
252 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
253
254 std::vector<float> fp32(state.range(0));
255 std::vector<uint16_t> fp16(state.range(0));
256 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
257
258 while (state.KeepRunning()) {
259 float* input = fp32.data();
260 benchmark::DoNotOptimize(input);
261
262 uint16_t* output = fp16.data();
263 const size_t n = state.range(0);
264 for (size_t i = 0; i < n; i++) {
265 output[i] =
266 half_float::detail::float2half_impl<std::round_to_nearest>(
267 input[i], half_float::detail::true_type());
268 }
269
270 benchmark::DoNotOptimize(output);
271 }
272 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
273 }
274 BENCHMARK(half_float_detail_float2half_table)->RangeMultiplier(2)->Range(1<<10, 64<<20);
275
half_float_detail_float2half_branch(benchmark::State & state)276 static void half_float_detail_float2half_branch(benchmark::State& state) {
277 const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
278 auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
279
280 std::vector<float> fp32(state.range(0));
281 std::vector<uint16_t> fp16(state.range(0));
282 std::generate(fp32.begin(), fp32.end(), std::ref(rng));
283
284 while (state.KeepRunning()) {
285 float* input = fp32.data();
286 benchmark::DoNotOptimize(input);
287
288 uint16_t* output = fp16.data();
289 const size_t n = state.range(0);
290 for (size_t i = 0; i < n; i++) {
291 output[i] =
292 half_float::detail::float2half_impl<std::round_to_nearest>(
293 input[i], half_float::detail::false_type());
294 }
295
296 benchmark::DoNotOptimize(output);
297 }
298 state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
299 }
300 BENCHMARK(half_float_detail_float2half_branch)->RangeMultiplier(2)->Range(1<<10, 64<<20);
301 #endif
302
303 BENCHMARK_MAIN();
304