xref: /aosp_15_r20/external/FP16/bench/to-ieee-array.cc (revision 5f32b7105932ea8520a0e8811c640f936367d707)
1 #include <benchmark/benchmark.h>
2 
3 #include <fp16.h>
4 #ifndef EMSCRIPTEN
5 	#include <fp16/psimd.h>
6 #endif
7 
8 #include <vector>
9 #include <random>
10 #include <chrono>
11 #include <functional>
12 #include <algorithm>
13 
14 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
15 	#include <immintrin.h>
16 #endif
17 
18 #if defined(__ARM_NEON__) || defined(__aarch64__)
19 	#include <arm_neon.h>
20 #endif
21 
22 #ifdef FP16_COMPARATIVE_BENCHMARKS
23 	#include <third-party/THHalf.h>
24 	#include <third-party/npy-halffloat.h>
25 	#include <third-party/eigen-half.h>
26 	#include <third-party/float16-compressor.h>
27 	#include <third-party/half.hpp>
28 #endif
29 
30 
fp16_ieee_from_fp32_value(benchmark::State & state)31 static void fp16_ieee_from_fp32_value(benchmark::State& state) {
32 	const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
33 	auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
34 
35 	std::vector<float> fp32(state.range(0));
36 	std::vector<uint16_t> fp16(state.range(0));
37 	std::generate(fp32.begin(), fp32.end(), std::ref(rng));
38 
39 	while (state.KeepRunning()) {
40 		float* input = fp32.data();
41 		benchmark::DoNotOptimize(input);
42 
43 		uint16_t* output = fp16.data();
44 		const size_t n = state.range(0);
45 		for (size_t i = 0; i < n; i++) {
46 			output[i] = fp16_ieee_from_fp32_value(input[i]);
47 		}
48 
49 		benchmark::DoNotOptimize(output);
50 	}
51 	state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
52 }
53 BENCHMARK(fp16_ieee_from_fp32_value)->RangeMultiplier(2)->Range(1<<10, 64<<20);
54 
55 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
hardware_mm_cvtps_ph(benchmark::State & state)56 	static void hardware_mm_cvtps_ph(benchmark::State& state) {
57 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
58 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
59 
60 		std::vector<float> fp32(state.range(0));
61 		std::vector<uint16_t> fp16(state.range(0));
62 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
63 
64 		while (state.KeepRunning()) {
65 			float* input = fp32.data();
66 			benchmark::DoNotOptimize(input);
67 
68 			uint16_t* output = fp16.data();
69 			const size_t n = state.range(0);
70 			for (size_t i = 0; i < n; i += 4) {
71 				_mm_storel_epi64(
72 					static_cast<__m128i*>(static_cast<void*>(&output[i])),
73 					_mm_cvtps_ph(_mm_loadu_ps(&input[i]), _MM_FROUND_CUR_DIRECTION));
74 			}
75 
76 			benchmark::DoNotOptimize(output);
77 		}
78 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
79 	}
80 	BENCHMARK(hardware_mm_cvtps_ph)->RangeMultiplier(2)->Range(1<<10, 64<<20);
81 
hardware_mm256_cvtps_ph(benchmark::State & state)82 	static void hardware_mm256_cvtps_ph(benchmark::State& state) {
83 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
84 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
85 
86 		std::vector<float> fp32(state.range(0));
87 		std::vector<uint16_t> fp16(state.range(0));
88 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
89 
90 		while (state.KeepRunning()) {
91 			float* input = fp32.data();
92 			benchmark::DoNotOptimize(input);
93 
94 			uint16_t* output = fp16.data();
95 			const size_t n = state.range(0);
96 			for (size_t i = 0; i < n; i += 8) {
97 				_mm_storeu_si128(
98 					static_cast<__m128i*>(static_cast<void*>(&output[i])),
99 					_mm256_cvtps_ph(_mm256_loadu_ps(&input[i]), _MM_FROUND_CUR_DIRECTION));
100 			}
101 
102 			benchmark::DoNotOptimize(output);
103 		}
104 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
105 	}
106 	BENCHMARK(hardware_mm256_cvtps_ph)->RangeMultiplier(2)->Range(1<<10, 64<<20);
107 #endif
108 
109 #if defined(__ARM_NEON_FP) && (__ARM_NEON_FP & 0x2) || defined(__aarch64__)
hardware_vcvt_f16_f32(benchmark::State & state)110 	static void hardware_vcvt_f16_f32(benchmark::State& state) {
111 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
112 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
113 
114 		std::vector<float> fp32(state.range(0));
115 		std::vector<uint16_t> fp16(state.range(0));
116 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
117 
118 		while (state.KeepRunning()) {
119 			float* input = fp32.data();
120 			benchmark::DoNotOptimize(input);
121 
122 			uint16_t* output = fp16.data();
123 			const size_t n = state.range(0);
124 			#if defined(__aarch64__)
125 				const unsigned int fpcr = __builtin_aarch64_get_fpcr();
126 				/* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
127 				__builtin_aarch64_set_fpcr(fpcr & 0xF6FFFFFFu);
128 			#else
129 				unsigned int fpscr;
130 				__asm__ __volatile__ ("VMRS %[fpscr], fpscr" : [fpscr] "=r" (fpscr));
131 				/* Disable flush-to-zero (bit 24) and Alternative FP16 format (bit 26) */
132 				__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :
133 					: [fpscr] "r" (fpscr & 0xF6FFFFFFu));
134 			#endif
135 			for (size_t i = 0; i < n; i += 4) {
136 				vst1_u16(&output[i],
137 					(uint16x4_t) vcvt_f16_f32(
138 						vld1q_f32(&input[i])));
139 			}
140 			#if defined(__aarch64__)
141 				__builtin_aarch64_set_fpcr(fpcr);
142 			#else
143 				__asm__ __volatile__ ("VMSR fpscr, %[fpscr]" :: [fpscr] "r" (fpscr));
144 			#endif
145 
146 			benchmark::DoNotOptimize(output);
147 		}
148 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
149 	}
150 	BENCHMARK(hardware_vcvt_f16_f32)->RangeMultiplier(2)->Range(1<<10, 64<<20);
151 #endif
152 
153 #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_float2halfbits(benchmark::State & state)154 	static void TH_float2halfbits(benchmark::State& state) {
155 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
156 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
157 
158 		std::vector<float> fp32(state.range(0));
159 		std::vector<uint16_t> fp16(state.range(0));
160 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
161 
162 		while (state.KeepRunning()) {
163 			float* input = fp32.data();
164 			benchmark::DoNotOptimize(input);
165 
166 			uint16_t* output = fp16.data();
167 			const size_t n = state.range(0);
168 			for (size_t i = 0; i < n; i++) {
169 				TH_float2halfbits(&input[i], &output[i]);
170 			}
171 
172 			benchmark::DoNotOptimize(output);
173 		}
174 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
175 	}
176 	BENCHMARK(TH_float2halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
177 
npy_floatbits_to_halfbits(benchmark::State & state)178 	static void npy_floatbits_to_halfbits(benchmark::State& state) {
179 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
180 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
181 
182 		std::vector<float> fp32(state.range(0));
183 		std::vector<uint16_t> fp16(state.range(0));
184 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
185 
186 		while (state.KeepRunning()) {
187 			float* input = fp32.data();
188 			benchmark::DoNotOptimize(input);
189 
190 			uint16_t* output = fp16.data();
191 			const size_t n = state.range(0);
192 			for (size_t i = 0; i < n; i++) {
193 				output[i] = npy_floatbits_to_halfbits(fp32_to_bits(input[i]));
194 			}
195 
196 			benchmark::DoNotOptimize(output);
197 		}
198 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
199 	}
200 	BENCHMARK(npy_floatbits_to_halfbits)->RangeMultiplier(2)->Range(1<<10, 64<<20);
201 
Eigen_float_to_half_rtne(benchmark::State & state)202 	static void Eigen_float_to_half_rtne(benchmark::State& state) {
203 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
204 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
205 
206 		std::vector<float> fp32(state.range(0));
207 		std::vector<uint16_t> fp16(state.range(0));
208 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
209 
210 		while (state.KeepRunning()) {
211 			float* input = fp32.data();
212 			benchmark::DoNotOptimize(input);
213 
214 			uint16_t* output = fp16.data();
215 			const size_t n = state.range(0);
216 			for (size_t i = 0; i < n; i++) {
217 				output[i] = Eigen::half_impl::float_to_half_rtne(input[i]).x;
218 			}
219 
220 			benchmark::DoNotOptimize(output);
221 		}
222 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
223 	}
224 	BENCHMARK(Eigen_float_to_half_rtne)->RangeMultiplier(2)->Range(1<<10, 64<<20);
225 
Float16Compressor_compress(benchmark::State & state)226 	static void Float16Compressor_compress(benchmark::State& state) {
227 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
228 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
229 
230 		std::vector<float> fp32(state.range(0));
231 		std::vector<uint16_t> fp16(state.range(0));
232 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
233 
234 		while (state.KeepRunning()) {
235 			float* input = fp32.data();
236 			benchmark::DoNotOptimize(input);
237 
238 			uint16_t* output = fp16.data();
239 			const size_t n = state.range(0);
240 			for (size_t i = 0; i < n; i++) {
241 				output[i] = Float16Compressor::compress(input[i]);
242 			}
243 
244 			benchmark::DoNotOptimize(output);
245 		}
246 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
247 	}
248 	BENCHMARK(Float16Compressor_compress)->RangeMultiplier(2)->Range(1<<10, 64<<20);
249 
half_float_detail_float2half_table(benchmark::State & state)250 	static void half_float_detail_float2half_table(benchmark::State& state) {
251 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
252 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
253 
254 		std::vector<float> fp32(state.range(0));
255 		std::vector<uint16_t> fp16(state.range(0));
256 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
257 
258 		while (state.KeepRunning()) {
259 			float* input = fp32.data();
260 			benchmark::DoNotOptimize(input);
261 
262 			uint16_t* output = fp16.data();
263 			const size_t n = state.range(0);
264 			for (size_t i = 0; i < n; i++) {
265 				output[i] =
266 					half_float::detail::float2half_impl<std::round_to_nearest>(
267 						input[i], half_float::detail::true_type());
268 			}
269 
270 			benchmark::DoNotOptimize(output);
271 		}
272 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
273 	}
274 	BENCHMARK(half_float_detail_float2half_table)->RangeMultiplier(2)->Range(1<<10, 64<<20);
275 
half_float_detail_float2half_branch(benchmark::State & state)276 	static void half_float_detail_float2half_branch(benchmark::State& state) {
277 		const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count();
278 		auto rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::mt19937(seed));
279 
280 		std::vector<float> fp32(state.range(0));
281 		std::vector<uint16_t> fp16(state.range(0));
282 		std::generate(fp32.begin(), fp32.end(), std::ref(rng));
283 
284 		while (state.KeepRunning()) {
285 			float* input = fp32.data();
286 			benchmark::DoNotOptimize(input);
287 
288 			uint16_t* output = fp16.data();
289 			const size_t n = state.range(0);
290 			for (size_t i = 0; i < n; i++) {
291 				output[i] =
292 					half_float::detail::float2half_impl<std::round_to_nearest>(
293 						input[i], half_float::detail::false_type());
294 			}
295 
296 			benchmark::DoNotOptimize(output);
297 		}
298 		state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(state.range(0)));
299 	}
300 	BENCHMARK(half_float_detail_float2half_branch)->RangeMultiplier(2)->Range(1<<10, 64<<20);
301 #endif
302 
/* Google Benchmark macro that provides the program entry point for the benchmarks registered above with BENCHMARK(). */
303 BENCHMARK_MAIN();
304