xref: /aosp_15_r20/external/FP16/bench/ieee-element.cc (revision 5f32b7105932ea8520a0e8811c640f936367d707)
1*5f32b710SXin Li #include <benchmark/benchmark.h>
2*5f32b710SXin Li 
3*5f32b710SXin Li #include <fp16.h>
4*5f32b710SXin Li #ifndef EMSCRIPTEN
5*5f32b710SXin Li 	#include <fp16/psimd.h>
6*5f32b710SXin Li #endif
7*5f32b710SXin Li 
8*5f32b710SXin Li #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
9*5f32b710SXin Li 	#include <immintrin.h>
10*5f32b710SXin Li #endif
11*5f32b710SXin Li 
12*5f32b710SXin Li #ifdef FP16_COMPARATIVE_BENCHMARKS
13*5f32b710SXin Li 	#include <third-party/THHalf.h>
14*5f32b710SXin Li 	#include <third-party/npy-halffloat.h>
15*5f32b710SXin Li 	#include <third-party/eigen-half.h>
16*5f32b710SXin Li 	#include <third-party/float16-compressor.h>
17*5f32b710SXin Li 	#include <third-party/half.hpp>
18*5f32b710SXin Li #endif
19*5f32b710SXin Li 
/*
 * Advance a 16-bit xorshift-style state by one step.
 *
 * Applies three xor-with-shifted-self stages (right 8, left 9, right 5)
 * and returns the new state truncated to 16 bits.
 */
static inline uint16_t next_xorshift16(uint16_t x) {
	const uint16_t stage1 = static_cast<uint16_t>(x ^ (x >> 8));
	/* The left shift happens in promoted int width; the cast drops the overflowed bits. */
	const uint16_t stage2 = static_cast<uint16_t>(stage1 ^ (stage1 << 9));
	return static_cast<uint16_t>(stage2 ^ (stage2 >> 5));
}
26*5f32b710SXin Li 
/*
 * Advance a 32-bit xorshift-style state by one step.
 *
 * Applies three xor-with-shifted-self stages (right 13, left 17, right 5)
 * and returns the new state.
 */
static inline uint32_t next_xorshift32(uint32_t x) {
	const uint32_t stage1 = x ^ (x >> 13);
	const uint32_t stage2 = stage1 ^ (stage1 << 17);
	return stage2 ^ (stage2 >> 5);
}
33*5f32b710SXin Li #ifndef EMSCRIPTEN
next_xorshift16_psimd(psimd_u16 x)34*5f32b710SXin Li 	PSIMD_INTRINSIC psimd_u16 next_xorshift16_psimd(psimd_u16 x) {
35*5f32b710SXin Li 		x ^= x >> psimd_splat_u16(8);
36*5f32b710SXin Li 		x ^= x << psimd_splat_u16(9);
37*5f32b710SXin Li 		x ^= x >> psimd_splat_u16(5);
38*5f32b710SXin Li 		return x;
39*5f32b710SXin Li 	}
40*5f32b710SXin Li #endif
41*5f32b710SXin Li 
42*5f32b710SXin Li 
43*5f32b710SXin Li /* Conversion from IEEE FP16 to IEEE FP32 */
44*5f32b710SXin Li 
fp16_ieee_to_fp32_bits(benchmark::State & state)45*5f32b710SXin Li static void fp16_ieee_to_fp32_bits(benchmark::State& state) {
46*5f32b710SXin Li 	uint16_t fp16 = UINT16_C(0x7C00);
47*5f32b710SXin Li 	while (state.KeepRunning()) {
48*5f32b710SXin Li 		const uint32_t fp32 = fp16_ieee_to_fp32_bits(fp16);
49*5f32b710SXin Li 
50*5f32b710SXin Li 		fp16 = next_xorshift16(fp16);
51*5f32b710SXin Li 		benchmark::DoNotOptimize(fp32);
52*5f32b710SXin Li 	}
53*5f32b710SXin Li }
54*5f32b710SXin Li BENCHMARK(fp16_ieee_to_fp32_bits);
55*5f32b710SXin Li 
fp16_ieee_to_fp32_value(benchmark::State & state)56*5f32b710SXin Li static void fp16_ieee_to_fp32_value(benchmark::State& state) {
57*5f32b710SXin Li 	uint16_t fp16 = UINT16_C(0x7C00);
58*5f32b710SXin Li 	while (state.KeepRunning()) {
59*5f32b710SXin Li 		const float fp32 = fp16_ieee_to_fp32_value(fp16);
60*5f32b710SXin Li 
61*5f32b710SXin Li 		fp16 = next_xorshift16(fp16);
62*5f32b710SXin Li 		benchmark::DoNotOptimize(fp32);
63*5f32b710SXin Li 	}
64*5f32b710SXin Li }
65*5f32b710SXin Li BENCHMARK(fp16_ieee_to_fp32_value);
66*5f32b710SXin Li 
67*5f32b710SXin Li #ifndef EMSCRIPTEN
fp16_ieee_to_fp32_psimd(benchmark::State & state)68*5f32b710SXin Li 	static void fp16_ieee_to_fp32_psimd(benchmark::State& state) {
69*5f32b710SXin Li 		psimd_u16 fp16 = (psimd_u16) { 0x7C00, 0x7C01, 0x7C02, 0x7C03 };
70*5f32b710SXin Li 		while (state.KeepRunning()) {
71*5f32b710SXin Li 			const psimd_f32 fp32 = fp16_ieee_to_fp32_psimd(fp16);
72*5f32b710SXin Li 
73*5f32b710SXin Li 			fp16 = next_xorshift16_psimd(fp16);
74*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
75*5f32b710SXin Li 		}
76*5f32b710SXin Li 	}
77*5f32b710SXin Li 	BENCHMARK(fp16_ieee_to_fp32_psimd);
78*5f32b710SXin Li 
fp16_ieee_to_fp32x2_psimd(benchmark::State & state)79*5f32b710SXin Li 	static void fp16_ieee_to_fp32x2_psimd(benchmark::State& state) {
80*5f32b710SXin Li 		psimd_u16 fp16 =
81*5f32b710SXin Li 			(psimd_u16) { 0x7C00, 0x7C01, 0x7C02, 0x7C03, 0x7C04, 0x7C05, 0x7C06, 0x7C07 };
82*5f32b710SXin Li 		while (state.KeepRunning()) {
83*5f32b710SXin Li 			const psimd_f32x2 fp32 = fp16_ieee_to_fp32x2_psimd(fp16);
84*5f32b710SXin Li 
85*5f32b710SXin Li 			fp16 = next_xorshift16_psimd(fp16);
86*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
87*5f32b710SXin Li 		}
88*5f32b710SXin Li 	}
89*5f32b710SXin Li 	BENCHMARK(fp16_ieee_to_fp32x2_psimd);
90*5f32b710SXin Li #endif
91*5f32b710SXin Li 
92*5f32b710SXin Li #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_halfbits2float(benchmark::State & state)93*5f32b710SXin Li 	static void TH_halfbits2float(benchmark::State& state) {
94*5f32b710SXin Li 		uint16_t fp16 = UINT16_C(0x7C00);
95*5f32b710SXin Li 		while (state.KeepRunning()) {
96*5f32b710SXin Li 			float fp32;
97*5f32b710SXin Li 			TH_halfbits2float(&fp16, &fp32);
98*5f32b710SXin Li 
99*5f32b710SXin Li 			fp16 = next_xorshift16(fp16);
100*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
101*5f32b710SXin Li 		}
102*5f32b710SXin Li 	}
103*5f32b710SXin Li 	BENCHMARK(TH_halfbits2float);
104*5f32b710SXin Li 
npy_halfbits_to_floatbits(benchmark::State & state)105*5f32b710SXin Li 	static void npy_halfbits_to_floatbits(benchmark::State& state) {
106*5f32b710SXin Li 		uint16_t fp16 = UINT16_C(0x7C00);
107*5f32b710SXin Li 		while (state.KeepRunning()) {
108*5f32b710SXin Li 			const uint32_t fp32 = npy_halfbits_to_floatbits(fp16);
109*5f32b710SXin Li 
110*5f32b710SXin Li 			fp16 = next_xorshift16(fp16);
111*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
112*5f32b710SXin Li 		}
113*5f32b710SXin Li 	}
114*5f32b710SXin Li 	BENCHMARK(npy_halfbits_to_floatbits);
115*5f32b710SXin Li 
Eigen_half_to_float(benchmark::State & state)116*5f32b710SXin Li 	static void Eigen_half_to_float(benchmark::State& state) {
117*5f32b710SXin Li 		uint16_t fp16 = UINT16_C(0x7C00);
118*5f32b710SXin Li 		while (state.KeepRunning()) {
119*5f32b710SXin Li 			const float fp32 =
120*5f32b710SXin Li 				Eigen::half_impl::half_to_float(
121*5f32b710SXin Li 					Eigen::half_impl::raw_uint16_to_half(fp16));
122*5f32b710SXin Li 
123*5f32b710SXin Li 			fp16 = next_xorshift16(fp16);
124*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
125*5f32b710SXin Li 		}
126*5f32b710SXin Li 	}
127*5f32b710SXin Li 	BENCHMARK(Eigen_half_to_float);
128*5f32b710SXin Li 
Float16Compressor_decompress(benchmark::State & state)129*5f32b710SXin Li 	static void Float16Compressor_decompress(benchmark::State& state) {
130*5f32b710SXin Li 		uint16_t fp16 = UINT16_C(0x7C00);
131*5f32b710SXin Li 		while (state.KeepRunning()) {
132*5f32b710SXin Li 			const float fp32 = Float16Compressor::decompress(fp16);
133*5f32b710SXin Li 
134*5f32b710SXin Li 			fp16 = next_xorshift16(fp16);
135*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
136*5f32b710SXin Li 		}
137*5f32b710SXin Li 	}
138*5f32b710SXin Li 	BENCHMARK(Float16Compressor_decompress);
139*5f32b710SXin Li 
half_float_detail_half2float_table(benchmark::State & state)140*5f32b710SXin Li 	static void half_float_detail_half2float_table(benchmark::State& state) {
141*5f32b710SXin Li 		uint16_t fp16 = UINT16_C(0x7C00);
142*5f32b710SXin Li 		while (state.KeepRunning()) {
143*5f32b710SXin Li 			const float fp32 =
144*5f32b710SXin Li 				half_float::detail::half2float_impl(fp16,
145*5f32b710SXin Li 					half_float::detail::true_type());
146*5f32b710SXin Li 
147*5f32b710SXin Li 			fp16 = next_xorshift16(fp16);
148*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
149*5f32b710SXin Li 		}
150*5f32b710SXin Li 	}
151*5f32b710SXin Li 	BENCHMARK(half_float_detail_half2float_table);
152*5f32b710SXin Li 
half_float_detail_half2float_branch(benchmark::State & state)153*5f32b710SXin Li 	static void half_float_detail_half2float_branch(benchmark::State& state) {
154*5f32b710SXin Li 		uint16_t fp16 = UINT16_C(0x7C00);
155*5f32b710SXin Li 		while (state.KeepRunning()) {
156*5f32b710SXin Li 			const float fp32 =
157*5f32b710SXin Li 				half_float::detail::half2float_impl(fp16,
158*5f32b710SXin Li 					half_float::detail::false_type());
159*5f32b710SXin Li 
160*5f32b710SXin Li 			fp16 = next_xorshift16(fp16);
161*5f32b710SXin Li 			benchmark::DoNotOptimize(fp32);
162*5f32b710SXin Li 		}
163*5f32b710SXin Li 	}
164*5f32b710SXin Li 	BENCHMARK(half_float_detail_half2float_branch);
165*5f32b710SXin Li #endif
166*5f32b710SXin Li 
167*5f32b710SXin Li /* Conversion from IEEE FP32 to IEEE FP16 */
168*5f32b710SXin Li 
fp16_ieee_from_fp32_value(benchmark::State & state)169*5f32b710SXin Li static void fp16_ieee_from_fp32_value(benchmark::State& state) {
170*5f32b710SXin Li 	uint32_t fp32 = UINT32_C(0x7F800000);
171*5f32b710SXin Li 	while (state.KeepRunning()) {
172*5f32b710SXin Li 		const uint16_t fp16 = fp16_ieee_from_fp32_value(fp32_from_bits(fp32));
173*5f32b710SXin Li 
174*5f32b710SXin Li 		fp32 = next_xorshift32(fp32);
175*5f32b710SXin Li 		benchmark::DoNotOptimize(fp16);
176*5f32b710SXin Li 	}
177*5f32b710SXin Li }
178*5f32b710SXin Li BENCHMARK(fp16_ieee_from_fp32_value);
179*5f32b710SXin Li 
180*5f32b710SXin Li #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
fp16_ieee_from_fp32_hardware(benchmark::State & state)181*5f32b710SXin Li 	static void fp16_ieee_from_fp32_hardware(benchmark::State& state) {
182*5f32b710SXin Li 		uint32_t fp32 = UINT32_C(0x7F800000);
183*5f32b710SXin Li 		while (state.KeepRunning()) {
184*5f32b710SXin Li 			const uint16_t fp16 = static_cast<uint16_t>(
185*5f32b710SXin Li 				_mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(fp32), _MM_FROUND_CUR_DIRECTION)));
186*5f32b710SXin Li 
187*5f32b710SXin Li 			fp32 = next_xorshift32(fp32);
188*5f32b710SXin Li 			benchmark::DoNotOptimize(fp16);
189*5f32b710SXin Li 		}
190*5f32b710SXin Li 	}
191*5f32b710SXin Li 	BENCHMARK(fp16_ieee_from_fp32_hardware);
192*5f32b710SXin Li #endif
193*5f32b710SXin Li 
194*5f32b710SXin Li #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_float2halfbits(benchmark::State & state)195*5f32b710SXin Li 	static void TH_float2halfbits(benchmark::State& state) {
196*5f32b710SXin Li 		uint32_t fp32 = UINT32_C(0x7F800000);
197*5f32b710SXin Li 		while (state.KeepRunning()) {
198*5f32b710SXin Li 			uint16_t fp16;
199*5f32b710SXin Li 			float fp32_value = fp32_from_bits(fp32);
200*5f32b710SXin Li 			TH_float2halfbits(&fp32_value, &fp16);
201*5f32b710SXin Li 
202*5f32b710SXin Li 			fp32 = next_xorshift32(fp32);
203*5f32b710SXin Li 			benchmark::DoNotOptimize(fp16);
204*5f32b710SXin Li 		}
205*5f32b710SXin Li 	}
206*5f32b710SXin Li 	BENCHMARK(TH_float2halfbits);
207*5f32b710SXin Li 
npy_floatbits_to_halfbits(benchmark::State & state)208*5f32b710SXin Li 	static void npy_floatbits_to_halfbits(benchmark::State& state) {
209*5f32b710SXin Li 		uint32_t fp32 = UINT32_C(0x7F800000);
210*5f32b710SXin Li 		while (state.KeepRunning()) {
211*5f32b710SXin Li 			const uint16_t fp16 = npy_floatbits_to_halfbits(fp32);
212*5f32b710SXin Li 
213*5f32b710SXin Li 			fp32 = next_xorshift32(fp32);
214*5f32b710SXin Li 			benchmark::DoNotOptimize(fp16);
215*5f32b710SXin Li 		}
216*5f32b710SXin Li 	}
217*5f32b710SXin Li 	BENCHMARK(npy_floatbits_to_halfbits);
218*5f32b710SXin Li 
Eigen_float_to_half_rtne(benchmark::State & state)219*5f32b710SXin Li 	static void Eigen_float_to_half_rtne(benchmark::State& state) {
220*5f32b710SXin Li 		uint32_t fp32 = UINT32_C(0x7F800000);
221*5f32b710SXin Li 		while (state.KeepRunning()) {
222*5f32b710SXin Li 			const Eigen::half_impl::__half fp16 =
223*5f32b710SXin Li 				Eigen::half_impl::float_to_half_rtne(
224*5f32b710SXin Li 					fp32_from_bits(fp32));
225*5f32b710SXin Li 
226*5f32b710SXin Li 			fp32 = next_xorshift32(fp32);
227*5f32b710SXin Li 			benchmark::DoNotOptimize(fp16);
228*5f32b710SXin Li 		}
229*5f32b710SXin Li 	}
230*5f32b710SXin Li 	BENCHMARK(Eigen_float_to_half_rtne);
231*5f32b710SXin Li 
Float16Compressor_compress(benchmark::State & state)232*5f32b710SXin Li 	static void Float16Compressor_compress(benchmark::State& state) {
233*5f32b710SXin Li 		uint32_t fp32 = UINT32_C(0x7F800000);
234*5f32b710SXin Li 		while (state.KeepRunning()) {
235*5f32b710SXin Li 			const uint16_t fp16 = Float16Compressor::compress(fp32_from_bits(fp32));
236*5f32b710SXin Li 
237*5f32b710SXin Li 			fp32 = next_xorshift32(fp32);
238*5f32b710SXin Li 			benchmark::DoNotOptimize(fp16);
239*5f32b710SXin Li 		}
240*5f32b710SXin Li 	}
241*5f32b710SXin Li 	BENCHMARK(Float16Compressor_compress);
242*5f32b710SXin Li 
half_float_detail_float2half_table(benchmark::State & state)243*5f32b710SXin Li 	static void half_float_detail_float2half_table(benchmark::State& state) {
244*5f32b710SXin Li 		uint32_t fp32 = UINT32_C(0x7F800000);
245*5f32b710SXin Li 		while (state.KeepRunning()) {
246*5f32b710SXin Li 			const uint16_t fp16 =
247*5f32b710SXin Li 				half_float::detail::float2half_impl<std::round_to_nearest>(
248*5f32b710SXin Li 					fp32_from_bits(fp32),
249*5f32b710SXin Li 						half_float::detail::true_type());
250*5f32b710SXin Li 
251*5f32b710SXin Li 			fp32 = next_xorshift32(fp32);
252*5f32b710SXin Li 			benchmark::DoNotOptimize(fp16);
253*5f32b710SXin Li 		}
254*5f32b710SXin Li 	}
255*5f32b710SXin Li 	BENCHMARK(half_float_detail_float2half_table);
256*5f32b710SXin Li 
half_float_detail_float2half_branch(benchmark::State & state)257*5f32b710SXin Li 	static void half_float_detail_float2half_branch(benchmark::State& state) {
258*5f32b710SXin Li 		uint32_t fp32 = UINT32_C(0x7F800000);
259*5f32b710SXin Li 		while (state.KeepRunning()) {
260*5f32b710SXin Li 			const uint16_t fp16 =
261*5f32b710SXin Li 				half_float::detail::float2half_impl<std::round_to_nearest>(
262*5f32b710SXin Li 					fp32_from_bits(fp32),
263*5f32b710SXin Li 						half_float::detail::false_type());
264*5f32b710SXin Li 
265*5f32b710SXin Li 			fp32 = next_xorshift32(fp32);
266*5f32b710SXin Li 			benchmark::DoNotOptimize(fp16);
267*5f32b710SXin Li 		}
268*5f32b710SXin Li 	}
269*5f32b710SXin Li 	BENCHMARK(half_float_detail_float2half_branch);
270*5f32b710SXin Li #endif
271*5f32b710SXin Li 
272*5f32b710SXin Li BENCHMARK_MAIN();
273