xref: /aosp_15_r20/external/FP16/include/fp16/psimd.h (revision 5f32b7105932ea8520a0e8811c640f936367d707)
1*5f32b710SXin Li #pragma once
2*5f32b710SXin Li #ifndef FP16_PSIMD_H
3*5f32b710SXin Li #define FP16_PSIMD_H
4*5f32b710SXin Li 
5*5f32b710SXin Li #if defined(__cplusplus) && (__cplusplus >= 201103L)
6*5f32b710SXin Li 	#include <cstdint>
7*5f32b710SXin Li #elif !defined(__OPENCL_VERSION__)
8*5f32b710SXin Li 	#include <stdint.h>
9*5f32b710SXin Li #endif
10*5f32b710SXin Li 
11*5f32b710SXin Li #include <psimd.h>
12*5f32b710SXin Li 
13*5f32b710SXin Li 
fp16_ieee_to_fp32_psimd(psimd_u16 half)14*5f32b710SXin Li PSIMD_INTRINSIC psimd_f32 fp16_ieee_to_fp32_psimd(psimd_u16 half) {
15*5f32b710SXin Li 	const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
16*5f32b710SXin Li 
17*5f32b710SXin Li 	const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000));
18*5f32b710SXin Li 	const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);
19*5f32b710SXin Li 
20*5f32b710SXin Li 	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
21*5f32b710SXin Li #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
22*5f32b710SXin Li 	const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
23*5f32b710SXin Li #else
24*5f32b710SXin Li 	const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000)));
25*5f32b710SXin Li #endif
26*5f32b710SXin Li 	const psimd_f32 norm_nonsign = psimd_mul_f32((psimd_f32) (shr3_nonsign + exp_offset), exp_scale);
27*5f32b710SXin Li 
28*5f32b710SXin Li 	const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
29*5f32b710SXin Li 	const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
30*5f32b710SXin Li 	const psimd_f32 denorm_nonsign = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(half + half, magic_mask), magic_bias);
31*5f32b710SXin Li 
32*5f32b710SXin Li 	const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
33*5f32b710SXin Li 	const psimd_s32 denorm_mask = (psimd_s32) shr3_nonsign < denorm_cutoff;
34*5f32b710SXin Li 	return (psimd_f32) (sign | (psimd_s32) psimd_blend_f32(denorm_mask, denorm_nonsign, norm_nonsign));
35*5f32b710SXin Li }
36*5f32b710SXin Li 
fp16_ieee_to_fp32x2_psimd(psimd_u16 half)37*5f32b710SXin Li PSIMD_INTRINSIC psimd_f32x2 fp16_ieee_to_fp32x2_psimd(psimd_u16 half) {
38*5f32b710SXin Li 	const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
39*5f32b710SXin Li 	const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);
40*5f32b710SXin Li 
41*5f32b710SXin Li 	const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
42*5f32b710SXin Li 	const psimd_u32 sign_lo = word_lo & sign_mask;
43*5f32b710SXin Li 	const psimd_u32 sign_hi = word_hi & sign_mask;
44*5f32b710SXin Li 	const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
45*5f32b710SXin Li 	const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);
46*5f32b710SXin Li 
47*5f32b710SXin Li 	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
48*5f32b710SXin Li #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
49*5f32b710SXin Li 	const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
50*5f32b710SXin Li #else
51*5f32b710SXin Li 	const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000)));
52*5f32b710SXin Li #endif
53*5f32b710SXin Li 	const psimd_f32 norm_nonsign_lo = psimd_mul_f32((psimd_f32) (shr3_nonsign_lo + exp_offset), exp_scale);
54*5f32b710SXin Li 	const psimd_f32 norm_nonsign_hi = psimd_mul_f32((psimd_f32) (shr3_nonsign_hi + exp_offset), exp_scale);
55*5f32b710SXin Li 
56*5f32b710SXin Li 	const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
57*5f32b710SXin Li 	const psimd_u16 shl1_half = half + half;
58*5f32b710SXin Li 	const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
59*5f32b710SXin Li 	const psimd_f32 denorm_nonsign_lo = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(shl1_half, magic_mask), magic_bias);
60*5f32b710SXin Li 	const psimd_f32 denorm_nonsign_hi = psimd_sub_f32((psimd_f32) psimd_interleave_hi_u16(shl1_half, magic_mask), magic_bias);
61*5f32b710SXin Li 
62*5f32b710SXin Li 	const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
63*5f32b710SXin Li 	const psimd_s32 denorm_mask_lo = (psimd_s32) shr3_nonsign_lo < denorm_cutoff;
64*5f32b710SXin Li 	const psimd_s32 denorm_mask_hi = (psimd_s32) shr3_nonsign_hi < denorm_cutoff;
65*5f32b710SXin Li 
66*5f32b710SXin Li 	psimd_f32x2 result;
67*5f32b710SXin Li 	result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_blend_f32(denorm_mask_lo, denorm_nonsign_lo, norm_nonsign_lo));
68*5f32b710SXin Li 	result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_blend_f32(denorm_mask_hi, denorm_nonsign_hi, norm_nonsign_hi));
69*5f32b710SXin Li 	return result;
70*5f32b710SXin Li }
71*5f32b710SXin Li 
fp16_alt_to_fp32_psimd(psimd_u16 half)72*5f32b710SXin Li PSIMD_INTRINSIC psimd_f32 fp16_alt_to_fp32_psimd(psimd_u16 half) {
73*5f32b710SXin Li 	const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
74*5f32b710SXin Li 
75*5f32b710SXin Li 	const psimd_u32 sign = word & psimd_splat_u32(INT32_C(0x80000000));
76*5f32b710SXin Li 	const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);
77*5f32b710SXin Li 
78*5f32b710SXin Li #if 0
79*5f32b710SXin Li 	const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
80*5f32b710SXin Li 	const psimd_s32 nonsign_bits = (psimd_s32) shr3_nonsign + exp112_offset;
81*5f32b710SXin Li 	const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
82*5f32b710SXin Li 	const psimd_f32 two_nonsign = (psimd_f32) (nonsign_bits + exp1_offset);
83*5f32b710SXin Li 	const psimd_s32 exp113_offset = exp112_offset | exp1_offset;
84*5f32b710SXin Li 	return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(two_nonsign, (psimd_f32) psimd_max_s32(nonsign_bits, exp113_offset)));
85*5f32b710SXin Li #else
86*5f32b710SXin Li 	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
87*5f32b710SXin Li 	const psimd_f32 nonsign = (psimd_f32) (shr3_nonsign + exp_offset);
88*5f32b710SXin Li #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
89*5f32b710SXin Li 	const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
90*5f32b710SXin Li #else
91*5f32b710SXin Li 	const psimd_f32 denorm_bias = psimd_splat_f32(fp32_from_bits(UINT32_C(0x38800000)));
92*5f32b710SXin Li #endif
93*5f32b710SXin Li 	return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign, nonsign), psimd_max_f32(nonsign, denorm_bias)));
94*5f32b710SXin Li #endif
95*5f32b710SXin Li }
96*5f32b710SXin Li 
fp16_alt_to_fp32x2_psimd(psimd_u16 half)97*5f32b710SXin Li PSIMD_INTRINSIC psimd_f32x2 fp16_alt_to_fp32x2_psimd(psimd_u16 half) {
98*5f32b710SXin Li 	const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
99*5f32b710SXin Li 	const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);
100*5f32b710SXin Li 
101*5f32b710SXin Li 	const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
102*5f32b710SXin Li 	const psimd_u32 sign_lo = word_lo & sign_mask;
103*5f32b710SXin Li 	const psimd_u32 sign_hi = word_hi & sign_mask;
104*5f32b710SXin Li 	const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
105*5f32b710SXin Li 	const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);
106*5f32b710SXin Li 
107*5f32b710SXin Li #if 1
108*5f32b710SXin Li 	const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
109*5f32b710SXin Li 	const psimd_s32 nonsign_bits_lo = (psimd_s32) shr3_nonsign_lo + exp112_offset;
110*5f32b710SXin Li 	const psimd_s32 nonsign_bits_hi = (psimd_s32) shr3_nonsign_hi + exp112_offset;
111*5f32b710SXin Li 	const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
112*5f32b710SXin Li 	const psimd_f32 two_nonsign_lo = (psimd_f32) (nonsign_bits_lo + exp1_offset);
113*5f32b710SXin Li 	const psimd_f32 two_nonsign_hi = (psimd_f32) (nonsign_bits_hi + exp1_offset);
114*5f32b710SXin Li 	const psimd_s32 exp113_offset = exp1_offset | exp112_offset;
115*5f32b710SXin Li 	psimd_f32x2 result;
116*5f32b710SXin Li 	result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(two_nonsign_lo, (psimd_f32) psimd_max_s32(nonsign_bits_lo, exp113_offset)));
117*5f32b710SXin Li 	result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(two_nonsign_hi, (psimd_f32) psimd_max_s32(nonsign_bits_hi, exp113_offset)));
118*5f32b710SXin Li 	return result;
119*5f32b710SXin Li #else
120*5f32b710SXin Li 	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
121*5f32b710SXin Li 	const psimd_f32 nonsign_lo = (psimd_f32) (shr3_nonsign_lo + exp_offset);
122*5f32b710SXin Li 	const psimd_f32 nonsign_hi = (psimd_f32) (shr3_nonsign_hi + exp_offset);
123*5f32b710SXin Li 	const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
124*5f32b710SXin Li 	psimd_f32x2 result;
125*5f32b710SXin Li 	result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_lo, nonsign_lo), psimd_max_f32(nonsign_lo, denorm_bias)));
126*5f32b710SXin Li 	result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_hi, nonsign_hi), psimd_max_f32(nonsign_hi, denorm_bias)));
127*5f32b710SXin Li 	return result;
128*5f32b710SXin Li #endif
129*5f32b710SXin Li }
130*5f32b710SXin Li 
131*5f32b710SXin Li #endif /* FP16_PSIMD_H */
132