xref: /aosp_15_r20/external/swiftshader/src/System/Half.hpp (revision 03ce13f70fcc45d86ee91b7ee4cab1936a95046e)
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef sw_Half_hpp
16 #define sw_Half_hpp
17 
18 #include "Math.hpp"
19 
20 #include <algorithm>
21 #include <cmath>
22 
23 namespace sw {
24 
// 16-bit floating-point value type that can be built from and converted to a
// 32-bit float.
// NOTE(review): the conversion functions are only declared here; their exact
// rounding and special-value behavior is determined by definitions that are not
// part of this section.
class half
{
public:
	// Trivial default constructor: a default-initialized half has an
	// indeterminate fp16i.
	half() = default;
	// Builds a half from a float. Marked explicit, so a float never converts to
	// half implicitly.
	explicit half(float f);

	// Implicit conversion to float.
	operator float() const;

	// Assigns from a float and returns a reference to a half (presumably *this —
	// confirm against the definition).
	half &operator=(float f);

private:
	// The only data member: 16 bits of storage.
	// NOTE(review): presumably the IEEE 754 binary16 bit pattern — confirm
	// against the conversion definitions.
	unsigned short fp16i;
};
38 
shortAsHalf(short s)39 inline half shortAsHalf(short s)
40 {
41 	union
42 	{
43 		half h;
44 		short s;
45 	} hs;
46 
47 	hs.s = s;
48 
49 	return hs.h;
50 }
51 
52 class RGB9E5
53 {
54 	union
55 	{
56 		struct
57 		{
58 			unsigned int R : 9;
59 			unsigned int G : 9;
60 			unsigned int B : 9;
61 			unsigned int E : 5;
62 		};
63 		uint32_t packed;
64 	};
65 
66 public:
RGB9E5(const float rgb[3])67 	RGB9E5(const float rgb[3])
68 	    : RGB9E5(rgb[0], rgb[1], rgb[2])
69 	{
70 	}
71 
RGB9E5(float r,float g,float b)72 	RGB9E5(float r, float g, float b)
73 	{
74 		// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
75 
76 		// B is the exponent bias (15)
77 		constexpr int g_sharedexp_bias = 15;
78 
79 		// N is the number of mantissa bits per component (9)
80 		constexpr int g_sharedexp_mantissabits = 9;
81 
82 		// Emax is the maximum allowed biased exponent value (31)
83 		constexpr int g_sharedexp_maxexponent = 31;
84 
85 		constexpr float g_sharedexp_max =
86 		    ((static_cast<float>(1 << g_sharedexp_mantissabits) - 1) /
87 		     static_cast<float>(1 << g_sharedexp_mantissabits)) *
88 		    static_cast<float>(1 << (g_sharedexp_maxexponent - g_sharedexp_bias));
89 
90 		// Clamp components to valid range. NaN becomes 0.
91 		const float red_c = std::min(!(r > 0) ? 0 : r, g_sharedexp_max);
92 		const float green_c = std::min(!(g > 0) ? 0 : g, g_sharedexp_max);
93 		const float blue_c = std::min(!(b > 0) ? 0 : b, g_sharedexp_max);
94 
95 		// We're reducing the mantissa to 9 bits, so we must round up if the next
96 		// bit is 1. In other words add 0.5 to the new mantissa's position and
97 		// allow overflow into the exponent so we can scale correctly.
98 		constexpr int half = 1 << (23 - g_sharedexp_mantissabits);
99 		const float red_r = bit_cast<float>(bit_cast<int>(red_c) + half);
100 		const float green_r = bit_cast<float>(bit_cast<int>(green_c) + half);
101 		const float blue_r = bit_cast<float>(bit_cast<int>(blue_c) + half);
102 
103 		// The largest component determines the shared exponent. It can't be lower
104 		// than 0 (after bias subtraction) so also limit to the mimimum representable.
105 		constexpr float min_s = 0.5f / (1 << g_sharedexp_bias);
106 		float max_s = std::max(std::max(red_r, green_r), std::max(blue_r, min_s));
107 
108 		// Obtain the reciprocal of the shared exponent by inverting the bits,
109 		// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
110 		// format has an implicit leading 1, but this shared component format does not.
111 		float scale = bit_cast<float>((bit_cast<int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (g_sharedexp_mantissabits - 2));
112 
113 		R = static_cast<unsigned int>(round(red_c * scale));
114 		G = static_cast<unsigned int>(round(green_c * scale));
115 		B = static_cast<unsigned int>(round(blue_c * scale));
116 		E = (bit_cast<unsigned int>(max_s) >> 23) - 127 + 15 + 1;
117 	}
118 
operator unsigned int() const119 	operator unsigned int() const
120 	{
121 		return packed;
122 	}
123 
toRGB16F(half rgb[3]) const124 	void toRGB16F(half rgb[3]) const
125 	{
126 		constexpr int offset = 24;  // Exponent bias (15) + number of mantissa bits per component (9) = 24
127 
128 		const float factor = (1u << E) * (1.0f / (1 << offset));
129 		rgb[0] = half(R * factor);
130 		rgb[1] = half(G * factor);
131 		rgb[2] = half(B * factor);
132 	}
133 };
134 
135 class R11G11B10F
136 {
137 	union
138 	{
139 		struct
140 		{
141 			unsigned int R : 11;
142 			unsigned int G : 11;
143 			unsigned int B : 10;
144 		};
145 		uint32_t packed;
146 	};
147 
148 public:
R11G11B10F(const float rgb[3])149 	R11G11B10F(const float rgb[3])
150 	{
151 		R = float32ToFloat11(rgb[0]);
152 		G = float32ToFloat11(rgb[1]);
153 		B = float32ToFloat10(rgb[2]);
154 	}
155 
operator unsigned int() const156 	operator unsigned int() const
157 	{
158 		return packed;
159 	}
160 
toRGB16F(half rgb[3]) const161 	void toRGB16F(half rgb[3]) const
162 	{
163 		rgb[0] = float11ToFloat16(R);
164 		rgb[1] = float11ToFloat16(G);
165 		rgb[2] = float10ToFloat16(B);
166 	}
167 
float11ToFloat16(unsigned short fp11)168 	static inline half float11ToFloat16(unsigned short fp11)
169 	{
170 		return shortAsHalf(fp11 << 4);  // Sign bit 0
171 	}
172 
float10ToFloat16(unsigned short fp10)173 	static inline half float10ToFloat16(unsigned short fp10)
174 	{
175 		return shortAsHalf(fp10 << 5);  // Sign bit 0
176 	}
177 
float32ToFloat11(float fp32)178 	static inline unsigned short float32ToFloat11(float fp32)
179 	{
180 		const unsigned int float32MantissaMask = 0x7FFFFF;
181 		const unsigned int float32ExponentMask = 0x7F800000;
182 		const unsigned int float32SignMask = 0x80000000;
183 		const unsigned int float32ValueMask = ~float32SignMask;
184 		const unsigned int float32ExponentFirstBit = 23;
185 		const unsigned int float32ExponentBias = 127;
186 
187 		const unsigned short float11Max = 0x7BF;
188 		const unsigned short float11MantissaMask = 0x3F;
189 		const unsigned short float11ExponentMask = 0x7C0;
190 		const unsigned short float11BitMask = 0x7FF;
191 		const unsigned int float11ExponentBias = 14;
192 
193 		const unsigned int float32Maxfloat11 = 0x477E0000;
194 		const unsigned int float32MinNormfloat11 = 0x38800000;
195 		const unsigned int float32MinDenormfloat11 = 0x35000080;
196 
197 		const unsigned int float32Bits = bit_cast<unsigned int>(fp32);
198 		const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
199 
200 		unsigned int float32Val = float32Bits & float32ValueMask;
201 
202 		if((float32Val & float32ExponentMask) == float32ExponentMask)
203 		{
204 			// INF or NAN
205 			if((float32Val & float32MantissaMask) != 0)
206 			{
207 				return float11ExponentMask |
208 				       (((float32Val >> 17) | (float32Val >> 11) | (float32Val >> 6) | (float32Val)) &
209 				        float11MantissaMask);
210 			}
211 			else if(float32Sign)
212 			{
213 				// -INF is clamped to 0 since float11 is positive only
214 				return 0;
215 			}
216 			else
217 			{
218 				return float11ExponentMask;
219 			}
220 		}
221 		else if(float32Sign)
222 		{
223 			// float11 is positive only, so clamp to zero
224 			return 0;
225 		}
226 		else if(float32Val > float32Maxfloat11)
227 		{
228 			// The number is too large to be represented as a float11, set to max
229 			return float11Max;
230 		}
231 		else if(float32Val < float32MinDenormfloat11)
232 		{
233 			// The number is too small to be represented as a denormalized float11, set to 0
234 			return 0;
235 		}
236 		else
237 		{
238 			if(float32Val < float32MinNormfloat11)
239 			{
240 				// The number is too small to be represented as a normalized float11
241 				// Convert it to a denormalized value.
242 				const unsigned int shift = (float32ExponentBias - float11ExponentBias) -
243 				                           (float32Val >> float32ExponentFirstBit);
244 				float32Val =
245 				    ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
246 			}
247 			else
248 			{
249 				// Rebias the exponent to represent the value as a normalized float11
250 				float32Val += 0xC8000000;
251 			}
252 
253 			return ((float32Val + 0xFFFF + ((float32Val >> 17) & 1)) >> 17) & float11BitMask;
254 		}
255 	}
256 
float32ToFloat10(float fp32)257 	static inline unsigned short float32ToFloat10(float fp32)
258 	{
259 		const unsigned int float32MantissaMask = 0x7FFFFF;
260 		const unsigned int float32ExponentMask = 0x7F800000;
261 		const unsigned int float32SignMask = 0x80000000;
262 		const unsigned int float32ValueMask = ~float32SignMask;
263 		const unsigned int float32ExponentFirstBit = 23;
264 		const unsigned int float32ExponentBias = 127;
265 
266 		const unsigned short float10Max = 0x3DF;
267 		const unsigned short float10MantissaMask = 0x1F;
268 		const unsigned short float10ExponentMask = 0x3E0;
269 		const unsigned short float10BitMask = 0x3FF;
270 		const unsigned int float10ExponentBias = 14;
271 
272 		const unsigned int float32Maxfloat10 = 0x477C0000;
273 		const unsigned int float32MinNormfloat10 = 0x38800000;
274 		const unsigned int float32MinDenormfloat10 = 0x35800040;
275 
276 		const unsigned int float32Bits = bit_cast<unsigned int>(fp32);
277 		const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
278 
279 		unsigned int float32Val = float32Bits & float32ValueMask;
280 
281 		if((float32Val & float32ExponentMask) == float32ExponentMask)
282 		{
283 			// INF or NAN
284 			if((float32Val & float32MantissaMask) != 0)
285 			{
286 				return float10ExponentMask |
287 				       (((float32Val >> 18) | (float32Val >> 13) | (float32Val >> 3) | (float32Val)) &
288 				        float10MantissaMask);
289 			}
290 			else if(float32Sign)
291 			{
292 				// -INF is clamped to 0 since float10 is positive only
293 				return 0;
294 			}
295 			else
296 			{
297 				return float10ExponentMask;
298 			}
299 		}
300 		else if(float32Sign)
301 		{
302 			// float10 is positive only, so clamp to zero
303 			return 0;
304 		}
305 		else if(float32Val > float32Maxfloat10)
306 		{
307 			// The number is too large to be represented as a float10, set to max
308 			return float10Max;
309 		}
310 		else if(float32Val < float32MinDenormfloat10)
311 		{
312 			// The number is too small to be represented as a denormalized float10, set to 0
313 			return 0;
314 		}
315 		else
316 		{
317 			if(float32Val < float32MinNormfloat10)
318 			{
319 				// The number is too small to be represented as a normalized float10
320 				// Convert it to a denormalized value.
321 				const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
322 				                           (float32Val >> float32ExponentFirstBit);
323 				float32Val =
324 				    ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
325 			}
326 			else
327 			{
328 				// Rebias the exponent to represent the value as a normalized float10
329 				float32Val += 0xC8000000;
330 			}
331 
332 			return ((float32Val + 0x1FFFF + ((float32Val >> 18) & 1)) >> 18) & float10BitMask;
333 		}
334 	}
335 };
336 
337 }  // namespace sw
338 
339 #endif  // sw_Half_hpp
340