1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef sw_Half_hpp
16 #define sw_Half_hpp
17
#include "Math.hpp"

#include <algorithm>
#include <cmath>
#include <cstring>
22
23 namespace sw {
24
// Floating-point value kept as a raw bit pattern in an unsigned short
// (presumably IEEE 754 binary16, going by the names - confirm against the
// out-of-line definitions). The conversions to and from float are only
// declared here; their definitions are not part of this header.
class half
{
	// Raw storage for the encoded value. Left uninitialized by the defaulted
	// default constructor.
	unsigned short fp16i;

public:
	half() = default;

	// Construction from float must be spelled out explicitly.
	explicit half(float f);

	// Assignment from float.
	half &operator=(float f);

	// Implicit conversion back to float.
	operator float() const;
};
38
shortAsHalf(short s)39 inline half shortAsHalf(short s)
40 {
41 union
42 {
43 half h;
44 short s;
45 } hs;
46
47 hs.s = s;
48
49 return hs.h;
50 }
51
52 class RGB9E5
53 {
54 union
55 {
56 struct
57 {
58 unsigned int R : 9;
59 unsigned int G : 9;
60 unsigned int B : 9;
61 unsigned int E : 5;
62 };
63 uint32_t packed;
64 };
65
66 public:
RGB9E5(const float rgb[3])67 RGB9E5(const float rgb[3])
68 : RGB9E5(rgb[0], rgb[1], rgb[2])
69 {
70 }
71
RGB9E5(float r,float g,float b)72 RGB9E5(float r, float g, float b)
73 {
74 // Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
75
76 // B is the exponent bias (15)
77 constexpr int g_sharedexp_bias = 15;
78
79 // N is the number of mantissa bits per component (9)
80 constexpr int g_sharedexp_mantissabits = 9;
81
82 // Emax is the maximum allowed biased exponent value (31)
83 constexpr int g_sharedexp_maxexponent = 31;
84
85 constexpr float g_sharedexp_max =
86 ((static_cast<float>(1 << g_sharedexp_mantissabits) - 1) /
87 static_cast<float>(1 << g_sharedexp_mantissabits)) *
88 static_cast<float>(1 << (g_sharedexp_maxexponent - g_sharedexp_bias));
89
90 // Clamp components to valid range. NaN becomes 0.
91 const float red_c = std::min(!(r > 0) ? 0 : r, g_sharedexp_max);
92 const float green_c = std::min(!(g > 0) ? 0 : g, g_sharedexp_max);
93 const float blue_c = std::min(!(b > 0) ? 0 : b, g_sharedexp_max);
94
95 // We're reducing the mantissa to 9 bits, so we must round up if the next
96 // bit is 1. In other words add 0.5 to the new mantissa's position and
97 // allow overflow into the exponent so we can scale correctly.
98 constexpr int half = 1 << (23 - g_sharedexp_mantissabits);
99 const float red_r = bit_cast<float>(bit_cast<int>(red_c) + half);
100 const float green_r = bit_cast<float>(bit_cast<int>(green_c) + half);
101 const float blue_r = bit_cast<float>(bit_cast<int>(blue_c) + half);
102
103 // The largest component determines the shared exponent. It can't be lower
104 // than 0 (after bias subtraction) so also limit to the mimimum representable.
105 constexpr float min_s = 0.5f / (1 << g_sharedexp_bias);
106 float max_s = std::max(std::max(red_r, green_r), std::max(blue_r, min_s));
107
108 // Obtain the reciprocal of the shared exponent by inverting the bits,
109 // and scale by the new mantissa's size. Note that the IEEE-754 single-precision
110 // format has an implicit leading 1, but this shared component format does not.
111 float scale = bit_cast<float>((bit_cast<int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (g_sharedexp_mantissabits - 2));
112
113 R = static_cast<unsigned int>(round(red_c * scale));
114 G = static_cast<unsigned int>(round(green_c * scale));
115 B = static_cast<unsigned int>(round(blue_c * scale));
116 E = (bit_cast<unsigned int>(max_s) >> 23) - 127 + 15 + 1;
117 }
118
operator unsigned int() const119 operator unsigned int() const
120 {
121 return packed;
122 }
123
toRGB16F(half rgb[3]) const124 void toRGB16F(half rgb[3]) const
125 {
126 constexpr int offset = 24; // Exponent bias (15) + number of mantissa bits per component (9) = 24
127
128 const float factor = (1u << E) * (1.0f / (1 << offset));
129 rgb[0] = half(R * factor);
130 rgb[1] = half(G * factor);
131 rgb[2] = half(B * factor);
132 }
133 };
134
135 class R11G11B10F
136 {
137 union
138 {
139 struct
140 {
141 unsigned int R : 11;
142 unsigned int G : 11;
143 unsigned int B : 10;
144 };
145 uint32_t packed;
146 };
147
148 public:
R11G11B10F(const float rgb[3])149 R11G11B10F(const float rgb[3])
150 {
151 R = float32ToFloat11(rgb[0]);
152 G = float32ToFloat11(rgb[1]);
153 B = float32ToFloat10(rgb[2]);
154 }
155
operator unsigned int() const156 operator unsigned int() const
157 {
158 return packed;
159 }
160
toRGB16F(half rgb[3]) const161 void toRGB16F(half rgb[3]) const
162 {
163 rgb[0] = float11ToFloat16(R);
164 rgb[1] = float11ToFloat16(G);
165 rgb[2] = float10ToFloat16(B);
166 }
167
float11ToFloat16(unsigned short fp11)168 static inline half float11ToFloat16(unsigned short fp11)
169 {
170 return shortAsHalf(fp11 << 4); // Sign bit 0
171 }
172
float10ToFloat16(unsigned short fp10)173 static inline half float10ToFloat16(unsigned short fp10)
174 {
175 return shortAsHalf(fp10 << 5); // Sign bit 0
176 }
177
float32ToFloat11(float fp32)178 static inline unsigned short float32ToFloat11(float fp32)
179 {
180 const unsigned int float32MantissaMask = 0x7FFFFF;
181 const unsigned int float32ExponentMask = 0x7F800000;
182 const unsigned int float32SignMask = 0x80000000;
183 const unsigned int float32ValueMask = ~float32SignMask;
184 const unsigned int float32ExponentFirstBit = 23;
185 const unsigned int float32ExponentBias = 127;
186
187 const unsigned short float11Max = 0x7BF;
188 const unsigned short float11MantissaMask = 0x3F;
189 const unsigned short float11ExponentMask = 0x7C0;
190 const unsigned short float11BitMask = 0x7FF;
191 const unsigned int float11ExponentBias = 14;
192
193 const unsigned int float32Maxfloat11 = 0x477E0000;
194 const unsigned int float32MinNormfloat11 = 0x38800000;
195 const unsigned int float32MinDenormfloat11 = 0x35000080;
196
197 const unsigned int float32Bits = bit_cast<unsigned int>(fp32);
198 const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
199
200 unsigned int float32Val = float32Bits & float32ValueMask;
201
202 if((float32Val & float32ExponentMask) == float32ExponentMask)
203 {
204 // INF or NAN
205 if((float32Val & float32MantissaMask) != 0)
206 {
207 return float11ExponentMask |
208 (((float32Val >> 17) | (float32Val >> 11) | (float32Val >> 6) | (float32Val)) &
209 float11MantissaMask);
210 }
211 else if(float32Sign)
212 {
213 // -INF is clamped to 0 since float11 is positive only
214 return 0;
215 }
216 else
217 {
218 return float11ExponentMask;
219 }
220 }
221 else if(float32Sign)
222 {
223 // float11 is positive only, so clamp to zero
224 return 0;
225 }
226 else if(float32Val > float32Maxfloat11)
227 {
228 // The number is too large to be represented as a float11, set to max
229 return float11Max;
230 }
231 else if(float32Val < float32MinDenormfloat11)
232 {
233 // The number is too small to be represented as a denormalized float11, set to 0
234 return 0;
235 }
236 else
237 {
238 if(float32Val < float32MinNormfloat11)
239 {
240 // The number is too small to be represented as a normalized float11
241 // Convert it to a denormalized value.
242 const unsigned int shift = (float32ExponentBias - float11ExponentBias) -
243 (float32Val >> float32ExponentFirstBit);
244 float32Val =
245 ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
246 }
247 else
248 {
249 // Rebias the exponent to represent the value as a normalized float11
250 float32Val += 0xC8000000;
251 }
252
253 return ((float32Val + 0xFFFF + ((float32Val >> 17) & 1)) >> 17) & float11BitMask;
254 }
255 }
256
float32ToFloat10(float fp32)257 static inline unsigned short float32ToFloat10(float fp32)
258 {
259 const unsigned int float32MantissaMask = 0x7FFFFF;
260 const unsigned int float32ExponentMask = 0x7F800000;
261 const unsigned int float32SignMask = 0x80000000;
262 const unsigned int float32ValueMask = ~float32SignMask;
263 const unsigned int float32ExponentFirstBit = 23;
264 const unsigned int float32ExponentBias = 127;
265
266 const unsigned short float10Max = 0x3DF;
267 const unsigned short float10MantissaMask = 0x1F;
268 const unsigned short float10ExponentMask = 0x3E0;
269 const unsigned short float10BitMask = 0x3FF;
270 const unsigned int float10ExponentBias = 14;
271
272 const unsigned int float32Maxfloat10 = 0x477C0000;
273 const unsigned int float32MinNormfloat10 = 0x38800000;
274 const unsigned int float32MinDenormfloat10 = 0x35800040;
275
276 const unsigned int float32Bits = bit_cast<unsigned int>(fp32);
277 const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;
278
279 unsigned int float32Val = float32Bits & float32ValueMask;
280
281 if((float32Val & float32ExponentMask) == float32ExponentMask)
282 {
283 // INF or NAN
284 if((float32Val & float32MantissaMask) != 0)
285 {
286 return float10ExponentMask |
287 (((float32Val >> 18) | (float32Val >> 13) | (float32Val >> 3) | (float32Val)) &
288 float10MantissaMask);
289 }
290 else if(float32Sign)
291 {
292 // -INF is clamped to 0 since float10 is positive only
293 return 0;
294 }
295 else
296 {
297 return float10ExponentMask;
298 }
299 }
300 else if(float32Sign)
301 {
302 // float10 is positive only, so clamp to zero
303 return 0;
304 }
305 else if(float32Val > float32Maxfloat10)
306 {
307 // The number is too large to be represented as a float10, set to max
308 return float10Max;
309 }
310 else if(float32Val < float32MinDenormfloat10)
311 {
312 // The number is too small to be represented as a denormalized float10, set to 0
313 return 0;
314 }
315 else
316 {
317 if(float32Val < float32MinNormfloat10)
318 {
319 // The number is too small to be represented as a normalized float10
320 // Convert it to a denormalized value.
321 const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
322 (float32Val >> float32ExponentFirstBit);
323 float32Val =
324 ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift;
325 }
326 else
327 {
328 // Rebias the exponent to represent the value as a normalized float10
329 float32Val += 0xC8000000;
330 }
331
332 return ((float32Val + 0x1FFFF + ((float32Val >> 18) & 1)) >> 18) & float10BitMask;
333 }
334 }
335 };
336
337 } // namespace sw
338
339 #endif // sw_Half_hpp
340