1 #ifndef _TCUFLOAT_HPP
2 #define _TCUFLOAT_HPP
3 /*-------------------------------------------------------------------------
4 * drawElements Quality Program Tester Core
5 * ----------------------------------------
6 *
7 * Copyright 2014 The Android Open Source Project
8 *
9 * Licensed under the Apache License, Version 2.0 (the "License");
10 * you may not use this file except in compliance with the License.
11 * You may obtain a copy of the License at
12 *
13 * http://www.apache.org/licenses/LICENSE-2.0
14 *
15 * Unless required by applicable law or agreed to in writing, software
16 * distributed under the License is distributed on an "AS IS" BASIS,
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 * See the License for the specific language governing permissions and
19 * limitations under the License.
20 *
21 *//*!
22 * \file
23 * \brief Reconfigurable floating-point value template.
24 *//*--------------------------------------------------------------------*/
25
26 #include "tcuDefs.hpp"
27
// <limits> for std::numeric_limits, <string.h> for memcpy().
29 #include <limits>
30 #include <string.h>
31
32 namespace tcu
33 {
34
//! Capability bits passed as the Flags template argument of Float; combine with bitwise OR.
enum FloatFlags
{
    FLOAT_HAS_SIGN       = 0x1, //!< Format has a sign bit (negative values are representable).
    FLOAT_SUPPORT_DENORM = 0x2  //!< Format represents denormalized values instead of flushing them to zero.
};
40
//! Rounding direction used when a value is not exactly representable in the destination format.
enum RoundingDirection
{
    ROUND_TO_EVEN  = 0, //!< Round to nearest, ties to even.
    ROUND_DOWNWARD = 1, //!< Towards -Inf.
    ROUND_UPWARD   = 2, //!< Towards +Inf.
    ROUND_TO_ZERO  = 3  //!< Towards zero (truncate).
};
48
49 /*--------------------------------------------------------------------*//*!
50 * \brief Floating-point format template
51 *
52 * This template implements arbitrary floating-point handling. Template
53 * can be used for conversion between different formats and checking
54 * various properties of floating-point values.
55 *//*--------------------------------------------------------------------*/
56 template <typename StorageType_, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
57 class Float
58 {
59 public:
60 typedef StorageType_ StorageType;
61
62 enum
63 {
64 EXPONENT_BITS = ExponentBits,
65 MANTISSA_BITS = MantissaBits,
66 EXPONENT_BIAS = ExponentBias,
67 FLAGS = Flags,
68 };
69
70 Float(void);
71 explicit Float(StorageType value);
72 explicit Float(float v, RoundingDirection rd = ROUND_TO_EVEN);
73 explicit Float(double v, RoundingDirection rd = ROUND_TO_EVEN);
74
75 template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias,
76 uint32_t OtherFlags>
77 static Float convert(
78 const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags> &src,
79 RoundingDirection rd = ROUND_TO_EVEN);
80
convert(const Float<StorageType,ExponentBits,MantissaBits,ExponentBias,Flags> & src,RoundingDirection=ROUND_TO_EVEN)81 static inline Float convert(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> &src,
82 RoundingDirection = ROUND_TO_EVEN)
83 {
84 return src;
85 }
86
87 /*--------------------------------------------------------------------*//*!
88 * \brief Construct floating point value
89 * \param sign Sign. Must be +1/-1
90 * \param exponent Exponent in range [1-ExponentBias, ExponentBias+1]
91 * \param mantissa Mantissa bits with implicit leading bit explicitly set
92 * \return The specified float
93 *
94 * This function constructs a floating point value from its inputs.
95 * The normally implicit leading bit of the mantissa must be explicitly set.
96 * The exponent normally used for zero/subnormals is an invalid input. Such
97 * values are specified with the leading mantissa bit of zero and the lowest
98 * normal exponent (1-ExponentBias). Additionally having both exponent and
99 * mantissa set to zero is a shorthand notation for the correctly signed
100 * floating point zero. Inf and NaN must be specified directly with an
101 * exponent of ExponentBias+1 and the appropriate mantissa (with leading
102 * bit set)
103 *//*--------------------------------------------------------------------*/
104 static inline Float construct(int sign, int exponent, StorageType mantissa);
105
106 /*--------------------------------------------------------------------*//*!
107 * \brief Construct floating point value. Explicit version
108 * \param sign Sign. Must be +1/-1
109 * \param exponent Exponent in range [-ExponentBias, ExponentBias+1]
110 * \param mantissa Mantissa bits
111 * \return The specified float
112 *
113 * This function constructs a floating point value from its inputs with
114 * minimal intervention.
115 * The sign is turned into a sign bit and the exponent bias is added.
116 * See IEEE-754 for additional information on the inputs and
117 * the encoding of special values.
118 *//*--------------------------------------------------------------------*/
119 static Float constructBits(int sign, int exponent, StorageType mantissaBits);
120
bits(void) const121 StorageType bits(void) const
122 {
123 return m_value;
124 }
125 float asFloat(void) const;
126 double asDouble(void) const;
127
signBit(void) const128 inline int signBit(void) const
129 {
130 return (int)(m_value >> (ExponentBits + MantissaBits)) & 1;
131 }
exponentBits(void) const132 inline StorageType exponentBits(void) const
133 {
134 return (m_value >> MantissaBits) & ((StorageType(1) << ExponentBits) - 1);
135 }
mantissaBits(void) const136 inline StorageType mantissaBits(void) const
137 {
138 return m_value & ((StorageType(1) << MantissaBits) - 1);
139 }
140
sign(void) const141 inline int sign(void) const
142 {
143 return signBit() ? -1 : 1;
144 }
exponent(void) const145 inline int exponent(void) const
146 {
147 return isDenorm() ? 1 - ExponentBias : (int)exponentBits() - ExponentBias;
148 }
mantissa(void) const149 inline StorageType mantissa(void) const
150 {
151 return isZero() || isDenorm() ? mantissaBits() : (mantissaBits() | (StorageType(1) << MantissaBits));
152 }
153
isInf(void) const154 inline bool isInf(void) const
155 {
156 return exponentBits() == ((1 << ExponentBits) - 1) && mantissaBits() == 0;
157 }
isNaN(void) const158 inline bool isNaN(void) const
159 {
160 return exponentBits() == ((1 << ExponentBits) - 1) && mantissaBits() != 0;
161 }
isZero(void) const162 inline bool isZero(void) const
163 {
164 return exponentBits() == 0 && mantissaBits() == 0;
165 }
isDenorm(void) const166 inline bool isDenorm(void) const
167 {
168 return exponentBits() == 0 && mantissaBits() != 0;
169 }
170
operator <(const Float<StorageType,ExponentBits,MantissaBits,ExponentBias,Flags> & other) const171 inline bool operator<(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> &other) const
172 {
173 return this->asDouble() < other.asDouble();
174 }
175
176 static Float zero(int sign);
177 static Float inf(int sign);
178 static Float nan(void);
179
180 static Float largestNormal(int sign);
181 static Float smallestNormal(int sign);
182
183 private:
184 StorageType m_value;
185 } DE_WARN_UNUSED_TYPE;
186
187 // Common floating-point types.
188 typedef Float<uint16_t, 5, 10, 15, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
189 Float16; //!< IEEE 754-2008 16-bit floating-point value
190 typedef Float<uint32_t, 8, 23, 127, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
191 Float32; //!< IEEE 754 32-bit floating-point value
192 typedef Float<uint64_t, 11, 52, 1023, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
193 Float64; //!< IEEE 754 64-bit floating-point value
194
195 typedef Float<uint16_t, 5, 10, 15, FLOAT_HAS_SIGN>
196 Float16Denormless; //!< IEEE 754-2008 16-bit floating-point value without denormalized support
197
198 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(void)199 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(void) : m_value(0)
200 {
201 }
202
203 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(StorageType value)204 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(StorageType value) : m_value(value)
205 {
206 }
207
208 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(float value,RoundingDirection rd)209 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(float value, RoundingDirection rd)
210 : m_value(0)
211 {
212 uint32_t u32;
213 memcpy(&u32, &value, sizeof(uint32_t));
214 *this = convert(Float32(u32), rd);
215 }
216
217 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(double value,RoundingDirection rd)218 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(double value, RoundingDirection rd)
219 : m_value(0)
220 {
221 uint64_t u64;
222 memcpy(&u64, &value, sizeof(uint64_t));
223 *this = convert(Float64(u64), rd);
224 }
225
226 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
asFloat(void) const227 inline float Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asFloat(void) const
228 {
229 float v;
230 uint32_t u32 = Float32::convert(*this).bits();
231 memcpy(&v, &u32, sizeof(uint32_t));
232 return v;
233 }
234
235 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
asDouble(void) const236 inline double Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asDouble(void) const
237 {
238 double v;
239 uint64_t u64 = Float64::convert(*this).bits();
240 memcpy(&v, &u64, sizeof(uint64_t));
241 return v;
242 }
243
244 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
245 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
zero(int sign)246 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::zero(int sign)
247 {
248 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
249 return Float(StorageType((sign > 0 ? 0ull : 1ull) << (ExponentBits + MantissaBits)));
250 }
251
252 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
253 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
inf(int sign)254 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::inf(int sign)
255 {
256 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
257 return Float(StorageType(((sign > 0 ? 0ull : 1ull) << (ExponentBits + MantissaBits)) |
258 (((1ull << ExponentBits) - 1) << MantissaBits)));
259 }
260
261 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
262 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
nan(void)263 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::nan(void)
264 {
265 return Float(StorageType((1ull << (ExponentBits + MantissaBits)) - 1));
266 }
267
268 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
269 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
largestNormal(int sign)270 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::largestNormal(int sign)
271 {
272 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
273 return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(
274 sign, ExponentBias, (static_cast<StorageType>(1) << (MantissaBits + 1)) - 1);
275 }
276
277 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
278 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
smallestNormal(int sign)279 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::smallestNormal(int sign)
280 {
281 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
282 return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(
283 sign, 1 - ExponentBias, (static_cast<StorageType>(1) << MantissaBits));
284 }
285
286 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
287 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
construct(int sign,int exponent,StorageType mantissa)288 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(int sign, int exponent,
289 StorageType mantissa)
290 {
291 // Repurpose this otherwise invalid input as a shorthand notation for zero (no need for caller to care about internal representation)
292 const bool isShorthandZero = exponent == 0 && mantissa == 0;
293
294 // Handles the typical notation for zero (min exponent, mantissa 0). Note that the exponent usually used exponent (-ExponentBias) for zero/subnormals is not used.
295 // Instead zero/subnormals have the (normally implicit) leading mantissa bit set to zero.
296 const bool isDenormOrZero = (exponent == 1 - ExponentBias) && (mantissa >> MantissaBits == 0);
297 const StorageType s = StorageType((StorageType(sign < 0 ? 1 : 0)) << (StorageType(ExponentBits + MantissaBits)));
298 const StorageType exp = (isShorthandZero || isDenormOrZero) ? StorageType(0) : StorageType(exponent + ExponentBias);
299
300 DE_ASSERT(sign == +1 || sign == -1);
301 DE_ASSERT(isShorthandZero || isDenormOrZero || mantissa >> MantissaBits == 1);
302 DE_ASSERT(exp >> ExponentBits == 0);
303
304 return Float(StorageType(s | (exp << MantissaBits) | (mantissa & ((StorageType(1) << MantissaBits) - 1))));
305 }
306
307 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
308 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
constructBits(int sign,int exponent,StorageType mantissaBits)309 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::constructBits(int sign, int exponent,
310 StorageType mantissaBits)
311 {
312 const StorageType signBit = static_cast<StorageType>(sign < 0 ? 1 : 0);
313 const StorageType exponentBits = static_cast<StorageType>(exponent + ExponentBias);
314
315 DE_ASSERT(sign == +1 || sign == -1);
316 DE_ASSERT(exponentBits >> ExponentBits == 0);
317 DE_ASSERT(mantissaBits >> MantissaBits == 0);
318
319 return Float(
320 StorageType((signBit << (ExponentBits + MantissaBits)) | (exponentBits << MantissaBits) | (mantissaBits)));
321 }
322
323 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
324 template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias,
325 uint32_t OtherFlags>
326 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits,
327 ExponentBias, Flags>::
convert(const Float<OtherStorageType,OtherExponentBits,OtherMantissaBits,OtherExponentBias,OtherFlags> & other,RoundingDirection rd)328 convert(const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags> &other,
329 RoundingDirection rd)
330 {
331 int sign = other.sign();
332
333 if (!(Flags & FLOAT_HAS_SIGN) && sign < 0)
334 {
335 // Negative number, truncate to zero.
336 return zero(+1);
337 }
338
339 if (other.isInf())
340 {
341 return inf(sign);
342 }
343
344 if (other.isNaN())
345 {
346 return nan();
347 }
348
349 if (other.isZero())
350 {
351 return zero(sign);
352 }
353
354 const int eMin = 1 - ExponentBias;
355 const int eMax = ((1 << ExponentBits) - 2) - ExponentBias;
356
357 const StorageType s = StorageType((StorageType(other.signBit()))
358 << (StorageType(ExponentBits + MantissaBits))); // \note Not sign, but sign bit.
359 int e = other.exponent();
360 uint64_t m = other.mantissa();
361
362 // Normalize denormalized values prior to conversion.
363 while (!(m & (1ull << OtherMantissaBits)))
364 {
365 m <<= 1;
366 e -= 1;
367 }
368
369 if (e < eMin)
370 {
371 // Underflow.
372 if ((Flags & FLOAT_SUPPORT_DENORM) && (eMin - e - 1 <= MantissaBits))
373 {
374 // Shift and round.
375 int bitDiff = (OtherMantissaBits - MantissaBits) + (eMin - e);
376 uint64_t lastBitsMask = (1ull << bitDiff) - 1ull;
377 uint64_t lastBits = (static_cast<uint64_t>(m) & lastBitsMask);
378 uint64_t half = (1ull << (bitDiff - 1)) - 1;
379 uint64_t bias = (m >> bitDiff) & 1;
380
381 switch (rd)
382 {
383 case ROUND_TO_EVEN:
384 return Float(StorageType(s | (m + half + bias) >> bitDiff));
385
386 case ROUND_DOWNWARD:
387 m = (m >> bitDiff);
388 if (lastBits != 0ull && sign < 0)
389 {
390 m += 1;
391 }
392 return Float(StorageType(s | m));
393
394 case ROUND_UPWARD:
395 m = (m >> bitDiff);
396 if (lastBits != 0ull && sign > 0)
397 {
398 m += 1;
399 }
400 return Float(StorageType(s | m));
401
402 case ROUND_TO_ZERO:
403 return Float(StorageType(s | (m >> bitDiff)));
404
405 default:
406 DE_ASSERT(false);
407 break;
408 }
409 }
410
411 return zero(sign);
412 }
413
414 // Remove leading 1.
415 m = m & ~(1ull << OtherMantissaBits);
416
417 if (MantissaBits < OtherMantissaBits)
418 {
419 // Round mantissa.
420 int bitDiff = OtherMantissaBits - MantissaBits;
421 uint64_t lastBitsMask = (1ull << bitDiff) - 1ull;
422 uint64_t lastBits = (static_cast<uint64_t>(m) & lastBitsMask);
423 uint64_t half = (1ull << (bitDiff - 1)) - 1;
424 uint64_t bias = (m >> bitDiff) & 1;
425
426 switch (rd)
427 {
428 case ROUND_TO_EVEN:
429 m = (m + half + bias) >> bitDiff;
430 break;
431
432 case ROUND_DOWNWARD:
433 m = (m >> bitDiff);
434 if (lastBits != 0ull && sign < 0)
435 {
436 m += 1;
437 }
438 break;
439
440 case ROUND_UPWARD:
441 m = (m >> bitDiff);
442 if (lastBits != 0ull && sign > 0)
443 {
444 m += 1;
445 }
446 break;
447
448 case ROUND_TO_ZERO:
449 m = (m >> bitDiff);
450 break;
451
452 default:
453 DE_ASSERT(false);
454 break;
455 }
456
457 if (m & (1ull << MantissaBits))
458 {
459 // Overflow in mantissa.
460 m = 0;
461 e += 1;
462 }
463 }
464 else
465 {
466 int bitDiff = MantissaBits - OtherMantissaBits;
467 m = m << bitDiff;
468 }
469
470 if (e > eMax)
471 {
472 // Overflow.
473 return (((sign < 0 && rd == ROUND_UPWARD) || (sign > 0 && rd == ROUND_DOWNWARD)) ? largestNormal(sign) :
474 inf(sign));
475 }
476
477 DE_ASSERT(de::inRange(e, eMin, eMax));
478 DE_ASSERT(((e + ExponentBias) & ~((1ull << ExponentBits) - 1)) == 0);
479 DE_ASSERT((m & ~((1ull << MantissaBits) - 1)) == 0);
480
481 return Float(StorageType(s | (StorageType(e + ExponentBias) << MantissaBits) | m));
482 }
483
484 typedef typename Float16::StorageType float16_t;
485 template <class F>
486 inline constexpr F floatQuietNaN = std::numeric_limits<F>::quiet_NaN();
487 template <>
488 inline constexpr float16_t floatQuietNaN<float16_t> = 0x7e01;
489 template <class F>
490 inline constexpr F floatSignalingNaN = std::numeric_limits<F>::signaling_NaN();
491 template <>
492 inline constexpr float16_t floatSignalingNaN<float16_t> = 0x7c01;
493
494 } // namespace tcu
495
496 #endif // _TCUFLOAT_HPP
497