xref: /aosp_15_r20/external/deqp/framework/common/tcuFloat.hpp (revision 35238bce31c2a825756842865a792f8cf7f89930)
1 #ifndef _TCUFLOAT_HPP
2 #define _TCUFLOAT_HPP
3 /*-------------------------------------------------------------------------
4  * drawElements Quality Program Tester Core
5  * ----------------------------------------
6  *
7  * Copyright 2014 The Android Open Source Project
8  *
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  *
13  *      http://www.apache.org/licenses/LICENSE-2.0
14  *
15  * Unless required by applicable law or agreed to in writing, software
16  * distributed under the License is distributed on an "AS IS" BASIS,
17  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18  * See the License for the specific language governing permissions and
19  * limitations under the License.
20  *
21  *//*!
22  * \file
23  * \brief Reconfigurable floating-point value template.
24  *//*--------------------------------------------------------------------*/
25 
26 #include "tcuDefs.hpp"
27 
// <limits> for std::numeric_limits, <string.h> for memcpy().
29 #include <limits>
30 #include <string.h>
31 
32 namespace tcu
33 {
34 
//! Capability bits combined into the Flags template argument of Float.
enum FloatFlags
{
    FLOAT_HAS_SIGN       = 0x1, //!< Format carries a sign bit.
    FLOAT_SUPPORT_DENORM = 0x2  //!< Format can hold denormalized values.
};
40 
//! Rounding direction used when a conversion has to drop mantissa bits.
enum RoundingDirection
{
    ROUND_TO_EVEN  = 0, //!< Nearest value, ties go to the even mantissa.
    ROUND_DOWNWARD = 1, //!< Towards -Inf.
    ROUND_UPWARD   = 2, //!< Towards +Inf.
    ROUND_TO_ZERO  = 3  //!< Drop the extra bits.
};
48 
49 /*--------------------------------------------------------------------*//*!
50  * \brief Floating-point format template
51  *
52  * This template implements arbitrary floating-point handling. Template
53  * can be used for conversion between different formats and checking
54  * various properties of floating-point values.
55  *//*--------------------------------------------------------------------*/
56 template <typename StorageType_, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
57 class Float
58 {
59 public:
60     typedef StorageType_ StorageType;
61 
62     enum
63     {
64         EXPONENT_BITS = ExponentBits,
65         MANTISSA_BITS = MantissaBits,
66         EXPONENT_BIAS = ExponentBias,
67         FLAGS         = Flags,
68     };
69 
70     Float(void);
71     explicit Float(StorageType value);
72     explicit Float(float v, RoundingDirection rd = ROUND_TO_EVEN);
73     explicit Float(double v, RoundingDirection rd = ROUND_TO_EVEN);
74 
75     template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias,
76               uint32_t OtherFlags>
77     static Float convert(
78         const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags> &src,
79         RoundingDirection rd = ROUND_TO_EVEN);
80 
convert(const Float<StorageType,ExponentBits,MantissaBits,ExponentBias,Flags> & src,RoundingDirection=ROUND_TO_EVEN)81     static inline Float convert(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> &src,
82                                 RoundingDirection = ROUND_TO_EVEN)
83     {
84         return src;
85     }
86 
87     /*--------------------------------------------------------------------*//*!
88      * \brief Construct floating point value
89      * \param sign        Sign. Must be +1/-1
90      * \param exponent    Exponent in range [1-ExponentBias, ExponentBias+1]
91      * \param mantissa    Mantissa bits with implicit leading bit explicitly set
92      * \return The specified float
93      *
94      * This function constructs a floating point value from its inputs.
95      * The normally implicit leading bit of the mantissa must be explicitly set.
96      * The exponent normally used for zero/subnormals is an invalid input. Such
97      * values are specified with the leading mantissa bit of zero and the lowest
98      * normal exponent (1-ExponentBias). Additionally having both exponent and
99      * mantissa set to zero is a shorthand notation for the correctly signed
100      * floating point zero. Inf and NaN must be specified directly with an
101      * exponent of ExponentBias+1 and the appropriate mantissa (with leading
102      * bit set)
103      *//*--------------------------------------------------------------------*/
104     static inline Float construct(int sign, int exponent, StorageType mantissa);
105 
106     /*--------------------------------------------------------------------*//*!
107      * \brief Construct floating point value. Explicit version
108      * \param sign        Sign. Must be +1/-1
109      * \param exponent    Exponent in range [-ExponentBias, ExponentBias+1]
110      * \param mantissa    Mantissa bits
111      * \return The specified float
112      *
113      * This function constructs a floating point value from its inputs with
114      * minimal intervention.
115      * The sign is turned into a sign bit and the exponent bias is added.
116      * See IEEE-754 for additional information on the inputs and
117      * the encoding of special values.
118      *//*--------------------------------------------------------------------*/
119     static Float constructBits(int sign, int exponent, StorageType mantissaBits);
120 
bits(void) const121     StorageType bits(void) const
122     {
123         return m_value;
124     }
125     float asFloat(void) const;
126     double asDouble(void) const;
127 
signBit(void) const128     inline int signBit(void) const
129     {
130         return (int)(m_value >> (ExponentBits + MantissaBits)) & 1;
131     }
exponentBits(void) const132     inline StorageType exponentBits(void) const
133     {
134         return (m_value >> MantissaBits) & ((StorageType(1) << ExponentBits) - 1);
135     }
mantissaBits(void) const136     inline StorageType mantissaBits(void) const
137     {
138         return m_value & ((StorageType(1) << MantissaBits) - 1);
139     }
140 
sign(void) const141     inline int sign(void) const
142     {
143         return signBit() ? -1 : 1;
144     }
exponent(void) const145     inline int exponent(void) const
146     {
147         return isDenorm() ? 1 - ExponentBias : (int)exponentBits() - ExponentBias;
148     }
mantissa(void) const149     inline StorageType mantissa(void) const
150     {
151         return isZero() || isDenorm() ? mantissaBits() : (mantissaBits() | (StorageType(1) << MantissaBits));
152     }
153 
isInf(void) const154     inline bool isInf(void) const
155     {
156         return exponentBits() == ((1 << ExponentBits) - 1) && mantissaBits() == 0;
157     }
isNaN(void) const158     inline bool isNaN(void) const
159     {
160         return exponentBits() == ((1 << ExponentBits) - 1) && mantissaBits() != 0;
161     }
isZero(void) const162     inline bool isZero(void) const
163     {
164         return exponentBits() == 0 && mantissaBits() == 0;
165     }
isDenorm(void) const166     inline bool isDenorm(void) const
167     {
168         return exponentBits() == 0 && mantissaBits() != 0;
169     }
170 
operator <(const Float<StorageType,ExponentBits,MantissaBits,ExponentBias,Flags> & other) const171     inline bool operator<(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> &other) const
172     {
173         return this->asDouble() < other.asDouble();
174     }
175 
176     static Float zero(int sign);
177     static Float inf(int sign);
178     static Float nan(void);
179 
180     static Float largestNormal(int sign);
181     static Float smallestNormal(int sign);
182 
183 private:
184     StorageType m_value;
185 } DE_WARN_UNUSED_TYPE;
186 
187 // Common floating-point types.
188 typedef Float<uint16_t, 5, 10, 15, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
189     Float16; //!< IEEE 754-2008 16-bit floating-point value
190 typedef Float<uint32_t, 8, 23, 127, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
191     Float32; //!< IEEE 754 32-bit floating-point value
192 typedef Float<uint64_t, 11, 52, 1023, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
193     Float64; //!< IEEE 754 64-bit floating-point value
194 
195 typedef Float<uint16_t, 5, 10, 15, FLOAT_HAS_SIGN>
196     Float16Denormless; //!< IEEE 754-2008 16-bit floating-point value without denormalized support
197 
198 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(void)199 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(void) : m_value(0)
200 {
201 }
202 
203 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(StorageType value)204 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(StorageType value) : m_value(value)
205 {
206 }
207 
208 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(float value,RoundingDirection rd)209 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(float value, RoundingDirection rd)
210     : m_value(0)
211 {
212     uint32_t u32;
213     memcpy(&u32, &value, sizeof(uint32_t));
214     *this = convert(Float32(u32), rd);
215 }
216 
217 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(double value,RoundingDirection rd)218 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(double value, RoundingDirection rd)
219     : m_value(0)
220 {
221     uint64_t u64;
222     memcpy(&u64, &value, sizeof(uint64_t));
223     *this = convert(Float64(u64), rd);
224 }
225 
226 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
asFloat(void) const227 inline float Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asFloat(void) const
228 {
229     float v;
230     uint32_t u32 = Float32::convert(*this).bits();
231     memcpy(&v, &u32, sizeof(uint32_t));
232     return v;
233 }
234 
235 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
asDouble(void) const236 inline double Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asDouble(void) const
237 {
238     double v;
239     uint64_t u64 = Float64::convert(*this).bits();
240     memcpy(&v, &u64, sizeof(uint64_t));
241     return v;
242 }
243 
244 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
245 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
zero(int sign)246     StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::zero(int sign)
247 {
248     DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
249     return Float(StorageType((sign > 0 ? 0ull : 1ull) << (ExponentBits + MantissaBits)));
250 }
251 
252 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
253 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
inf(int sign)254     StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::inf(int sign)
255 {
256     DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
257     return Float(StorageType(((sign > 0 ? 0ull : 1ull) << (ExponentBits + MantissaBits)) |
258                              (((1ull << ExponentBits) - 1) << MantissaBits)));
259 }
260 
261 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
262 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
nan(void)263     StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::nan(void)
264 {
265     return Float(StorageType((1ull << (ExponentBits + MantissaBits)) - 1));
266 }
267 
268 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
269 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
largestNormal(int sign)270     StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::largestNormal(int sign)
271 {
272     DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
273     return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(
274         sign, ExponentBias, (static_cast<StorageType>(1) << (MantissaBits + 1)) - 1);
275 }
276 
277 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
278 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
smallestNormal(int sign)279     StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::smallestNormal(int sign)
280 {
281     DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
282     return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(
283         sign, 1 - ExponentBias, (static_cast<StorageType>(1) << MantissaBits));
284 }
285 
286 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
287 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
construct(int sign,int exponent,StorageType mantissa)288     StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(int sign, int exponent,
289                                                                              StorageType mantissa)
290 {
291     // Repurpose this otherwise invalid input as a shorthand notation for zero (no need for caller to care about internal representation)
292     const bool isShorthandZero = exponent == 0 && mantissa == 0;
293 
294     // Handles the typical notation for zero (min exponent, mantissa 0). Note that the exponent usually used exponent (-ExponentBias) for zero/subnormals is not used.
295     // Instead zero/subnormals have the (normally implicit) leading mantissa bit set to zero.
296     const bool isDenormOrZero = (exponent == 1 - ExponentBias) && (mantissa >> MantissaBits == 0);
297     const StorageType s   = StorageType((StorageType(sign < 0 ? 1 : 0)) << (StorageType(ExponentBits + MantissaBits)));
298     const StorageType exp = (isShorthandZero || isDenormOrZero) ? StorageType(0) : StorageType(exponent + ExponentBias);
299 
300     DE_ASSERT(sign == +1 || sign == -1);
301     DE_ASSERT(isShorthandZero || isDenormOrZero || mantissa >> MantissaBits == 1);
302     DE_ASSERT(exp >> ExponentBits == 0);
303 
304     return Float(StorageType(s | (exp << MantissaBits) | (mantissa & ((StorageType(1) << MantissaBits) - 1))));
305 }
306 
307 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
308 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
constructBits(int sign,int exponent,StorageType mantissaBits)309     StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::constructBits(int sign, int exponent,
310                                                                                  StorageType mantissaBits)
311 {
312     const StorageType signBit      = static_cast<StorageType>(sign < 0 ? 1 : 0);
313     const StorageType exponentBits = static_cast<StorageType>(exponent + ExponentBias);
314 
315     DE_ASSERT(sign == +1 || sign == -1);
316     DE_ASSERT(exponentBits >> ExponentBits == 0);
317     DE_ASSERT(mantissaBits >> MantissaBits == 0);
318 
319     return Float(
320         StorageType((signBit << (ExponentBits + MantissaBits)) | (exponentBits << MantissaBits) | (mantissaBits)));
321 }
322 
323 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
324 template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias,
325           uint32_t OtherFlags>
326 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits,
327                                                                           ExponentBias, Flags>::
convert(const Float<OtherStorageType,OtherExponentBits,OtherMantissaBits,OtherExponentBias,OtherFlags> & other,RoundingDirection rd)328     convert(const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags> &other,
329             RoundingDirection rd)
330 {
331     int sign = other.sign();
332 
333     if (!(Flags & FLOAT_HAS_SIGN) && sign < 0)
334     {
335         // Negative number, truncate to zero.
336         return zero(+1);
337     }
338 
339     if (other.isInf())
340     {
341         return inf(sign);
342     }
343 
344     if (other.isNaN())
345     {
346         return nan();
347     }
348 
349     if (other.isZero())
350     {
351         return zero(sign);
352     }
353 
354     const int eMin = 1 - ExponentBias;
355     const int eMax = ((1 << ExponentBits) - 2) - ExponentBias;
356 
357     const StorageType s = StorageType((StorageType(other.signBit()))
358                                       << (StorageType(ExponentBits + MantissaBits))); // \note Not sign, but sign bit.
359     int e               = other.exponent();
360     uint64_t m          = other.mantissa();
361 
362     // Normalize denormalized values prior to conversion.
363     while (!(m & (1ull << OtherMantissaBits)))
364     {
365         m <<= 1;
366         e -= 1;
367     }
368 
369     if (e < eMin)
370     {
371         // Underflow.
372         if ((Flags & FLOAT_SUPPORT_DENORM) && (eMin - e - 1 <= MantissaBits))
373         {
374             // Shift and round.
375             int bitDiff           = (OtherMantissaBits - MantissaBits) + (eMin - e);
376             uint64_t lastBitsMask = (1ull << bitDiff) - 1ull;
377             uint64_t lastBits     = (static_cast<uint64_t>(m) & lastBitsMask);
378             uint64_t half         = (1ull << (bitDiff - 1)) - 1;
379             uint64_t bias         = (m >> bitDiff) & 1;
380 
381             switch (rd)
382             {
383             case ROUND_TO_EVEN:
384                 return Float(StorageType(s | (m + half + bias) >> bitDiff));
385 
386             case ROUND_DOWNWARD:
387                 m = (m >> bitDiff);
388                 if (lastBits != 0ull && sign < 0)
389                 {
390                     m += 1;
391                 }
392                 return Float(StorageType(s | m));
393 
394             case ROUND_UPWARD:
395                 m = (m >> bitDiff);
396                 if (lastBits != 0ull && sign > 0)
397                 {
398                     m += 1;
399                 }
400                 return Float(StorageType(s | m));
401 
402             case ROUND_TO_ZERO:
403                 return Float(StorageType(s | (m >> bitDiff)));
404 
405             default:
406                 DE_ASSERT(false);
407                 break;
408             }
409         }
410 
411         return zero(sign);
412     }
413 
414     // Remove leading 1.
415     m = m & ~(1ull << OtherMantissaBits);
416 
417     if (MantissaBits < OtherMantissaBits)
418     {
419         // Round mantissa.
420         int bitDiff           = OtherMantissaBits - MantissaBits;
421         uint64_t lastBitsMask = (1ull << bitDiff) - 1ull;
422         uint64_t lastBits     = (static_cast<uint64_t>(m) & lastBitsMask);
423         uint64_t half         = (1ull << (bitDiff - 1)) - 1;
424         uint64_t bias         = (m >> bitDiff) & 1;
425 
426         switch (rd)
427         {
428         case ROUND_TO_EVEN:
429             m = (m + half + bias) >> bitDiff;
430             break;
431 
432         case ROUND_DOWNWARD:
433             m = (m >> bitDiff);
434             if (lastBits != 0ull && sign < 0)
435             {
436                 m += 1;
437             }
438             break;
439 
440         case ROUND_UPWARD:
441             m = (m >> bitDiff);
442             if (lastBits != 0ull && sign > 0)
443             {
444                 m += 1;
445             }
446             break;
447 
448         case ROUND_TO_ZERO:
449             m = (m >> bitDiff);
450             break;
451 
452         default:
453             DE_ASSERT(false);
454             break;
455         }
456 
457         if (m & (1ull << MantissaBits))
458         {
459             // Overflow in mantissa.
460             m = 0;
461             e += 1;
462         }
463     }
464     else
465     {
466         int bitDiff = MantissaBits - OtherMantissaBits;
467         m           = m << bitDiff;
468     }
469 
470     if (e > eMax)
471     {
472         // Overflow.
473         return (((sign < 0 && rd == ROUND_UPWARD) || (sign > 0 && rd == ROUND_DOWNWARD)) ? largestNormal(sign) :
474                                                                                            inf(sign));
475     }
476 
477     DE_ASSERT(de::inRange(e, eMin, eMax));
478     DE_ASSERT(((e + ExponentBias) & ~((1ull << ExponentBits) - 1)) == 0);
479     DE_ASSERT((m & ~((1ull << MantissaBits) - 1)) == 0);
480 
481     return Float(StorageType(s | (StorageType(e + ExponentBias) << MantissaBits) | m));
482 }
483 
484 typedef typename Float16::StorageType float16_t;
485 template <class F>
486 inline constexpr F floatQuietNaN = std::numeric_limits<F>::quiet_NaN();
487 template <>
488 inline constexpr float16_t floatQuietNaN<float16_t> = 0x7e01;
489 template <class F>
490 inline constexpr F floatSignalingNaN = std::numeric_limits<F>::signaling_NaN();
491 template <>
492 inline constexpr float16_t floatSignalingNaN<float16_t> = 0x7c01;
493 
494 } // namespace tcu
495 
496 #endif // _TCUFLOAT_HPP
497