xref: /aosp_15_r20/external/mesa3d/src/util/bitscan.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /**************************************************************************
2  *
3  * Copyright 2008 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 
29 #ifndef BITSCAN_H
30 #define BITSCAN_H
31 
32 #include <assert.h>
33 #include <stdint.h>
34 #include <stdbool.h>
35 #include <string.h>
36 
37 #if defined(_MSC_VER)
38 #include <intrin.h>
39 #endif
40 
41 #if defined(__POPCNT__)
42 #include <popcntintrin.h>
43 #endif
44 
45 #include "macros.h"
46 
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
50 
51 
/**
 * ffs(): position of the least significant set bit, counting from 1.
 *
 * Returns 0 when the argument has no bits set.  The implementation is
 * picked in order of preference: the compiler builtin, a wrapper around the
 * MSVC _BitScanForward intrinsic, or an external definition that has to be
 * provided elsewhere.
 */
#if defined(HAVE___BUILTIN_FFS)
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
static inline int
ffs(int i)
{
   unsigned long index;

   /* The intrinsic returns zero when no bit is set; index is then unused. */
   if (!_BitScanForward(&index, i))
      return 0;

   /* The intrinsic counts from 0, ffs() counts from 1. */
   return index + 1;
}
#else
extern int
ffs(int i);
#endif
72 
/**
 * ffsll(): 64-bit counterpart of ffs().  Position of the least significant
 * set bit, counting from 1; 0 when no bit is set.
 *
 * Uses the compiler builtin when available, otherwise the MSVC
 * _BitScanForward64 intrinsic on 64-bit targets, otherwise an external
 * definition that has to be provided elsewhere.
 */
#if defined(HAVE___BUILTIN_FFSLL)
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
   unsigned long index;

   /* The intrinsic returns zero when no bit is set; index is then unused. */
   if (!_BitScanForward64(&index, i))
      return 0;

   /* The intrinsic counts from 0, ffsll() counts from 1. */
   return index + 1;
}
#else
extern int
ffsll(long long int val);
#endif
89 
90 
/* Pop the lowest set bit out of a 32-bit mask.
 *
 * Returns the zero-based index of the least significant set bit of *mask
 * and clears that bit in *mask.  Intended for destructive iteration:
 *
 * while (mymask) {
 *   int i = u_bit_scan(&mymask);
 *   ... process element i
 * }
 *
 * The mask is expected to be non-zero, as in the loop above; with a zero
 * mask ffs() returns 0 and the shift count below would be -1.
 */
static inline int
u_bit_scan(unsigned *mask)
{
   const unsigned bits = *mask;
   const int lowest = ffs(bits) - 1;

   /* The bit is known to be set, so XOR clears it. */
   *mask = bits ^ (1u << lowest);
   return lowest;
}
106 
/* Iterate over the set bits of a 32-bit value, lowest bit first:
 *
 * u_foreach_bit(i, mask) {
 *    ... process bit index i
 * }
 *
 * The macro declares `b` as a uint32_t loop variable holding the zero-based
 * index of the current bit, and evaluates `dword` exactly once into a
 * private copy, so the caller's value is not modified.
 *
 * The clearing mask is built from an unsigned 1: `1 << 31` would overflow a
 * signed int, which is undefined behavior.
 */
#define u_foreach_bit(b, dword)                          \
   for (uint32_t __dword = (dword), b;                     \
        ((b) = ffs(__dword) - 1, __dword);      \
        __dword &= ~(1u << (b)))
111 
/* 64-bit variant of u_bit_scan(): returns the zero-based index of the least
 * significant set bit of *mask and clears that bit in *mask.
 *
 * The mask is expected to be non-zero; with a zero mask ffsll() returns 0
 * and the shift count below would be -1.
 */
static inline int
u_bit_scan64(uint64_t *mask)
{
   const uint64_t bits = *mask;
   const int lowest = ffsll(bits) - 1;

   /* The bit is known to be set, so XOR clears it. */
   *mask = bits ^ (((uint64_t)1) << lowest);
   return lowest;
}
119 
/* Iterate over the set bits of a 64-bit value, lowest bit first:
 *
 * u_foreach_bit64(i, mask) {
 *    ... process bit index i
 * }
 *
 * The macro declares `b` as a uint64_t loop variable holding the zero-based
 * index of the current bit, and evaluates `dword` exactly once into a
 * private copy, so the caller's value is not modified.
 */
#define u_foreach_bit64(b, dword)                 \
   for (uint64_t __dword = (dword), b;            \
        ((b) = ffsll(__dword) - 1, __dword);      \
        __dword &= ~((uint64_t)1 << (b)))
124 
/* Test whether a uint32_t value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 *
 * The check itself is delegated to the IS_POT() macro, which is not defined
 * in this header.
 */
static inline bool
util_is_power_of_two_or_zero(uint32_t v)
{
   const bool pot_or_zero = IS_POT(v);

   return pot_or_zero;
}
135 
/* Test whether a uint64_t value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 *
 * The check itself is delegated to the IS_POT() macro, which is not defined
 * in this header.
 */
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
   const bool pot_or_zero = IS_POT(v);

   return pot_or_zero;
}
146 
/* Test whether a uint32_t value is a power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 */
static inline bool
util_is_power_of_two_nonzero(uint32_t v)
{
   /* Two similar-looking feature macros must not be confused here:
    * HAVE___BUILTIN_POPCOUNT only says that the __builtin_popcount function
    * exists, whereas __POPCNT__ says that _mm_popcnt_u32 exists and maps to
    * a native instruction.  Only the latter is worth using for this test.
    *
    * Keying on SSE 4.2 compile-time flags instead was rejected for two
    * reasons: there is currently no build infrastructure for SSE 4.2 (only
    * 4.1), and some AMD CPUs support POPCNT but not SSE 4.2 (e.g.,
    * Barcelona).
    */
#if defined(__POPCNT__)
   /* Exactly one set bit <=> non-zero power of two. */
   const int set_bits = _mm_popcnt_u32(v);

   return set_bits == 1;
#else
   return IS_POT_NONZERO(v);
#endif
}
170 
/* Test whether a uint64_t value is a power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 *
 * The check itself is delegated to the IS_POT_NONZERO() macro, which is not
 * defined in this header.
 */
static inline bool
util_is_power_of_two_nonzero64(uint64_t v)
{
   const bool is_pot = IS_POT_NONZERO(v);

   return is_pot;
}
181 
/* Test whether a pointer-sized integer (size_t/uintptr_t/intptr_t) is a
 * power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 *
 * The check itself is delegated to the IS_POT_NONZERO() macro, which is not
 * defined in this header.
 */
static inline bool
util_is_power_of_two_nonzero_uintptr(uintptr_t v)
{
   const bool is_pot = IS_POT_NONZERO(v);

   return is_pot;
}
192 
/* Pop the lowest run of consecutive set bits out of a 32-bit mask.
 *
 * On return *start holds the zero-based index of the first bit of the run,
 * *count holds the run length, and the run has been cleared from *mask.
 * Typical use:
 *
 * while (mask) {
 *    int start, count, i;
 *
 *    u_bit_scan_consecutive_range(&mask, &start, &count);
 *
 *    for (i = 0; i < count; i++)
 *       ... process element (start+i)
 * }
 */
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
   const unsigned bits = *mask;

   /* A completely full mask has no zero bit above the run for ffs() to
    * find, so it is handled separately.
    */
   if (bits == 0xffffffff) {
      *start = 0;
      *count = 32;
      *mask = 0;
      return;
   }

   /* First set bit, then the first clear bit above it marks the run end. */
   const int first = ffs(bits) - 1;
   const int length = ffs(~(bits >> first)) - 1;

   *start = first;
   *count = length;
   *mask = bits & ~(((1u << length) - 1) << first);
}
218 
/* 64-bit variant of u_bit_scan_consecutive_range(): pops the lowest run of
 * consecutive set bits out of *mask, storing the zero-based index of its
 * first bit in *start and its length in *count.
 */
static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
   const uint64_t bits = *mask;

   /* A completely full mask has no zero bit above the run for ffsll() to
    * find, so it is handled separately.
    */
   if (bits == ~0ull) {
      *start = 0;
      *count = 64;
      *mask = 0;
      return;
   }

   /* First set bit, then the first clear bit above it marks the run end. */
   const int first = ffsll(bits) - 1;
   const int length = ffsll(~(bits >> first)) - 1;

   *start = first;
   *count = length;
   *mask = bits & ~(((((uint64_t)1) << length) - 1) << first);
}
232 
233 
/**
 * Position of the most significant set bit of a word, counting the least
 * significant bit as 1.
 * Returns 0 when no bits are set.
 * This is ffs() scanning from the other end.
 */
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
   /* Zero is filtered out before the builtin is called. */
   if (u == 0)
      return 0;
   return 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
   unsigned long index;

   /* The intrinsic returns zero when no bit is set. */
   return _BitScanReverse(&index, u) ? index + 1 : 0;
#else
   /* Portable fallback: count how many shifts it takes to empty the word. */
   unsigned width;

   for (width = 0; u != 0; u >>= 1)
      width++;
   return width;
#endif
}
259 
/**
 * Position of the most significant set bit of a 64-bit word, counting the
 * least significant bit as 1.
 * Returns 0 when no bits are set.
 * This is ffsll() scanning from the other end.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   /* Zero is filtered out before the builtin is called. */
   if (u == 0)
      return 0;
   return 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
   unsigned long index;

   /* The intrinsic returns zero when no bit is set. */
   return _BitScanReverse64(&index, u) ? index + 1 : 0;
#else
   /* Portable fallback: count how many shifts it takes to empty the word. */
   unsigned width;

   for (width = 0; u != 0; u >>= 1)
      width++;
   return width;
#endif
}
285 
/**
 * Position of the most significant bit that differs from the sign bit,
 * counting the least significant bit as 1.
 * Returns 0 when there is no such bit, i.e. for 0 and for -1.
 */
static inline unsigned
util_last_bit_signed(int i)
{
   /* For negative values the bits are inverted so that the bits differing
    * from the sign become the set bits util_last_bit() looks for.
    */
   const unsigned bits = (i >= 0) ? (unsigned)i : ~(unsigned)i;

   return util_last_bit(bits);
}
299 
/* Returns a bitfield in which `count` consecutive bits are set, the lowest
 * of them being bit `start`.
 *
 * The range must fit in 32 bits (start + count <= 32); this is asserted.
 * An empty range (count == 0) yields 0 for every valid start, including
 * start == 32.
 */
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
   /* Written so that the check itself cannot wrap around. */
   assert(start <= 32 && count <= 32 - start);

   /* 1u << 32 is undefined, so the full-width mask is produced directly. */
   if (count == 32)
      return ~0u;

   /* start may be 32 here, and shifting by 32 is undefined as well. */
   if (count == 0)
      return 0;

   return ((1u << count) - 1) << start;
}
311 
/* 64-bit variant of u_bit_consecutive(): returns a bitfield in which `count`
 * consecutive bits are set, the lowest of them being bit `start`.
 *
 * The range must fit in 64 bits (start + count <= 64); this is asserted.
 * An empty range (count == 0) yields 0 for every valid start, including
 * start == 64.
 */
static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
   /* Written so that the check itself cannot wrap around. */
   assert(start <= 64 && count <= 64 - start);

   /* A 64-bit shift by 64 is undefined, so the full mask is produced
    * directly.
    */
   if (count == 64)
      return ~(uint64_t)0;

   /* start may be 64 here, and shifting by 64 is undefined as well. */
   if (count == 0)
      return 0;

   return (((uint64_t)1 << count) - 1) << start;
}
320 
/**
 * Count the bits that are set in n.
 */
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
   return __builtin_popcount(n);
#else
   /* Classic K&R bit count: `n & (n - 1)` drops the lowest set bit, so the
    * loop runs once per set bit rather than once per bit position below the
    * highest set bit.
    */
   unsigned set = 0;

   while (n != 0) {
      n &= n - 1;
      set++;
   }
   return set;
#endif
}
343 
/**
 * Count the bits set in n with the native popcnt instruction, emitted
 * through inline assembly.
 * The caller must make sure that the CPU actually supports popcnt.
 *
 * Inline assembly is used because gcc doesn't emit the instruction unless
 * -mpopcnt or a -march= that implies popcnt is given.
 */
static inline unsigned
util_popcnt_inline_asm(unsigned n)
{
#if !defined(USE_X86_64_ASM) && !defined(USE_X86_ASM)
   /* Nobody should end up here by accident, but it is bound to happen, so
    * fall back to the generic implementation.
    */
   return util_bitcount(n);
#else
   uint32_t count;

   __asm volatile("popcnt %1, %0" : "=r"(count) : "r"(n));
   return count;
#endif
}
363 
/**
 * Count the bits that are set in a 64-bit value.
 */
static inline unsigned
util_bitcount64(uint64_t n)
{
#if defined(HAVE___BUILTIN_POPCOUNTLL)
   return __builtin_popcountll(n);
#else
   /* Count the two halves separately.  The conversion to unsigned keeps
    * the low bits of each half; this relies on unsigned being 32 bits wide.
    */
   const unsigned low = util_bitcount((unsigned)n);
   const unsigned high = util_bitcount((unsigned)(n >> 32));

   return low + high;
#endif
}
373 
/**
 * Widen a bit mask by a multiplier: every set bit of the input becomes a
 * group of `multiplier` adjacent set bits in the result, and bit i of the
 * input maps to the group starting at bit i * multiplier.
 *
 * For example:
 * 0b101 widened by 2 will become: 0b110011
 *
 * This is typically used in shader I/O to transform a 64-bit
 * writemask to a 32-bit writemask.
 */
static inline uint32_t
util_widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t widened = 0;

   /* Walk the set bits from lowest to highest; `remaining & (remaining - 1)`
    * drops the bit that was just handled.
    */
   for (uint32_t remaining = mask; remaining; remaining &= remaining - 1) {
      const uint32_t bit = ffs(remaining) - 1;

      widened |= ((1u << multiplier) - 1u) << (bit * multiplier);
   }

   return widened;
}
392 
393 #ifdef __cplusplus
394 }
395 
/* util_bitcount has large measurable overhead (~2%), so it's recommended to
 * use the POPCNT instruction via inline assembly if the CPU supports it.
 *
 * This enum is the template argument of util_bitcount_fast() and selects
 * which bit-count implementation that function calls.
 */
enum util_popcnt {
   /* util_bitcount_fast() counts with util_bitcount(). */
   POPCNT_NO,
   /* util_bitcount_fast() counts with util_popcnt_inline_asm(). */
   POPCNT_YES,
   /* NOTE(review): not referenced in this header; presumably a sentinel for
    * "not determined" -- confirm against the users of this enum.
    */
   POPCNT_INVALID,
};
404 
405 /* Convenient function to select popcnt through a C++ template argument.
406  * This should be used as part of larger functions that are optimized
407  * as a whole.
408  */
409 template<util_popcnt POPCNT> inline unsigned
util_bitcount_fast(unsigned n)410 util_bitcount_fast(unsigned n)
411 {
412    if (POPCNT == POPCNT_YES)
413       return util_popcnt_inline_asm(n);
414    else
415       return util_bitcount(n);
416 }
417 
418 #endif /* __cplusplus */
419 
420 #endif /* BITSCAN_H */
421