1 /*
2 * Copyright © 2021 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23 #ifndef INTEL_PIXEL_HASH_H
24 #define INTEL_PIXEL_HASH_H
25
26 /**
27 * Compute an \p n x \p m pixel hashing table usable as slice, subslice or
28 * pixel pipe hashing table. The resulting table is the cyclic repetition of
29 * a fixed pattern with periodicity equal to \p period.
30 *
31 * If \p index is specified to be equal to \p period, a 2-way hashing table
32 * will be generated such that indices 0 and 1 are returned for the following
33 * fractions of entries respectively:
34 *
35 * p_0 = ceil(period / 2) / period
36 * p_1 = floor(period / 2) / period
37 *
38 * If \p index is even and less than \p period, a 3-way hashing table will be
39 * generated such that indices 0, 1 and 2 are returned for the following
40 * fractions of entries:
41 *
42 * p_0 = (ceil(period / 2) - 1) / period
43 * p_1 = floor(period / 2) / period
44 * p_2 = 1 / period
45 *
46 * The equations above apply if \p flip is equal to 0, if it is equal to 1 p_0
47 * and p_1 will be swapped for the result. Note that in the context of pixel
48 * pipe hashing this can be always 0 on Gfx12 platforms, since the hardware
49 * transparently remaps logical indices found on the table to physical pixel
50 * pipe indices from the highest to lowest EU count.
51 */
52 UNUSED static void
intel_compute_pixel_hash_table_3way(unsigned n,unsigned m,unsigned period,unsigned index,bool flip,uint32_t * p)53 intel_compute_pixel_hash_table_3way(unsigned n, unsigned m,
54 unsigned period, unsigned index, bool flip,
55 uint32_t *p)
56 {
57 for (unsigned i = 0; i < n; i++) {
58 for (unsigned j = 0; j < m; j++) {
59 const unsigned k = (i + j) % period;
60 p[j + m * i] = (k == index ? 2 : (k & 1) ^ flip);
61 }
62 }
63 }
64
65 /**
66 * Compute an \p n x \p m pixel hashing table usable as slice,
67 * subslice or pixel pipe hashing table. This generalizes the
68 * previous 3-way hash table function to an arbitrary number of ways
69 * given by the number of bits set in the expression "mask1 | mask2".
70 * If a way is only set in one of the two mask arguments it will
71 * appear on the table with half the frequency as a way set on both
72 * masks.
73 */
74 UNUSED static void
intel_compute_pixel_hash_table_nway(unsigned n,unsigned m,uint32_t mask1,uint32_t mask2,uint32_t * p)75 intel_compute_pixel_hash_table_nway(unsigned n, unsigned m,
76 uint32_t mask1, uint32_t mask2,
77 uint32_t *p)
78 {
79 /* If both masks are equal all ways are expected to show up with
80 * the same frequency on the final table, so we can zero out one of
81 * the masks in order to halve the number of IDs we need to handle.
82 */
83 if (mask1 == mask2)
84 mask2 = 0;
85
86 /* Construct a table mapping consecutive indices to the physical
87 * indices given by the bits set on the mask arguments. Ways
88 * enabled on both masks will appear twice on the mapping, so
89 * they'll show up with twice the frequency on the final table.
90 */
91 unsigned phys_ids[(sizeof(mask1) + sizeof(mask2)) * CHAR_BIT];
92 unsigned num_ids = 0;
93
94 for (unsigned i = 0; i < sizeof(mask1) * CHAR_BIT; i++) {
95 if (mask1 & (1u << i))
96 phys_ids[num_ids++] = i;
97 if (mask2 & (1u << i))
98 phys_ids[num_ids++] = i;
99 }
100
101 assert(num_ids > 0);
102
103 /* Compute a permutation of the above indices that assigns indices
104 * as far as possible to adjacent entries. This permutation is
105 * designed to be equivalent to the bit reversal of each index in
106 * cases where num_ids is a power of two, but doesn't actually
107 * require it to be a power of two in order to satisfy the required
108 * properties (which is necessary to handle configurations with
109 * arbitrary non-power of two fusing). By construction, flipping
110 * bit l of its input will lead to a change in its result of the
111 * order of num_ids/2^(l+1) (see variable t below). The
112 * bijectivity of this permutation can be verified easily by
113 * induction. This permutation is applied cyclically to the
114 * vertical indices of the hashing table constructed below.
115 */
116 const unsigned bits = util_logbase2_ceil(num_ids);
117 unsigned swzy[ARRAY_SIZE(phys_ids)];
118
119 for (unsigned k = 0; k < num_ids; k++) {
120 unsigned t = num_ids;
121 unsigned s = 0;
122
123 for (unsigned l = 0; l < bits; l++) {
124 if (k & (1u << l)) {
125 s += (t + 1) >> 1;
126 t >>= 1;
127 } else {
128 t = (t + 1) >> 1;
129 }
130 }
131
132 swzy[k] = s;
133 }
134
135 /* Compute a second permutation applied cyclically to the
136 * horizontal indices of the hashing table. In cases where a
137 * single mask is present (which means that all ways are expected
138 * to have the same frequency) this permutation will be the
139 * identity and will have no effect.
140 *
141 * In cases where some ways have twice the frequency of the others,
142 * use a similar iterative halving of the range of the permutation
143 * as in the the swzy[] permutation defined above, but instead of
144 * scanning the bits of its argument (the "k" variable above) in
145 * the opposite order (from LSB to MSB), proceed by halving the
146 * domain of the permutation in the same order as its range, which
147 * would lead to an identity permutation if it wasn't because the
148 * LSB of its range is adjusted as early as possible instead of at
149 * the last iteration.
150 *
151 * The reason for the special casing of the LSB is that we want to
152 * avoid assigning adjacent IDs to adjacent elements of the table,
153 * since ways that appear duplicated in the phys_ids mapping above
154 * would then appear duplicated in adjacent positions of the final
155 * table, which would lead to poor utilization for small primitives
156 * that only cover a small contiguous portion of the hashing table
157 * and would have twice as much work as necessary submitted to the
158 * same way instead of spreading its processing over a larger
159 * number of ways.
160 */
161 unsigned swzx[ARRAY_SIZE(phys_ids)];
162
163 if (mask1 && mask2) {
164 for (unsigned k = 0; k < num_ids; k++) {
165 unsigned l = k;
166 unsigned t = num_ids;
167 unsigned s = 0;
168 bool in_range = false;
169
170 while (t > 1) {
171 const bool first_in_range = t <= m && !in_range;
172 in_range |= first_in_range;
173
174 if (l >= (t + 1) >> 1) {
175 /* Apply the s++ increment (which could only occur in
176 * the last t == 2 iteration if we were constructing an
177 * identity permutation) as soon as the domain of the
178 * permutation has been decomposed into a chunk smaller
179 * than the width of the hashing table \p m (which
180 * causes in_range to be first set to true), since
181 * doing it earlier would prevent any alternation
182 * between even and odd indices in the first \p m
183 * elements of swzx[], which are the only ones actually
184 * used.
185 *
186 * Subsequent (in_range == true) increments of s need
187 * to be doubled since they are selecting between
188 * indices of the same parity.
189 */
190 if (!in_range)
191 s += (t + 1) >> 1;
192 else if (first_in_range)
193 s++;
194 else
195 s += (t + 1) >> 1 << 1;
196
197 l -= (t + 1) >> 1;
198 t >>= 1;
199 } else {
200 t = (t + 1) >> 1;
201 }
202 }
203
204 swzx[k] = s;
205 }
206 } else {
207 for (unsigned k = 0; k < num_ids; k++)
208 swzx[k] = k;
209 }
210
211 /* Initialize the table with the cyclic repetition of a
212 * num_ids-periodic pattern.
213 *
214 * Note that the horizontal and vertical permutations (swzx and
215 * swzy respectively) are different, and the former is either an
216 * identity permutation or close to the identity. This asymmetry
217 * is intentional in order to minimize the size of the contiguous
218 * area that needs to be rendered in parallel in order to utilize
219 * the whole GPU: In cases where swzx is the identity a rendering
220 * rectangle of width W will need to be at least H blocks high,
221 * where H is bounded by 2^ceil(log2(num_ids/W)) thanks to the
222 * above definition of the swzy permutation.
223 */
224 for (unsigned i = 0; i < n; i++) {
225 const unsigned k = i % num_ids;
226 for (unsigned j = 0; j < m; j++) {
227 const unsigned l = j % num_ids;
228 p[j + m * i] = phys_ids[(swzx[l] + swzy[k]) % num_ids];
229 }
230 }
231 }
232
233 #endif
234