/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_eu.h"
#include "brw_nir.h"
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"

/**
 * \file brw_nir_analyze_ubo_ranges.c
 *
 * This pass decides which portions of UBOs to upload as push constants,
 * so shaders can access them as part of the thread payload, rather than
 * having to issue expensive memory reads to pull the data.
 *
 * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
 * buffers, in GRF-sized units. A GRF is 256 bits (32 bytes) on hardware
 * prior to Xe2, and 512 bits (64 bytes) starting with Xe2.
 *
 * To do this, we examine NIR load_ubo intrinsics, recording the number of
 * loads at each offset. We track offsets at a sizeof(GRF) granularity, so
 * even fields with a bit of padding between them tend to fall into
 * contiguous ranges. We build a list of these ranges, tracking their
 * "cost" (number of registers required) and "benefit" (number of pull
 * loads eliminated by pushing the range). We then sort the list to obtain
 * the four best ranges (most benefit for the least cost).
 */

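/* A candidate push range: the brw_ubo_range itself (UBO block index, start
 * offset, and length in GRF-sized units) plus the number of pull loads that
 * pushing the range would eliminate.
 */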
struct ubo_range_entry
{
   struct brw_ubo_range range;
   int benefit;
};

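/* Score a candidate range: the benefit (pull loads eliminated) is weighted
 * at twice the cost (registers required). For example, a 2-register range
 * that eliminates 5 pull loads scores 2 * 5 - 2 = 8.
 */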
static int
score(const struct ubo_range_entry *entry)
{
   return 2 * entry->benefit - entry->range.length;
}

/**
 * Compares score for two UBO range entries.
 *
 * For a descending qsort().
 */
static int
cmp_ubo_range_entry(const void *va, const void *vb)
{
   const struct ubo_range_entry *a = va;
   const struct ubo_range_entry *b = vb;

   /* Rank based on scores, descending order */
   int delta = score(b) - score(a);

   /* Then use the UBO block index as a tie-breaker, descending order */
   if (delta == 0)
      delta = b->range.block - a->range.block;

   /* Finally use the start offset as a second tie-breaker, ascending order */
   if (delta == 0)
      delta = a->range.start - b->range.start;

   return delta;
}

struct ubo_block_info
{
   /* Each bit in the offsets bitfield represents a GRF-sized section of
    * data. If it's set to one, there is interesting UBO data at that
    * offset. If not, there's a "hole" - padding between data - or just
    * nothing at all.
    */
   uint64_t offsets;
   uint8_t uses[64];
};

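/* Per-shader analysis state: a hash table mapping UBO block indices to
 * ubo_block_info, plus a flag for whether the shader also uses regular
 * (non-UBO) uniforms, which are assumed to take up one of the push buffers.
 */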
struct ubo_analysis_state
{
   struct hash_table *blocks;
   bool uses_regular_uniforms;
   const struct intel_device_info *devinfo;
};

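/* Look up the ubo_block_info for a UBO block index, allocating it on first
 * use. The table is keyed on block + 1 so that block 0 does not become a
 * NULL pointer key.
 */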
static struct ubo_block_info *
get_block_info(struct ubo_analysis_state *state, int block)
{
   uint32_t hash = block + 1;
   void *key = (void *) (uintptr_t) hash;

   struct hash_entry *entry =
      _mesa_hash_table_search_pre_hashed(state->blocks, hash, key);

   if (entry)
      return (struct ubo_block_info *) entry->data;

   struct ubo_block_info *info =
      rzalloc(state->blocks, struct ubo_block_info);
   _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);

   return info;
}

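/* Walk a single NIR block, recording which GRF-sized chunks of each UBO are
 * loaded (and how often), and flagging intrinsics that imply regular
 * (non-UBO) uniforms are in use.
 */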
static void
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_image_deref_load:
      case nir_intrinsic_image_deref_store:
      case nir_intrinsic_image_deref_atomic:
      case nir_intrinsic_image_deref_atomic_swap:
      case nir_intrinsic_image_deref_size:
         state->uses_regular_uniforms = true;
         continue;

      case nir_intrinsic_load_ubo:
         break; /* Fall through to the analysis below */

      default:
         continue; /* Not a uniform or UBO intrinsic */
      }

      if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
          nir_src_is_const(intrin->src[1])) {
         const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
         const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
         const unsigned sizeof_GRF = REG_SIZE * reg_unit(state->devinfo);
         const int offset = byte_offset / sizeof_GRF;

         /* Avoid shifting by larger than the width of our bitfield, as this
          * is undefined in C. Even if we require multiple bits to represent
          * the entire value, it's OK to record a partial value - the backend
          * is capable of falling back to pull loads for later components of
          * vectors, as it has to shrink ranges for other reasons anyway.
          */
         if (offset >= 64)
            continue;

         /* The value might span multiple sizeof(GRF) chunks. */
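         /* For example, on pre-Xe2 hardware (32-byte GRFs), a vec4 of
          * 32-bit components at byte offset 24 covers bytes 24..39 and
          * therefore touches two GRF-sized chunks.
          */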
         const int bytes = nir_intrinsic_dest_components(intrin) *
                           (intrin->def.bit_size / 8);
         const int start = ROUND_DOWN_TO(byte_offset, sizeof_GRF);
         const int end = ALIGN(byte_offset + bytes, sizeof_GRF);
         const int chunks = (end - start) / sizeof_GRF;

         /* TODO: should we count uses in loops as higher benefit? */

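         /* Mark every chunk this load touches, and count a use at the chunk
          * where the load starts.
          */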
         struct ubo_block_info *info = get_block_info(state, block);
         info->offsets |= ((1ull << chunks) - 1) << offset;
         info->uses[offset]++;
      }
   }
}

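/* Debug helper: dump a single candidate range along with its block's offsets
 * bitfield, benefit, cost, and score.
 */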
static void
print_ubo_entry(FILE *file,
                const struct ubo_range_entry *entry,
                struct ubo_analysis_state *state)
{
   struct ubo_block_info *info = get_block_info(state, entry->range.block);

   fprintf(file,
           "block %2d, start %2d, length %2d, bits = %"PRIx64", "
           "benefit %2d, cost %2d, score = %2d\n",
           entry->range.block, entry->range.start, entry->range.length,
           info->offsets, entry->benefit, entry->range.length, score(entry));
}

void
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
                           nir_shader *nir,
                           struct brw_ubo_range out_ranges[4])
{
   void *mem_ctx = ralloc_context(NULL);

   struct ubo_analysis_state state = {
      .uses_regular_uniforms = false,
      .blocks =
         _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
      .devinfo = compiler->devinfo,
   };

   /* Compute shaders use push constants to get the subgroup ID, so it's
    * best to just assume some system values are pushed.
    */
   if (nir->info.stage == MESA_SHADER_COMPUTE)
      state.uses_regular_uniforms = true;

   /* Walk the IR, recording how many times each UBO block/offset is used. */
   nir_foreach_function_impl(impl, nir) {
      nir_foreach_block(block, impl) {
         analyze_ubos_block(&state, block);
      }
   }

   /* Find ranges: a block, starting register-size aligned byte offset, and
    * length.
    */
   struct util_dynarray ranges;
   util_dynarray_init(&ranges, mem_ctx);

   hash_table_foreach(state.blocks, entry) {
      const int b = entry->hash - 1;
      const struct ubo_block_info *info = entry->data;
      uint64_t offsets = info->offsets;

      /* Walk through the offsets bitfield, finding contiguous regions of
       * set bits:
       *
       * 0000000001111111111111000000000000111111111111110000000011111100
       *          ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
       *
       * Each of these will become a UBO range.
       */
      while (offsets != 0) {
         /* Find the first 1 in the offsets bitfield. This represents the
          * start of a range of interesting UBO data. Make it zero-indexed.
          */
         int first_bit = ffsll(offsets) - 1;

         /* Find the first 0 bit in offsets beyond first_bit. To find the
          * first zero bit, we find the first 1 bit in the complement. In
          * order to ignore bits before first_bit, we mask off those bits.
          */
         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;

         if (first_hole == -1) {
            /* If we didn't find a hole, then set it to the end of the
             * bitfield. There are no more ranges to process.
             */
            first_hole = 64;
            offsets = 0;
         } else {
            /* We've processed all bits before first_hole. Mask them off. */
            offsets &= ~((1ull << first_hole) - 1);
         }

         struct ubo_range_entry *entry =
            util_dynarray_grow(&ranges, struct ubo_range_entry, 1);

         entry->range.block = b;
         entry->range.start = first_bit;
         /* first_hole is one beyond the end, so we don't need to add 1 */
         entry->range.length = first_hole - first_bit;
         entry->benefit = 0;

         for (int i = 0; i < entry->range.length; i++)
            entry->benefit += info->uses[first_bit + i];
      }
   }

   int nr_entries = ranges.size / sizeof(struct ubo_range_entry);

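   /* Debug: dump the candidate ranges before sorting (disabled by default). */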
   if (0) {
      util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
         print_ubo_entry(stderr, entry, &state);
      }
   }

   /* TODO: Consider combining ranges.
    *
    * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are
    * more ranges, and two are close by with only a small hole, it may be
    * worth combining them. The holes will waste register space, but the
    * benefit of removing pulls may outweigh that cost.
    */

   /* Sort the list so the most beneficial ranges are at the front. */
   if (nr_entries > 0) {
      qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
            cmp_ubo_range_entry);
   }

   struct ubo_range_entry *entries = ranges.data;

   /* Return the top 4 or so. We drop by one if regular uniforms are in
    * use, assuming one push buffer will be dedicated to those. We may
    * also only get 3 on Haswell if we can't write INSTPM.
    *
    * The backend may need to shrink these ranges to ensure that they
    * don't exceed the maximum push constant limits. It can simply drop
    * the tail of the list, as that's the least valuable portion. We
    * unfortunately can't truncate it here, because we don't know what
    * the backend is planning to do with regular uniforms.
    */
   const int max_ubos = 4 - state.uses_regular_uniforms;
   nr_entries = MIN2(nr_entries, max_ubos);

   for (int i = 0; i < nr_entries; i++) {
      out_ranges[i] = entries[i].range;

      /* To this point, various values have been tracked in terms of the real
       * hardware register sizes. However, the rest of the compiler expects
       * values in terms of pre-Xe2 256-bit registers. Scale start and length
       * to account for this.
       */
      out_ranges[i].start *= reg_unit(compiler->devinfo);
      out_ranges[i].length *= reg_unit(compiler->devinfo);
   }
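   /* Mark any remaining slots as unused (zero-length) ranges. */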
   for (int i = nr_entries; i < 4; i++) {
      out_ranges[i].block = 0;
      out_ranges[i].start = 0;
      out_ranges[i].length = 0;
   }

   ralloc_free(ranges.mem_ctx);
}