/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *   Alyssa Rosenzweig <[email protected]>
 */

#include "util/u_math.h"
#include "util/u_memory.h"
#include "compiler.h"

/* This pass promotes reads from UBOs to register-mapped uniforms. Promoting a
 * read saves the load instruction and the work register that would hold its
 * result, but register-mapped uniforms occupy part of the register file and
 * so reduce the work registers available, so there is a balance to strike.
 *
 * We use a heuristic, implemented by mir_work_heuristic, to choose the number
 * of work registers to reserve; the remainder of the register file is left
 * for promoted uniforms.
 */

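/* Matches any load/store instruction that reads from a UBO */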
static bool
mir_is_ubo(midgard_instruction *ins)
{
   return (ins->type == TAG_LOAD_STORE_4) && (OP_IS_UBO_READ(ins->op));
}

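/* Matches UBO reads whose byte offset is a 16-byte-aligned compile-time
 * constant and which take no register (indirect) sources; only these fully
 * direct reads are candidates for promotion */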
static bool
mir_is_direct_aligned_ubo(midgard_instruction *ins)
{
   return mir_is_ubo(ins) && !(ins->constants.u32[0] & 0xF) &&
          (ins->src[1] == ~0) && (ins->src[2] == ~0);
}

/* Represents use data for a single UBO */

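/* The bitsets below are sized to cover 64 KiB of UBO contents at 16-byte
 * (vec4) granularity */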
#define MAX_UBO_QWORDS (65536 / 16)

struct mir_ubo_block {
   BITSET_DECLARE(uses, MAX_UBO_QWORDS);
   BITSET_DECLARE(pushed, MAX_UBO_QWORDS);
};

struct mir_ubo_analysis {
   /* Per block analysis */
   unsigned nr_blocks;
   struct mir_ubo_block *blocks;
};

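/* Gather, for each UBO (plus one extra block for the internal sysval UBO
 * appended after the user UBOs), a bitset of which vec4 slots are read by
 * promotable (direct, aligned) loads */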
static struct mir_ubo_analysis
mir_analyze_ranges(compiler_context *ctx)
{
   struct mir_ubo_analysis res = {
      .nr_blocks = ctx->nir->info.num_ubos + 1,
   };

   res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block));

   mir_foreach_instr_global(ctx, ins) {
      if (!mir_is_direct_aligned_ubo(ins))
         continue;

      unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
      unsigned offset = ins->constants.u32[0] / 16;

      assert(ubo < res.nr_blocks);

      if (offset < MAX_UBO_QWORDS)
         BITSET_SET(res.blocks[ubo].uses, offset);
   }

   return res;
}

/* Select UBO words to push. A sophisticated implementation would consider the
 * number of uses and perhaps the control flow to estimate benefit. This is not
 * sophisticated. Select from the last UBO first to prioritize sysvals. */

static void
mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis,
             unsigned max_qwords)
{
   unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4);

   for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
      struct mir_ubo_block *block = &analysis->blocks[ubo];

      unsigned vec4;
      BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) {
         /* Don't push more than possible */
         if (push->count > max_words - 4)
            return;

         for (unsigned offs = 0; offs < 4; ++offs) {
            struct panfrost_ubo_word word = {
               .ubo = ubo,
               .offset = (vec4 * 16) + (offs * 4),
            };

            push->words[push->count++] = word;
         }

         /* Mark it as pushed so we can rewrite */
         BITSET_SET(block->pushed, vec4);
      }
   }
}

#if 0
static void
mir_dump_ubo_analysis(struct mir_ubo_analysis *res)
{
   printf("%u blocks\n", res->nr_blocks);

   for (unsigned i = 0; i < res->nr_blocks; ++i) {
      BITSET_WORD *uses = res->blocks[i].uses;
      BITSET_WORD *push = res->blocks[i].pushed;

      unsigned last = BITSET_LAST_BIT_SIZED(uses, BITSET_WORDS(MAX_UBO_QWORDS));

      printf("\t");

      for (unsigned j = 0; j < last; ++j) {
         bool used = BITSET_TEST(uses, j);
         bool pushed = BITSET_TEST(push, j);
         assert(used || !pushed);

         putchar(pushed ? '*' : used ? '-' : '_');
      }

      printf("\n");
   }
}
#endif

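/* Count the total number of vec4 slots that are candidates for promotion
 * across all UBOs */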
static unsigned
mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis)
{
   unsigned count = 0;

   for (unsigned i = 0; i < analysis->nr_blocks; ++i) {
      BITSET_WORD *uses = analysis->blocks[i].uses;

      for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w)
         count += util_bitcount(uses[w]);
   }

   return count;
}

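/* Count the number of live bytes given per-temporary byte liveness masks */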
static unsigned
mir_count_live(uint16_t *live, unsigned temp_count)
{
   unsigned count = 0;

   for (unsigned i = 0; i < temp_count; ++i)
      count += util_bitcount(live[i]);

   return count;
}

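/* Estimate register pressure, in vec4 registers, as the maximum number of
 * bytes live at any point in the shader divided by the 16 bytes per register */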
static unsigned
mir_estimate_pressure(compiler_context *ctx)
{
   mir_invalidate_liveness(ctx);
   mir_compute_liveness(ctx);

   unsigned max_live = 0;

   mir_foreach_block(ctx, _block) {
      midgard_block *block = (midgard_block *)_block;
      uint16_t *live =
         mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t));

      mir_foreach_instr_in_block_rev(block, ins) {
         unsigned count = mir_count_live(live, ctx->temp_count);
         max_live = MAX2(max_live, count);
         mir_liveness_ins_update(live, ins, ctx->temp_count);
      }

      free(live);
   }

   return DIV_ROUND_UP(max_live, 16);
}

static unsigned
mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis)
{
   unsigned uniform_count = mir_promoteable_uniform_count(analysis);

   /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
    * allow as many work registers as needed */

   if (uniform_count <= 8)
      return 16;

   /* Otherwise, estimate the register pressure */

   unsigned pressure = mir_estimate_pressure(ctx);

   /* Prioritize not spilling above all else. The relation between the
    * pressure estimate and the actual register pressure is a little
    * murkier than we might like (due to scheduling, pipeline registers,
    * failure to pack vector registers, load/store registers, texture
    * registers...), hence why this is a heuristic parameter */

   if (pressure > 6)
      return 16;

   /* If there's no chance of spilling, prioritize UBOs and thread count */

   return 8;
}

/* Bitset of indices that will be used as a special register -- inputs to a
 * non-ALU op. We precompute this set so that testing is efficient, otherwise
 * we end up with O(mn) behaviour for n instructions and m uniform reads */

static BITSET_WORD *
mir_special_indices(compiler_context *ctx)
{
   mir_compute_temp_count(ctx);
   BITSET_WORD *bset =
      calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD));

   mir_foreach_instr_global(ctx, ins) {
      /* Look for special instructions */
      bool is_ldst = ins->type == TAG_LOAD_STORE_4;
      bool is_tex = ins->type == TAG_TEXTURE_4;
      bool is_writeout = ins->compact_branch && ins->writeout;

      if (!(is_ldst || is_tex || is_writeout))
         continue;

      /* Anything read by a special instruction is itself special */
      mir_foreach_src(ins, i) {
         unsigned idx = ins->src[i];

         if (idx < ctx->temp_count)
            BITSET_SET(bset, idx);
      }
   }

   return bset;
}

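/* Entry point: decide how to split the register file between work registers
 * and pushed uniforms, pick which UBO words to push, and rewrite promotable
 * UBO reads to use the corresponding register-mapped uniform, inserting a
 * move when the destination is a fixed register or feeds a special (non-ALU)
 * instruction */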
void
midgard_promote_uniforms(compiler_context *ctx)
{
   if (ctx->inputs->no_ubo_to_push) {
      /* If nothing is pushed, all UBOs need to be uploaded
       * conventionally */
      ctx->ubo_mask = ~0;
      return;
   }

   struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx);

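   /* As encoded below, work registers and register-mapped uniforms split a
    * 24-register window: each push slot occupies one 16-byte (vec4) register,
    * so reserving fewer work registers leaves more slots for promotion */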
   unsigned work_count = mir_work_heuristic(ctx, &analysis);
   unsigned promoted_count = 24 - work_count;

   /* Ensure we are 16 byte aligned to avoid underallocations */
   mir_pick_ubo(&ctx->info->push, &analysis, promoted_count);
   ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4);

   /* First, figure out special indices a priori so we don't recompute a lot */
   BITSET_WORD *special = mir_special_indices(ctx);

   ctx->ubo_mask = 0;

   mir_foreach_instr_global_safe(ctx, ins) {
      if (!mir_is_ubo(ins))
         continue;

      unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
      unsigned qword = ins->constants.u32[0] / 16;

      if (!mir_is_direct_aligned_ubo(ins)) {
         if (ins->src[1] == ~0)
            ctx->ubo_mask |= BITSET_BIT(ubo);
         else
            ctx->ubo_mask = ~0;

         continue;
      }

      /* Check if we decided to push this */
      assert(ubo < analysis.nr_blocks);
      if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) {
         ctx->ubo_mask |= BITSET_BIT(ubo);
         continue;
      }

      /* Find where we pushed to, TODO: unaligned pushes to pack */
      unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16);
      assert((base & 0x3) == 0);

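      /* Pushed uniforms are register-mapped counting down from r23: push slot
       * 0 lives in r23, slot 1 in r22, and so on, one vec4 per register */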
      unsigned address = base / 4;
      unsigned uniform_reg = 23 - address;

      /* Should've been taken into account when pushing */
      assert(address < promoted_count);
      unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);

      /* We do need the move for safety with a non-SSA dest, or if
       * we're being fed into a special class */

      bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1;

      if (ins->dest < ctx->temp_count)
         needs_move |= BITSET_TEST(special, ins->dest);

      if (needs_move) {
         unsigned type_size = nir_alu_type_get_type_size(ins->dest_type);
         midgard_instruction mov = v_mov(promoted, ins->dest);
         mov.dest_type = nir_type_uint | type_size;
         mov.src_types[1] = mov.dest_type;

         uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size);
         mir_set_bytemask(&mov, rounded);
         mir_insert_instruction_before(ctx, ins, mov);
      } else {
         mir_rewrite_index_src(ctx, ins->dest, promoted);
      }

      mir_remove_instruction(ins);
   }

   free(special);
   free(analysis.blocks);
}