/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"

UNUSED static void
print_copy(const struct agx_copy *cp)
{
   printf("%sr%u = ", cp->dest_mem ? "m" : "", cp->dest);
   agx_print_index(cp->src, false, stdout);
   printf("\n");
}

UNUSED static void
print_copies(const struct agx_copy *copies, unsigned nr)
{
   printf("[\n");

   for (unsigned i = 0; i < nr; ++i) {
      printf(" ");
      print_copy(&copies[i]);
   }

   printf("]\n");
}

/*
 * Emits code for
 *
 *    for (int i = 0; i < n; ++i)
 *       registers[dests[i]] = registers[srcs[i]];
 *
 * ...with all copies happening in parallel.
 *
 * That is, emit machine instructions equivalent to a parallel copy. This is
 * used to lower not only parallel copies but also collects and splits, which
 * also have parallel copy semantics.
 *
 * We only handle register-register copies, not general agx_index sources. This
 * suffices for its internal use in register allocation.
 */
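
/*
 * As an illustration (not exhaustive): the parallel copy { A := B, B := A }
 * must preserve both old values, so it cannot be lowered to two sequential
 * moves. The cycle-breaking logic below resolves it with a swap instead.
 */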

static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   agx_index dst = copy->dest_mem
                      ? agx_memory_register(copy->dest, copy->src.size)
                      : agx_register(copy->dest, copy->src.size);

   if (copy->dest_mem && copy->src.memory) {
      /* Memory-memory copies need to be lowered to memory-register and
       * register-memory, using a reserved scratch register.
       */
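      /* The scratch register index is assumed to be kept free by the register
       * allocator whenever memory registers are in use.
       */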
      agx_index scratch_reg = agx_register(2, copy->src.size);
      agx_mov_to(b, scratch_reg, copy->src);
      agx_mov_to(b, dst, scratch_reg);
   } else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
      agx_mov_imm_to(b, dst, copy->src.value);
   } else {
      agx_mov_to(b, dst, copy->src);
   }
}

static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   assert(copy->src.type == AGX_INDEX_REGISTER && "only GPRs are swapped");

   if (copy->dest == copy->src.value)
      return;

   /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */
   if (copy->src.size == AGX_SIZE_16 &&
       (copy->dest >> 1) == (copy->src.value >> 1) && !copy->dest_mem) {

      assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) &&
             "no trivial swaps, and only 2 halves of a register");

      /* r0 = extr r0, r0, #16
       *    = (((r0 << 32) | r0) >> 16) & 0xFFFFFFFF
       *    = (((r0 << 32) >> 16) & 0xFFFFFFFF) | (r0 >> 16)
       *    = (r0l << 16) | r0h
       */
      agx_index reg32 = agx_register(copy->dest & ~1, AGX_SIZE_32);
      agx_extr_to(b, reg32, reg32, reg32, agx_immediate(16), 0);
      return;
   }

   agx_index x = copy->dest_mem
                    ? agx_memory_register(copy->dest, copy->src.size)
                    : agx_register(copy->dest, copy->src.size);
   agx_index y = copy->src;

   /* Memory-memory swaps need to be lowered */
   assert(x.memory == y.memory);
   if (x.memory) {
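      /* Memory moves go through registers, so both values are staged in
       * scratch registers before being written back. As in do_copy, these low
       * register indices are assumed to be reserved while memory registers
       * are in use.
       */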
      agx_index temp1 = agx_register(4, copy->src.size);
      agx_index temp2 = agx_register(6, copy->src.size);

      agx_mov_to(b, temp1, x);
      agx_mov_to(b, temp2, y);
      agx_mov_to(b, y, temp1);
      agx_mov_to(b, x, temp2);
      return;
   }

   /* Otherwise, we're swapping GPRs and fall back on an XOR swap. */
   agx_xor_to(b, x, x, y);
   agx_xor_to(b, y, x, y);
   agx_xor_to(b, x, x, y);
}

struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, the physreg is unblocked and can be
    * moved to.
    */
   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];

   struct agx_copy entries[AGX_NUM_MODELED_REGS];
};

static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < agx_size_align_16(entry->src.size); i++) {
      if (ctx->physreg_use_count[entry->dest + i] != 0)
         return true;
   }

   return false;
}

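/* A copy is "real" if its source is a register in the same class as its
 * destination. Only real copies consume a source physreg, so only they can
 * block other copies or participate in cycles.
 */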
static bool
is_real(struct agx_copy *entry)
{
   return entry->src.type == AGX_INDEX_REGISTER &&
          entry->dest_mem == entry->src.memory;
}

/* TODO: Generalize to other bit sizes */
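/* Split one 32-bit copy into two 16-bit copies: the existing entry keeps the
 * low half and a new entry is appended for the high half, so that a copy
 * blocked on only one half can make progress.
 */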
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->src.size) == 2);
   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dest = entry->dest + 1;
   new_entry->dest_mem = entry->dest_mem;
   new_entry->src = entry->src;
   new_entry->src.value += 1;
   new_entry->done = false;
   entry->src.size = AGX_SIZE_16;
   new_entry->src.size = AGX_SIZE_16;
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}

static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
                                   unsigned num_copies, bool cls)
{
   /* First, lower away 64-bit copies into smaller chunks, since we don't have
    * a 64-bit ALU and always want to split.
    */
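   /* Each copy splits into at most two halves; size accordingly. */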
   struct agx_copy *copies2 = calloc(sizeof(copies[0]), num_copies * 2);
   unsigned num_copies2 = 0;

   for (unsigned i = 0; i < num_copies; ++i) {
      struct agx_copy copy = copies[i];

      /* Filter by class */
      if (copy.dest_mem != cls)
         continue;

      assert(copy.dest < AGX_NUM_MODELED_REGS);

      if (copy.src.size == AGX_SIZE_64) {
         copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;

         if (copy.src.type == AGX_INDEX_IMMEDIATE) {
            static_assert(sizeof(copy.src.value) * 8 == 32, "known size");
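            /* The immediate fits in 32 bits (checked above), so the high half
             * of the lowered 64-bit immediate is zero.
             */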
            copy.src.value = 0;
         } else {
            assert(copy.src.type == AGX_INDEX_REGISTER ||
                   copy.src.type == AGX_INDEX_UNIFORM);

            copy.src.value += 2;
         }

         copy.dest += 2;
         copies2[num_copies2++] = copy;
      } else {
         copies2[num_copies2++] = copy;
      }
   }

   copies = copies2;
   num_copies = num_copies2;

   /* Set up the bookkeeping */
   struct copy_ctx _ctx = {.entry_count = num_copies};
   struct copy_ctx *ctx = &_ctx;

   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

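   /* Note that physreg indices here are in 16-bit half-register units,
    * matching agx_size_align_16(), so a 32-bit copy occupies two consecutive
    * slots.
    */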
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];

      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src.value + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }

   /* Try to vectorize aligned 16-bit copies to use 32-bit operations instead */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->src.size != AGX_SIZE_16)
         continue;

      if ((entry->dest & 1) || (entry->src.value & 1))
         continue;

      if (entry->src.type != AGX_INDEX_UNIFORM &&
          entry->src.type != AGX_INDEX_REGISTER)
         continue;

      unsigned next_dest = entry->dest + 1;
      assert(next_dest < ARRAY_SIZE(ctx->physreg_dest) && "aligned reg");

      struct agx_copy *next_copy = ctx->physreg_dest[next_dest];
      if (!next_copy)
         continue;

      assert(next_copy->dest == next_dest && "data structure invariant");
      assert(next_copy->src.size == AGX_SIZE_16 && "unaligned copy");

      if (next_copy->src.type != entry->src.type)
         continue;

      if (next_copy->src.value != (entry->src.value + 1))
         continue;

      /* Vectorize the copies */
      ctx->physreg_dest[next_dest] = entry;
      entry->src.size = AGX_SIZE_32;
      next_copy->done = true;
   }

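   /* Iterate until no more progress is made: first drain every unblocked copy,
    * then split partially blocked 32-bit copies, and repeat. Whatever remains
    * afterwards consists purely of cycles, which are resolved by swaps below.
    */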
   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination isn't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);
            for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src.value + j]--;
               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register, however,
       * because it does not unblock anything and therefore doesn't contribute
       * to making forward progress with step 1. These copies should still be
       * resolved eventually in step 1 because they can't be part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (entry->done || (agx_size_align_16(entry->src.size) != 2))
            continue;

         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *            ^             |
    *            |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *     ^                    |
    *     |--------------------|
    *
    * and this must be the only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    *    n_1 -> ... -> n_i
    *     ^             |
    *     |-------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src.value) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);
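
      /* After the swap, the value that used to sit in our destination now
       * lives in our source registers, so remaining copies that read the
       * destination must be redirected there. That is what the fixups below
       * do, splitting first where a blocking copy only partially overlaps.
       */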

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->src.size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.value <= entry->dest &&
                blocking->src.value + 1 >= entry->dest &&
                agx_size_align_16(blocking->src.size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];
         if (blocking->src.value >= entry->dest &&
             blocking->src.value <
                entry->dest + agx_size_align_16(entry->src.size)) {
            blocking->src.value =
               entry->src.value + (blocking->src.value - entry->dest);
         }
      }

      entry->done = true;
   }

   free(copies2);
}

void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   /* Emit copies for each register class separately, because we don't have
    * register class awareness in the parallel copy lowering data structure.
    */
   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}