/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "ir3_ra.h"
#include "ir3_shader.h"

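/* Source of one copy: an immediate value, a const-file register, or a GPR
 * physreg, selected by the IR3_REG_IMMED/IR3_REG_CONST bits in flags.
 */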
struct copy_src {
   unsigned flags;
   union {
      uint32_t imm;
      physreg_t reg;
      unsigned const_num;
   };
};

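/* One pending element of a parallel copy. dst and src.reg are physregs in
 * half-register units; a full (32-bit) entry covers two consecutive slots
 * (see copy_entry_size()).
 */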
struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done;

   struct copy_src src;
};

static unsigned
copy_entry_size(const struct copy_entry *entry)
{
   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}

static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
   if (reg->flags & IR3_REG_IMMED) {
      return (struct copy_src){
         .flags = IR3_REG_IMMED,
         .imm = reg->uim_val,
      };
   } else if (reg->flags & IR3_REG_CONST) {
      return (struct copy_src){
         .flags = IR3_REG_CONST,
         .const_num = reg->num,
      };
   } else {
      return (struct copy_src){
         .flags = 0,
         .reg = ra_reg_get_physreg(reg) + offset,
      };
   }
}

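/* Emit a single xor.b before instr. do_swap() chains three of these to swap
 * two registers in place when SWZ cannot be used.
 */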
static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
       unsigned src2_num, unsigned flags)
{
   struct ir3_instruction * xor
      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
   ir3_dst_create(xor, dst_num, flags);
   ir3_src_create(xor, src1_num, flags);
   ir3_src_create(xor, src2_num, flags);

   ir3_instr_move_before(xor, instr);
}

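/* Swap the contents of entry->src.reg and entry->dst in place (the source
 * must be a register, not an immediate or const).
 */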
static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      const unsigned half_size =
         (entry->flags & IR3_REG_SHARED) ? RA_SHARED_HALF_SIZE : RA_HALF_SIZE;

      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessible to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessible space.
       */
      if (entry->src.reg >= half_size) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;

         /* Swap src and the temporary */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* If src and dst are within the same full register, then swapping src
          * with tmp above will also move dst to tmp. Account for that here.
          */
         unsigned dst =
            (entry->src.reg & ~1u) == (entry->dst & ~1u) ?
            tmp + (entry->dst & 1u) : entry->dst;

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= half_size) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which enables us to swap two registers
    * in-place. If unsupported we emulate it using the xor trick.
    */
   if (compiler->gen < 5 || (entry->flags & IR3_REG_SHARED)) {
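      /* Classic xor swap: dst ^= src; src ^= dst; dst ^= src. No temporary
       * register is needed.
       */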
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      struct ir3_instruction *swz = ir3_instr_create(instr->block, OPC_SWZ, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}

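/* Emit instructions that copy entry->src (immediate, const, or register) to
 * entry->dst, handling half-regs that live above the directly addressable
 * half-register range.
 */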
static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      const unsigned half_size =
         (entry->flags & IR3_REG_SHARED) ? RA_SHARED_HALF_SIZE : RA_HALF_SIZE;
      if (entry->dst >= half_size) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* As in do_swap(), account for src being swapped with tmp if
          * src and dst are in the same register.
          */
         struct copy_src src = entry->src;
         if (!src.flags && (src.reg & ~1u) == (entry->dst & ~1u))
            src.reg = tmp + (src.reg & 1u);

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      if (!entry->src.flags && entry->src.reg >= half_size) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* cov.u32u16 dst, src */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* shr.b dst, src, (16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_MOV, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   if (entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))
      ir3_src_create(mov, INVALID_REG, (entry->flags & IR3_REG_HALF) | entry->src.flags);
   else
      ir3_src_create(mov, src_num, entry->flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}

struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};

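/* A copy is blocked while any slot of its destination is still pending as the
 * source of some other copy.
 */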
static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
      if (ctx->physreg_use_count[entry->dst + i] != 0)
         return true;
   }

   return false;
}

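/* Narrow a blocked full (32-bit) copy into two half copies so that the
 * unblocked 16-bit half can be emitted on its own. The original entry becomes
 * the low half and a new entry is appended for the high half.
 */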
static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}

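/* Resolve the pending copies in ctx for a single register file:
 *
 *  1. Emit every copy whose destination is not blocked, repeating until no
 *     more progress can be made.
 *  2. Split full copies that are blocked on only one half, then go back to 1.
 *  3. Whatever remains forms cycles; break them with swaps.
 */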
static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies that are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || entry->flags & IR3_REG_HALF)
            continue;

         if (((ctx->physreg_use_count[entry->dst] == 0 ||
               ctx->physreg_use_count[entry->dst + 1] == 0)) &&
             !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some node other than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    *  then n_2 would be the destination of 2 copies, which is illegal
    *  (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    *  and this must be the only cycle n_1 is involved in, because any other
    *  path starting from n_1 would also have to end in n_1, resulting in
    *  a node somewhere along the way being the destination of 2 copies
    *  when the 2 paths merge.
    *
    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    *  out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    *  and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }

      entry->done = true;
   }
}

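/* Copies in different register files never interfere with each other, so
 * shared, half, and full copies are resolved as independent problems (half
 * and full together when the variant uses mergedregs).
 */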
static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
              struct copy_entry *entries, unsigned entry_count)
{
   struct copy_ctx ctx;

   /* handle shared copies first */
   ctx.entry_count = 0;
   for (unsigned i = 0; i < entry_count; i++) {
      if (entries[i].flags & IR3_REG_SHARED)
         ctx.entries[ctx.entry_count++] = entries[i];
   }
   _handle_copies(v->compiler, instr, &ctx);

   if (v->mergedregs) {
      /* Half regs and full regs are in the same file, so handle everything
       * at once.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & IR3_REG_SHARED))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->compiler, instr, &ctx);
   } else {
      /* There may be both half copies and full copies, so we have to split
       * them up since they don't interfere.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (entries[i].flags & IR3_REG_HALF)
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->compiler, instr, &ctx);

      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->compiler, instr, &ctx);
   }
}

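/* Lower the parallelcopy/collect/split/phi meta instructions into real movs
 * and swaps now that registers have been assigned, and work around a HW
 * issue with 16-bit non-shared to 16-bit shared movs.
 */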
void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            list_del(&instr->node);
         } else if (instr->opc == OPC_MOV) {
            /* There seems to be a HW bug where a mov whose source is 16-bit
             * non-shared and whose destination is 16-bit shared doesn't work
             * when only fibers 64-127 are active. We work around it by instead
             * generating a narrowing mov, which only works with even-numbered
             * registers (i.e. .x and .z), but for odd numbers we can swap the
             * components of the normal src and its even neighbor and then
             * unswap afterwards to make it work for everything.
             */
            if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
                (instr->dsts[0]->flags & IR3_REG_HALF) &&
                !(instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED |
                                           IR3_REG_CONST)) &&
                (instr->srcs[0]->flags & IR3_REG_HALF)) {
               unsigned src_num = instr->srcs[0]->num;
               unsigned dst_num = instr->dsts[0]->num;

               for (unsigned i = 0; i <= instr->repeat; i++,
                    src_num++, dst_num++) {
                  if (src_num & 1) {
                     for (unsigned i = 0; i < 2; i++) {
                        struct ir3_instruction *swz = ir3_instr_create(instr->block, OPC_SWZ, 2, 2);
                        ir3_dst_create(swz, src_num - 1, IR3_REG_HALF);
                        ir3_dst_create(swz, src_num, IR3_REG_HALF);
                        ir3_src_create(swz, src_num, IR3_REG_HALF);
                        ir3_src_create(swz, src_num - 1, IR3_REG_HALF);
                        swz->cat1.dst_type = TYPE_U16;
                        swz->cat1.src_type = TYPE_U16;
                        swz->repeat = 1;
                        if (i == 0)
                           ir3_instr_move_before(swz, instr);
                        else
                           ir3_instr_move_after(swz, instr);
                     }
                  }

                  struct ir3_instruction *mov =
                     ir3_instr_create(instr->block, OPC_MOV, 1, 1);

                  ir3_dst_create(mov, dst_num, instr->dsts[0]->flags);
                  ir3_src_create(mov, src_num / 2,
                                 instr->srcs[0]->flags & ~IR3_REG_HALF);

                  /* Float conversions are banned in this case in
                   * ir3_valid_flags(), so we only have to worry about normal
                   * non-converting moves.
                   */
                  assert(instr->cat1.src_type == TYPE_U16 ||
                         instr->cat1.src_type == TYPE_S16);
                  mov->cat1.src_type = TYPE_U32;
                  mov->cat1.dst_type = TYPE_U16;

                  ir3_instr_move_before(mov, instr);
               }

               list_del(&instr->node);
            }
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}