1*61046927SAndroid Build Coastguard Worker /*
2*61046927SAndroid Build Coastguard Worker * Copyright © 2019 Google, Inc.
3*61046927SAndroid Build Coastguard Worker * SPDX-License-Identifier: MIT
4*61046927SAndroid Build Coastguard Worker *
5*61046927SAndroid Build Coastguard Worker * Authors:
6*61046927SAndroid Build Coastguard Worker * Rob Clark <[email protected]>
7*61046927SAndroid Build Coastguard Worker */
8*61046927SAndroid Build Coastguard Worker
9*61046927SAndroid Build Coastguard Worker #include "ir3.h"
10*61046927SAndroid Build Coastguard Worker
11*61046927SAndroid Build Coastguard Worker #include "ir3_compiler.h"
12*61046927SAndroid Build Coastguard Worker
/* The maximum number of nops we may need to insert between two instructions.
 */
15*61046927SAndroid Build Coastguard Worker #define MAX_NOPS 6
16*61046927SAndroid Build Coastguard Worker
/*
 * Helpers to figure out the necessary delay slots between instructions. Used
 * both in the scheduling pass(es) and in the final pass that inserts any
 * required nops so that the shader program is valid.
 *
 * Note that this needs to work both pre- and post-RA, so we can't assume ssa
 * src iterators work.
 */
25*61046927SAndroid Build Coastguard Worker
26*61046927SAndroid Build Coastguard Worker /* calculate required # of delay slots between the instruction that
27*61046927SAndroid Build Coastguard Worker * assigns a value and the one that consumes
28*61046927SAndroid Build Coastguard Worker */
29*61046927SAndroid Build Coastguard Worker int
ir3_delayslots(struct ir3_compiler * compiler,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n,bool soft)30*61046927SAndroid Build Coastguard Worker ir3_delayslots(struct ir3_compiler *compiler,
31*61046927SAndroid Build Coastguard Worker struct ir3_instruction *assigner,
32*61046927SAndroid Build Coastguard Worker struct ir3_instruction *consumer, unsigned n, bool soft)
33*61046927SAndroid Build Coastguard Worker {
34*61046927SAndroid Build Coastguard Worker /* generally don't count false dependencies, since this can just be
35*61046927SAndroid Build Coastguard Worker * something like a barrier, or SSBO store.
36*61046927SAndroid Build Coastguard Worker */
37*61046927SAndroid Build Coastguard Worker if (__is_false_dep(consumer, n))
38*61046927SAndroid Build Coastguard Worker return 0;
39*61046927SAndroid Build Coastguard Worker
40*61046927SAndroid Build Coastguard Worker /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
41*61046927SAndroid Build Coastguard Worker * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
42*61046927SAndroid Build Coastguard Worker * handled with sync bits
43*61046927SAndroid Build Coastguard Worker */
44*61046927SAndroid Build Coastguard Worker
45*61046927SAndroid Build Coastguard Worker if (is_meta(assigner) || is_meta(consumer))
46*61046927SAndroid Build Coastguard Worker return 0;
47*61046927SAndroid Build Coastguard Worker
48*61046927SAndroid Build Coastguard Worker if (writes_addr0(assigner) || writes_addr1(assigner))
49*61046927SAndroid Build Coastguard Worker return 6;
50*61046927SAndroid Build Coastguard Worker
51*61046927SAndroid Build Coastguard Worker if (soft && needs_ss(compiler, assigner, consumer))
52*61046927SAndroid Build Coastguard Worker return soft_ss_delay(assigner);
53*61046927SAndroid Build Coastguard Worker
54*61046927SAndroid Build Coastguard Worker /* handled via sync flags: */
55*61046927SAndroid Build Coastguard Worker if (needs_ss(compiler, assigner, consumer) ||
56*61046927SAndroid Build Coastguard Worker is_sy_producer(assigner))
57*61046927SAndroid Build Coastguard Worker return 0;
58*61046927SAndroid Build Coastguard Worker
59*61046927SAndroid Build Coastguard Worker /* scalar ALU -> scalar ALU depdendencies where the source and destination
60*61046927SAndroid Build Coastguard Worker * register sizes match don't require any nops.
61*61046927SAndroid Build Coastguard Worker */
62*61046927SAndroid Build Coastguard Worker if (is_scalar_alu(assigner, compiler)) {
63*61046927SAndroid Build Coastguard Worker assert(is_scalar_alu(consumer, compiler));
64*61046927SAndroid Build Coastguard Worker /* If the sizes don't match then we need (ss) and needs_ss() should've
65*61046927SAndroid Build Coastguard Worker * returned above.
66*61046927SAndroid Build Coastguard Worker */
67*61046927SAndroid Build Coastguard Worker assert((assigner->dsts[0]->flags & IR3_REG_HALF) ==
68*61046927SAndroid Build Coastguard Worker (consumer->srcs[n]->flags & IR3_REG_HALF));
69*61046927SAndroid Build Coastguard Worker return 0;
70*61046927SAndroid Build Coastguard Worker }
71*61046927SAndroid Build Coastguard Worker
72*61046927SAndroid Build Coastguard Worker /* As far as we know, shader outputs don't need any delay. */
73*61046927SAndroid Build Coastguard Worker if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
74*61046927SAndroid Build Coastguard Worker return 0;
75*61046927SAndroid Build Coastguard Worker
76*61046927SAndroid Build Coastguard Worker /* assigner must be alu: */
77*61046927SAndroid Build Coastguard Worker if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
78*61046927SAndroid Build Coastguard Worker is_mem(consumer)) {
79*61046927SAndroid Build Coastguard Worker return 6;
80*61046927SAndroid Build Coastguard Worker } else {
81*61046927SAndroid Build Coastguard Worker /* In mergedregs mode, there is an extra 2-cycle penalty when half of
82*61046927SAndroid Build Coastguard Worker * a full-reg is read as a half-reg or when a half-reg is read as a
83*61046927SAndroid Build Coastguard Worker * full-reg.
84*61046927SAndroid Build Coastguard Worker */
85*61046927SAndroid Build Coastguard Worker bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
86*61046927SAndroid Build Coastguard Worker (consumer->srcs[n]->flags & IR3_REG_HALF);
87*61046927SAndroid Build Coastguard Worker unsigned penalty = mismatched_half ? 3 : 0;
88*61046927SAndroid Build Coastguard Worker if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
89*61046927SAndroid Build Coastguard Worker /* special case, 3rd src to cat3 not required on first cycle */
90*61046927SAndroid Build Coastguard Worker return 1 + penalty;
91*61046927SAndroid Build Coastguard Worker } else {
92*61046927SAndroid Build Coastguard Worker return 3 + penalty;
93*61046927SAndroid Build Coastguard Worker }
94*61046927SAndroid Build Coastguard Worker }
95*61046927SAndroid Build Coastguard Worker }
96*61046927SAndroid Build Coastguard Worker
97*61046927SAndroid Build Coastguard Worker unsigned
ir3_delayslots_with_repeat(struct ir3_compiler * compiler,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned assigner_n,unsigned consumer_n)98*61046927SAndroid Build Coastguard Worker ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
99*61046927SAndroid Build Coastguard Worker struct ir3_instruction *assigner,
100*61046927SAndroid Build Coastguard Worker struct ir3_instruction *consumer,
101*61046927SAndroid Build Coastguard Worker unsigned assigner_n, unsigned consumer_n)
102*61046927SAndroid Build Coastguard Worker {
103*61046927SAndroid Build Coastguard Worker unsigned delay = ir3_delayslots(compiler, assigner, consumer, consumer_n, false);
104*61046927SAndroid Build Coastguard Worker
105*61046927SAndroid Build Coastguard Worker struct ir3_register *src = consumer->srcs[consumer_n];
106*61046927SAndroid Build Coastguard Worker struct ir3_register *dst = assigner->dsts[assigner_n];
107*61046927SAndroid Build Coastguard Worker
108*61046927SAndroid Build Coastguard Worker if (assigner->repeat == 0 && consumer->repeat == 0)
109*61046927SAndroid Build Coastguard Worker return delay;
110*61046927SAndroid Build Coastguard Worker
111*61046927SAndroid Build Coastguard Worker unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
112*61046927SAndroid Build Coastguard Worker unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
113*61046927SAndroid Build Coastguard Worker
114*61046927SAndroid Build Coastguard Worker /* If either side is a relative access, we can't really apply most of the
115*61046927SAndroid Build Coastguard Worker * reasoning below because we don't know which component aliases which.
116*61046927SAndroid Build Coastguard Worker * Just bail in this case.
117*61046927SAndroid Build Coastguard Worker */
118*61046927SAndroid Build Coastguard Worker if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
119*61046927SAndroid Build Coastguard Worker return delay;
120*61046927SAndroid Build Coastguard Worker
121*61046927SAndroid Build Coastguard Worker /* MOVMSK seems to require that all users wait until the entire
122*61046927SAndroid Build Coastguard Worker * instruction is finished, so just bail here.
123*61046927SAndroid Build Coastguard Worker */
124*61046927SAndroid Build Coastguard Worker if (assigner->opc == OPC_MOVMSK)
125*61046927SAndroid Build Coastguard Worker return delay;
126*61046927SAndroid Build Coastguard Worker
127*61046927SAndroid Build Coastguard Worker /* TODO: Handle the combination of (rpt) and different component sizes
128*61046927SAndroid Build Coastguard Worker * better like below. This complicates things significantly because the
129*61046927SAndroid Build Coastguard Worker * components don't line up.
130*61046927SAndroid Build Coastguard Worker */
131*61046927SAndroid Build Coastguard Worker if ((src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF))
132*61046927SAndroid Build Coastguard Worker return delay;
133*61046927SAndroid Build Coastguard Worker
134*61046927SAndroid Build Coastguard Worker /* If an instruction has a (rpt), then it acts as a sequence of
135*61046927SAndroid Build Coastguard Worker * instructions, reading its non-(r) sources at each cycle. First, get the
136*61046927SAndroid Build Coastguard Worker * register num for the first instruction where they interfere:
137*61046927SAndroid Build Coastguard Worker */
138*61046927SAndroid Build Coastguard Worker
139*61046927SAndroid Build Coastguard Worker unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
140*61046927SAndroid Build Coastguard Worker
141*61046927SAndroid Build Coastguard Worker /* Now, for that first conflicting half/full register, figure out the
142*61046927SAndroid Build Coastguard Worker * sub-instruction within assigner/consumer it corresponds to. For (r)
143*61046927SAndroid Build Coastguard Worker * sources, this should already return the correct answer of 0. However we
144*61046927SAndroid Build Coastguard Worker * have to special-case the multi-mov instructions, where the
145*61046927SAndroid Build Coastguard Worker * sub-instructions sometimes come from the src/dst indices instead.
146*61046927SAndroid Build Coastguard Worker */
147*61046927SAndroid Build Coastguard Worker unsigned first_src_instr;
148*61046927SAndroid Build Coastguard Worker if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
149*61046927SAndroid Build Coastguard Worker first_src_instr = consumer_n;
150*61046927SAndroid Build Coastguard Worker else
151*61046927SAndroid Build Coastguard Worker first_src_instr = first_num - src->num;
152*61046927SAndroid Build Coastguard Worker
153*61046927SAndroid Build Coastguard Worker unsigned first_dst_instr;
154*61046927SAndroid Build Coastguard Worker if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
155*61046927SAndroid Build Coastguard Worker first_dst_instr = assigner_n;
156*61046927SAndroid Build Coastguard Worker else
157*61046927SAndroid Build Coastguard Worker first_dst_instr = first_num - dst->num;
158*61046927SAndroid Build Coastguard Worker
159*61046927SAndroid Build Coastguard Worker /* The delay we return is relative to the *end* of assigner and the
160*61046927SAndroid Build Coastguard Worker * *beginning* of consumer, because it's the number of nops (or other
161*61046927SAndroid Build Coastguard Worker * things) needed between them. Any instructions after first_dst_instr
162*61046927SAndroid Build Coastguard Worker * subtract from the delay, and so do any instructions before
163*61046927SAndroid Build Coastguard Worker * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
164*61046927SAndroid Build Coastguard Worker * delay to account for that.
165*61046927SAndroid Build Coastguard Worker *
166*61046927SAndroid Build Coastguard Worker * Now, a priori, we need to go through this process for every
167*61046927SAndroid Build Coastguard Worker * conflicting regnum and take the minimum of the offsets to make sure
168*61046927SAndroid Build Coastguard Worker * that the appropriate number of nop's is inserted for every conflicting
169*61046927SAndroid Build Coastguard Worker * pair of sub-instructions. However, as we go to the next conflicting
170*61046927SAndroid Build Coastguard Worker * regnum (if any), the number of instructions after first_dst_instr
171*61046927SAndroid Build Coastguard Worker * decreases by 1 and the number of source instructions before
172*61046927SAndroid Build Coastguard Worker * first_src_instr correspondingly increases by 1, so the offset stays the
173*61046927SAndroid Build Coastguard Worker * same for all conflicting registers.
174*61046927SAndroid Build Coastguard Worker */
175*61046927SAndroid Build Coastguard Worker unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
176*61046927SAndroid Build Coastguard Worker return offset > delay ? 0 : delay - offset;
177*61046927SAndroid Build Coastguard Worker }
178*61046927SAndroid Build Coastguard Worker
179