/*
 * Copyright 2023 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_opcodes.h"

/*
 * AGX control flow instructions predicate out threads. No forward branches are
 * inserted during instruction selection; only backwards branches at the ends
 * of loops exist before this pass. This means that, prior to this pass, we
 * would always execute both sides of an if.
 *
 * To improve performance, this pass inserts conservative forward branches
 * after if_*cmp, else_*cmp, and break_if_*cmp instructions, jumping the
 * subgroup to the instruction's logical destination if all threads in the
 * subgroup are inactive. This has the effect of skipping over the unexecuted
 * half of an if, which makes this pass critical for control flow performance.
 */
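
/* Illustrative sketch (not real IR syntax; shapes are made up for this
 * example): given a lowered if of the form
 *
 *    if_icmp ...       <- predicates out the inactive side
 *    ...then side...
 *    else_icmp ...     <- flips the predicate
 *    ...else side...
 *    pop_exec
 *
 * this pass may append a jmp_exec_none after the if_icmp and else_icmp, so a
 * subgroup whose threads are all inactive skips to its logical destination
 * instead of marching through predicated-out instructions.
 */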

/* Estimated cost of inserting a jmp_exec_none. This value is tuned to Dolphin
 * ubershaders. It needs to be retuned in lockstep with changes to the cost
 * estimation heuristic.
 */
#define COST_JMP (19)
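
/* Note: COST_JMP is expressed in the same arbitrary units as the
 * per-instruction estimates returned by cost_instr() below, since the
 * heuristic compares it directly against a sum of those estimates.
 */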

static uint32_t
cost_instr(agx_instr *I)
{
   /* TODO: Better heuristic */
   switch (I->op) {
   case AGX_OPCODE_DEVICE_LOAD:
      return 10;
   case AGX_OPCODE_TEXTURE_LOAD:
   case AGX_OPCODE_TEXTURE_SAMPLE:
      return 50;
   default:
      return 1;
   }
}

/*
 * Estimate the cost between the instruction and the branch target. This is an
 * input for our heuristic. The branch is guaranteed to be a forward branch.
 */
static uint32_t
cost_between(agx_context *ctx, agx_block *from, agx_instr *from_I,
             agx_block *target, bool skip_to_end_of_target)
{
   uint32_t cost = 0;

   /* Consider the cost in the rest of this block */
   if (from_I != agx_last_instr(from)) {
      agx_foreach_instr_in_block_from(from, J, from_I) {
         /* If we reach the end, we're done */
         if (from == target && skip_to_end_of_target &&
             J == agx_last_instr(target))
            break;

         cost += cost_instr(J);
      }
   }

   if (from == target)
      return cost;

   /* Consider the cost in the subsequent blocks */
   agx_foreach_block_from(ctx, from, block) {
      if (block == from)
         continue;

      if (block == target && !skip_to_end_of_target)
         break;

      agx_foreach_instr_in_block(block, I) {
         if (block == target && I == agx_last_instr(target))
            break;

         cost += cost_instr(I);
      }

      if (block == target) {
         assert(skip_to_end_of_target);
         break;
      }
   }

   return cost;
}

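/*
 * Insert a jmp_exec_none after from_I if the heuristic below deems it
 * profitable. Per the derivation in the body, inverse_probability models
 * 1 / P(all threads in the subgroup inactive); it is a caller-supplied tuning
 * value, not a measured probability.
 */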
static void
try_insert_jmp(agx_context *ctx, agx_block *from, agx_instr *from_I,
               agx_block *target, bool skip_to_end_of_target,
               unsigned inverse_probability)
{
   agx_builder b = agx_init_builder(ctx, agx_after_instr(from_I));

   /* If the control flow instruction was only inserted for its side effects,
    * there is nowhere to jump. Bail.
    */
   if (!target)
      return;

   /* If we do not insert a jump, we execute the predicated instructions
    * unconditionally, with an expected cost C.
    *
    * If we do insert a jump, then we pay the cost J of the jump, AND if we do
    * not take the jump, also the cost of the instructions C. The expected
    * cost if we insert a jump is therefore J + P(not all threads inactive) C.
    *
    * Therefore, we should insert a jump if:
    *
    *    J + P(not all threads inactive) C < C
    *
    * To model the implicit (i-cache, etc) costs of inserting a jump
    * instruction, we tie break conservatively, comparing with < instead of
    * <=.
    *
    * Rearranging terms, we should NOT insert a jump if:
    *
    *    C < J / P(all threads inactive)
    */
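
   /* Worked example (illustrative): with J = COST_JMP = 19 and
    * inverse_probability = 2, i.e. modelling P(all threads inactive) as 1/2,
    * the jump is only inserted once the estimated cost C of the skippable
    * instructions reaches 19 * 2 = 38 cost units.
    */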
   uint32_t cost_instructions =
      cost_between(ctx, from, from_I, target, skip_to_end_of_target);

   if (cost_instructions < COST_JMP * inverse_probability)
      return;

   /* It looks like inserting a jump will be a win. Do so. */
   if (skip_to_end_of_target)
      agx_jmp_exec_none_after(&b, target);
   else
      agx_jmp_exec_none(&b, target);
}

void
agx_opt_jmp_none(agx_context *ctx)
{
   agx_foreach_block(ctx, blk) {
      /* Handle the beginning of blocks */
      agx_instr *first_ = agx_first_instr(blk);
      if (first_ && (first_->op == AGX_OPCODE_ELSE_ICMP ||
                     first_->op == AGX_OPCODE_ELSE_FCMP)) {

         /* The target of the else is the last block of the else, so we skip
          * to the end of the block (to start execution with the pop_exec).
          */
         try_insert_jmp(ctx, blk, first_, first_->target, true, 2);
      } else if (first_ &&
                 (first_->op == AGX_OPCODE_BREAK_IF_ICMP ||
                  first_->op == AGX_OPCODE_BREAK_IF_FCMP) &&
                 first_->nest == 1) {
         /* The target of the break is the block immediately after the end of
          * the loop, so jump to the end of the previous block to get the
          * appropriate pop_exec.
          *
          * Also, note we only do this for nest=1 to ensure we don't insert
          * jumps inside if-statements inside breaks. We can't insert a
          * jmp_exec_none inside the if because it would break out of the loop
          * for threads that are still running the loop but merely predicated
          * out due to the if-condition. This is similarly why we don't bother
          * handling unconditional break.
          *
          * TODO: This is not optimal, but fixing this would require
          * considerably more CFG gymnastics.
          */
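         /* Illustrative CFG for the nest=1 break case (block names are made
          * up for this sketch):
          *
          *    B_body:  ... break_if_*cmp (target = B_after) ...
          *    ...
          *    B_last:  ... pop_exec   <- we jump here: prev_block(target)
          *    B_after: ...            <- the break's original target
          */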
         agx_block *target = agx_prev_block(first_->target);
         try_insert_jmp(ctx, blk, first_, target, true, 10);
      }

      /* Handle end of block instructions */
      agx_foreach_instr_in_block_rev(blk, I) {
         if (!instr_after_logical_end(I))
            break;

         if (I->op == AGX_OPCODE_IF_ICMP || I->op == AGX_OPCODE_IF_FCMP) {
            try_insert_jmp(ctx, blk, I, I->target, false, 2);
            break;
         }
      }
   }
}