1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <[email protected]>
25 *
26 */
27
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30 #include "brw_fs_live_variables.h"
31 #include "brw_cfg.h"
32 #include <new>
33
34 using namespace brw;
35
36 /** @file
37 *
38 * List scheduling of FS instructions.
39 *
40 * The basic model of the list scheduler is to take a basic block,
41 * compute a DAG of the dependencies (RAW ordering with latency, WAW
42 * ordering with latency, WAR ordering), and make a list of the DAG heads.
43 * Heuristically pick a DAG head, then put all the children that are
44 * now DAG heads into the list of things to schedule.
45 *
46 * The heuristic is the important part. We're trying to be cheap,
47 * since actually computing the optimal scheduling is NP-complete.
48 * What we do is track a "current clock". When we schedule a node, we
49 * update the earliest-unblocked clock time of its children, and
50 * increment the clock. Then, when trying to schedule, we just pick
51 * the earliest-unblocked instruction to schedule.
52 *
53 * Note that often there will be many things which could execute
54 * immediately, and there is a range of heuristic options to choose
55 * from in picking among those.
56 */
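/*
 * A rough sketch of the flow implemented below (pseudocode only, not
 * additional API):
 *
 *    for each basic block:
 *       calculate_deps();                // build the dependency DAG
 *       compute_delays();                // critical path to end of block
 *       compute_exits();                 // earliest reachable exit per node
 *       ...
 *       while (!available.is_empty()) {
 *          chosen = choose_instruction_to_schedule();
 *          schedule(chosen);             // emit it and advance the clock
 *          update_children(chosen);      // unblock its DAG children
 *       }
 */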
57
58 static bool debug = false;
59
60 struct schedule_node_child;
61
62 class schedule_node : public exec_node
63 {
64 public:
65 void set_latency(const struct brw_isa_info *isa);
66
67 fs_inst *inst;
68 schedule_node_child *children;
69 int children_count;
70 int children_cap;
71 int initial_parent_count;
72 int initial_unblocked_time;
73 int latency;
74
75 /**
76 * This is the sum of the instruction's latency plus the maximum delay of
77 * its children, or just the issue_time if it's a leaf node.
78 */
79 int delay;
80
81 /**
82 * Preferred exit node among the (direct or indirect) successors of this
83 * node. Among the scheduler nodes blocked by this node, this will be the
84 * one that may cause earliest program termination, or NULL if none of the
85 * successors is an exit node.
86 */
87 schedule_node *exit;
88
89 /**
90 * How many cycles this instruction takes to issue.
91 *
92 * Instructions in gen hardware are handled one simd4 vector at a time,
93 * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2
94 * cycles to dispatch and SIMD16 (compressed) instructions take 4.
95 */
96 int issue_time;
97
98 /* Temporary data used during the scheduling process. */
99 struct {
100 int parent_count;
101 int unblocked_time;
102
103 /**
104 * Which iteration of pushing groups of children onto the candidates list
105 * this node was a part of.
106 */
107 unsigned cand_generation;
108 } tmp;
109 };
110
111 struct schedule_node_child {
112 schedule_node *n;
113 int effective_latency;
114 };
115
116 static inline void
117 reset_node_tmp(schedule_node *n)
118 {
119 n->tmp.parent_count = n->initial_parent_count;
120 n->tmp.unblocked_time = n->initial_unblocked_time;
121 n->tmp.cand_generation = 0;
122 }
123
124 /**
125 * Lower bound of the scheduling time after which one of the instructions
126 * blocked by this node may lead to program termination.
127 *
128 * exit_unblocked_time() determines a strict partial ordering relation '«' on
129 * the set of scheduler nodes as follows:
130 *
131 * n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
132 *
133 * which can be used to heuristically order nodes according to how early they
134 * can unblock an exit node and lead to program termination.
135 */
136 static inline int
137 exit_tmp_unblocked_time(const schedule_node *n)
138 {
139 return n->exit ? n->exit->tmp.unblocked_time : INT_MAX;
140 }
141
142 static inline int
143 exit_initial_unblocked_time(const schedule_node *n)
144 {
145 return n->exit ? n->exit->initial_unblocked_time : INT_MAX;
146 }
147
148 void
149 schedule_node::set_latency(const struct brw_isa_info *isa)
150 {
151 switch (inst->opcode) {
152 case BRW_OPCODE_MAD:
153 /* 2 cycles
154 * (since the last two src operands are in different register banks):
155 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
156 *
157 * 3 cycles on IVB, 4 on HSW
158 * (since the last two src operands are in the same register bank):
159 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
160 *
161 * 18 cycles on IVB, 16 on HSW
162 * (since the last two src operands are in different register banks):
163 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
164 * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
165 *
166 * 20 cycles on IVB, 18 on HSW
167 * (since the last two src operands are in the same register bank):
168 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
169 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
170 */
171
172 /* Our register allocator doesn't know about register banks, so use the
173 * higher latency.
174 */
175 latency = 18;
176 break;
177
178 case BRW_OPCODE_LRP:
179 /* 2 cycles
180 * (since the last two src operands are in different register banks):
181 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
182 *
183 * 3 cycles on IVB, 4 on HSW
184 * (since the last two src operands are in the same register bank):
185 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
186 *
187 * 16 cycles on IVB, 14 on HSW
188 * (since the last two src operands are in different register banks):
189 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
190 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
191 *
192 * 16 cycles
193 * (since the last two src operands are in the same register bank):
194 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
195 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
196 */
197
198 /* Our register allocator doesn't know about register banks, so use the
199 * higher latency.
200 */
201 latency = 14;
202 break;
203
204 case SHADER_OPCODE_RCP:
205 case SHADER_OPCODE_RSQ:
206 case SHADER_OPCODE_SQRT:
207 case SHADER_OPCODE_LOG2:
208 case SHADER_OPCODE_EXP2:
209 case SHADER_OPCODE_SIN:
210 case SHADER_OPCODE_COS:
211 /* 2 cycles:
212 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
213 *
214 * 18 cycles:
215 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
216 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
217 *
218 * Same for exp2, log2, rsq, sqrt, sin, cos.
219 */
220 latency = 16;
221 break;
222
223 case SHADER_OPCODE_POW:
224 /* 2 cycles:
225 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
226 *
227 * 26 cycles:
228 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
229 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
230 */
231 latency = 24;
232 break;
233
234 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
235 /* testing using varying-index pull constants:
236 *
237 * 16 cycles:
238 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
239 * send(8) g4<1>F g4<8,8,1>D
240 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
241 *
242 * ~480 cycles:
243 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
244 * send(8) g4<1>F g4<8,8,1>D
245 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
246 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
247 *
248 * ~620 cycles:
249 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
250 * send(8) g4<1>F g4<8,8,1>D
251 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
252 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
253 * send(8) g4<1>F g4<8,8,1>D
254 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
255 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
256 *
257 * So, if it's cache-hot, it's about 140. If it's cache cold, it's
258 * about 460. We expect to mostly be cache hot, so pick something more
259 * in that direction.
260 */
261 latency = 200;
262 break;
263
264 case SHADER_OPCODE_SEND:
265 switch (inst->sfid) {
266 case BRW_SFID_SAMPLER: {
267 unsigned msg_type = (inst->desc >> 12) & 0x1f;
268 switch (msg_type) {
269 case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO:
270 case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO:
271 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
272 * cycles (n=15):
273 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
274 * send(8) g6<1>UW g114<8,8,1>F
275 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
276 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
277 *
278 *
279 * Two loads was 535 +/- 30 cycles (n=19):
280 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
281 * send(16) g6<1>UW g114<8,8,1>F
282 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
283 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
284 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
285 * send(16) g8<1>UW g114<8,8,1>F
286 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
287 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
288 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
289 *
290 * Since the only caches that should matter are just the
291 * instruction/state cache containing the surface state,
292 * assume that we always have hot caches.
293 */
294 latency = 100;
295 break;
296
297 default:
298 /* 18 cycles:
299 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
300 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
301 * send(8) g4<1>UW g114<8,8,1>F
302 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
303 *
304 * 697 +/-49 cycles (min 610, n=26):
305 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
306 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
307 * send(8) g4<1>UW g114<8,8,1>F
308 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
309 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
310 *
311 * So our first texture load of the batchbuffer takes ~700
312 * cycles, since the caches are cold at that point.
313 *
314 * 840 +/- 92 cycles (min 720, n=25):
315 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
316 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
317 * send(8) g4<1>UW g114<8,8,1>F
318 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
319 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
320 * send(8) g4<1>UW g114<8,8,1>F
321 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
322 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
323 *
324 * On the second load, it takes just an extra ~140 cycles, and
325 * after accounting for the 14 cycles of the MOV's latency, that
326 * makes ~130.
327 *
328 * 683 +/- 49 cycles (min = 602, n=47):
329 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
330 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
331 * send(8) g4<1>UW g114<8,8,1>F
332 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
333 * send(8) g50<1>UW g114<8,8,1>F
334 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
335 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
336 *
337 * The unit appears to be pipelined, since this matches up with
338 * the cache-cold case, despite there being two loads here. If
339 * you replace the g4 in the MOV to null with g50, it's still
340 * 693 +/- 52 (n=39).
341 *
342 * So, take some number between the cache-hot 140 cycles and the
343 * cache-cold 700 cycles. No particular tuning was done on this.
344 *
345 * I haven't done significant testing of the non-TEX opcodes.
346 * TXL at least looked about the same as TEX.
347 */
348 latency = 200;
349 break;
350 }
351 break;
352 }
353
354 case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
355 /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
356 latency = 200;
357 break;
358
359 case GFX6_SFID_DATAPORT_RENDER_CACHE:
360 switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) {
361 case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE:
362 case GFX7_DATAPORT_RC_TYPED_SURFACE_READ:
363 /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
364 latency = 600;
365 break;
366
367 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
368 /* See also SHADER_OPCODE_TYPED_ATOMIC */
369 latency = 14000;
370 break;
371
372 case GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE:
373 /* completely fabricated number */
374 latency = 600;
375 break;
376
377 default:
378 unreachable("Unknown render cache message");
379 }
380 break;
381
382 case GFX7_SFID_DATAPORT_DATA_CACHE:
383 switch ((inst->desc >> 14) & 0x1f) {
384 case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ:
385 case GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ:
386 case GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE:
387 /* We have no data for this but assume it's a little faster than
388 * untyped surface read/write.
389 */
390 latency = 200;
391 break;
392
393 case GFX7_DATAPORT_DC_DWORD_SCATTERED_READ:
394 case GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE:
395 case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
396 case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
397 /* We have no data for this but assume it's roughly the same as
398 * untyped surface read/write.
399 */
400 latency = 300;
401 break;
402
403 case GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ:
404 case GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
405 /* Test code:
406 * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
407 * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
408 * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
409 * send(8) g4<1>UD g112<8,8,1>UD
410 * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
411 * .
412 * . [repeats 8 times]
413 * .
414 * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
415 * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
416 * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
417 * send(8) g4<1>UD g112<8,8,1>UD
418 * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
419 *
420 * Running it 100 times as fragment shader on a 128x128 quad
421 * gives an average latency of 583 cycles per surface read,
422 * standard deviation 0.9%.
423 */
424 latency = 600;
425 break;
426
427 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
428 /* Test code:
429 * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q };
430 * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all };
431 * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q };
432 * send(8) g4<1>ud g112<8,8,1>ud
433 * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q };
434 *
435 * Running it 100 times as fragment shader on a 128x128 quad
436 * gives an average latency of 13867 cycles per atomic op,
437 * standard deviation 3%. Note that this is a rather
438 * pessimistic estimate, the actual latency in cases with few
439 * collisions between threads and favorable pipelining has been
440 * seen to be reduced by a factor of 100.
441 */
442 latency = 14000;
443 break;
444
445 default:
446 unreachable("Unknown data cache message");
447 }
448 break;
449
450 case HSW_SFID_DATAPORT_DATA_CACHE_1:
451 switch (brw_dp_desc_msg_type(isa->devinfo, inst->desc)) {
452 case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
453 case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
454 case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
455 case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
456 case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
457 case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
458 case GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
459 case GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
460 case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ:
461 case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE:
462 /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */
463 latency = 300;
464 break;
465
466 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
467 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
468 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
469 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
470 case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
471 case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP:
472 case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP:
473 case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP:
474 case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP:
475 /* See also GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
476 latency = 14000;
477 break;
478
479 default:
480 unreachable("Unknown data cache message");
481 }
482 break;
483
484 case GFX7_SFID_PIXEL_INTERPOLATOR:
485 latency = 50; /* TODO */
486 break;
487
488 case GFX12_SFID_UGM:
489 case GFX12_SFID_TGM:
490 case GFX12_SFID_SLM:
491 switch (lsc_msg_desc_opcode(isa->devinfo, inst->desc)) {
492 case LSC_OP_LOAD:
493 case LSC_OP_STORE:
494 case LSC_OP_LOAD_CMASK:
495 case LSC_OP_STORE_CMASK:
496 latency = 300;
497 break;
498 case LSC_OP_FENCE:
499 case LSC_OP_ATOMIC_INC:
500 case LSC_OP_ATOMIC_DEC:
501 case LSC_OP_ATOMIC_LOAD:
502 case LSC_OP_ATOMIC_STORE:
503 case LSC_OP_ATOMIC_ADD:
504 case LSC_OP_ATOMIC_SUB:
505 case LSC_OP_ATOMIC_MIN:
506 case LSC_OP_ATOMIC_MAX:
507 case LSC_OP_ATOMIC_UMIN:
508 case LSC_OP_ATOMIC_UMAX:
509 case LSC_OP_ATOMIC_CMPXCHG:
510 case LSC_OP_ATOMIC_FADD:
511 case LSC_OP_ATOMIC_FSUB:
512 case LSC_OP_ATOMIC_FMIN:
513 case LSC_OP_ATOMIC_FMAX:
514 case LSC_OP_ATOMIC_FCMPXCHG:
515 case LSC_OP_ATOMIC_AND:
516 case LSC_OP_ATOMIC_OR:
517 case LSC_OP_ATOMIC_XOR:
518 latency = 1400;
519 break;
520 default:
521 unreachable("unsupported new data port message instruction");
522 }
523 break;
524
525 case BRW_SFID_MESSAGE_GATEWAY:
526 case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */
527 case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
528 /* TODO.
529 *
530 * We'll assume for the moment that this is pretty quick as it
531 * doesn't actually return any data.
532 */
533 latency = 200;
534 break;
535
536 case BRW_SFID_URB:
537 latency = 200;
538 break;
539
540 default:
541 unreachable("Unknown SFID");
542 }
543 break;
544
545 case BRW_OPCODE_DPAS:
546 switch (inst->rcount) {
547 case 1:
548 latency = 21;
549 break;
550 case 2:
551 latency = 22;
552 break;
553 case 8:
554 default:
555 latency = 32;
556 break;
557 }
558 break;
559
560 default:
561 /* 2 cycles:
562 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
563 *
564 * 16 cycles:
565 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
566 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
567 */
568 latency = 14;
569 break;
570 }
571 }
572
573 class instruction_scheduler {
574 public:
575 instruction_scheduler(void *mem_ctx, const fs_visitor *s, int grf_count, int hw_reg_count,
576 int block_count, bool post_reg_alloc);
577
578 void add_barrier_deps(schedule_node *n);
579 void add_cross_lane_deps(schedule_node *n);
580 void add_dep(schedule_node *before, schedule_node *after, int latency);
581 void add_dep(schedule_node *before, schedule_node *after);
582
583 void set_current_block(bblock_t *block);
584 void compute_delays();
585 void compute_exits();
586
587 void schedule(schedule_node *chosen);
588 void update_children(schedule_node *chosen);
589
590 void calculate_deps();
591 bool is_compressed(const fs_inst *inst);
592 bool register_needs_barrier(const brw_reg &reg);
593 schedule_node *choose_instruction_to_schedule();
594 int calculate_issue_time(const fs_inst *inst);
595
596 void count_reads_remaining(const fs_inst *inst);
597 void setup_liveness(cfg_t *cfg);
598 void update_register_pressure(const fs_inst *inst);
599 int get_register_pressure_benefit(const fs_inst *inst);
600 void clear_last_grf_write();
601
602 void schedule_instructions();
603 void run(instruction_scheduler_mode mode);
604
605 int grf_index(const brw_reg &reg);
606
607 void *mem_ctx;
608 linear_ctx *lin_ctx;
609
610 schedule_node *nodes;
611 int nodes_len;
612
613 /* Current block being processed. */
614 struct {
615 bblock_t *block;
616
617 /* Range of nodes in the block. End will point to first node
618 * address after the block, i.e. the range is [start, end).
619 */
620 schedule_node *start;
621 schedule_node *end;
622 int len;
623
624 int scheduled;
625
626 unsigned cand_generation;
627 int time;
628 exec_list available;
629 } current;
630
631 bool post_reg_alloc;
632 int grf_count;
633 const fs_visitor *s;
634
635 /**
636 * Last instruction to have written the grf (or a channel in the grf, for the
637 * scalar backend)
638 */
639 schedule_node **last_grf_write;
640
641 unsigned hw_reg_count;
642 int reg_pressure;
643 instruction_scheduler_mode mode;
644
645 /*
646 * The register pressure at the beginning of each basic block.
647 */
648
649 int *reg_pressure_in;
650
651 /*
652 * The virtual GRF's whose range overlaps the beginning of each basic block.
653 */
654
655 BITSET_WORD **livein;
656
657 /*
658 * The virtual GRF's whose range overlaps the end of each basic block.
659 */
660
661 BITSET_WORD **liveout;
662
663 /*
664 * The hardware GRF's whose range overlaps the end of each basic block.
665 */
666
667 BITSET_WORD **hw_liveout;
668
669 /*
670 * Whether we've scheduled a write for this virtual GRF yet.
671 */
672
673 bool *written;
674
675 /*
676 * How many reads we haven't scheduled for this virtual GRF yet.
677 */
678
679 int *reads_remaining;
680
681 /*
682 * How many reads we haven't scheduled for this hardware GRF yet.
683 */
684
685 int *hw_reads_remaining;
686 };
687
688 instruction_scheduler::instruction_scheduler(void *mem_ctx, const fs_visitor *s,
689 int grf_count, int hw_reg_count,
690 int block_count, bool post_reg_alloc)
691 : s(s)
692 {
693 this->mem_ctx = mem_ctx;
694 this->lin_ctx = linear_context(this->mem_ctx);
695 this->grf_count = grf_count;
696 this->post_reg_alloc = post_reg_alloc;
697
698 const unsigned grf_write_scale = MAX_VGRF_SIZE(s->devinfo);
699 this->last_grf_write = linear_zalloc_array(lin_ctx, schedule_node *, grf_count * grf_write_scale);
700
701 this->nodes_len = s->cfg->last_block()->end_ip + 1;
702 this->nodes = linear_zalloc_array(lin_ctx, schedule_node, this->nodes_len);
703
704 const struct brw_isa_info *isa = &s->compiler->isa;
705
706 schedule_node *n = nodes;
707 foreach_block_and_inst(block, fs_inst, inst, s->cfg) {
708 n->inst = inst;
709
710 if (!post_reg_alloc)
711 n->latency = 1;
712 else
713 n->set_latency(isa);
714
715 n++;
716 }
717 assert(n == nodes + nodes_len);
718
719 current.block = NULL;
720 current.start = NULL;
721 current.end = NULL;
722 current.len = 0;
723 current.time = 0;
724 current.cand_generation = 0;
725 current.available.make_empty();
726
727 this->hw_reg_count = hw_reg_count;
728 this->mode = SCHEDULE_NONE;
729 this->reg_pressure = 0;
730
731 if (!post_reg_alloc) {
732 this->reg_pressure_in = linear_zalloc_array(lin_ctx, int, block_count);
733
734 this->livein = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
735 for (int i = 0; i < block_count; i++)
736 this->livein[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
737 BITSET_WORDS(grf_count));
738
739 this->liveout = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
740 for (int i = 0; i < block_count; i++)
741 this->liveout[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
742 BITSET_WORDS(grf_count));
743
744 this->hw_liveout = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
745 for (int i = 0; i < block_count; i++)
746 this->hw_liveout[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
747 BITSET_WORDS(hw_reg_count));
748
749 setup_liveness(s->cfg);
750
751 this->written = linear_alloc_array(lin_ctx, bool, grf_count);
752
753 this->reads_remaining = linear_alloc_array(lin_ctx, int, grf_count);
754
755 this->hw_reads_remaining = linear_alloc_array(lin_ctx, int, hw_reg_count);
756 } else {
757 this->reg_pressure_in = NULL;
758 this->livein = NULL;
759 this->liveout = NULL;
760 this->hw_liveout = NULL;
761 this->written = NULL;
762 this->reads_remaining = NULL;
763 this->hw_reads_remaining = NULL;
764 }
765
766 foreach_block(block, s->cfg) {
767 set_current_block(block);
768
769 for (schedule_node *n = current.start; n < current.end; n++)
770 n->issue_time = calculate_issue_time(n->inst);
771
772 calculate_deps();
773 compute_delays();
774 compute_exits();
775 }
776 }
777
778 static bool
779 is_src_duplicate(const fs_inst *inst, int src)
780 {
781 for (int i = 0; i < src; i++)
782 if (inst->src[i].equals(inst->src[src]))
783 return true;
784
785 return false;
786 }
787
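/**
 * Accumulate, per VGRF and per fixed GRF, how many reads @inst performs.
 * These counts are later used to tell when scheduling an instruction makes
 * one of its source values dead.
 */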
788 void
789 instruction_scheduler::count_reads_remaining(const fs_inst *inst)
790 {
791 assert(reads_remaining);
792
793 for (int i = 0; i < inst->sources; i++) {
794 if (is_src_duplicate(inst, i))
795 continue;
796
797 if (inst->src[i].file == VGRF) {
798 reads_remaining[inst->src[i].nr]++;
799 } else if (inst->src[i].file == FIXED_GRF) {
800 if (inst->src[i].nr >= hw_reg_count)
801 continue;
802
803 for (unsigned j = 0; j < regs_read(inst, i); j++)
804 hw_reads_remaining[inst->src[i].nr + j]++;
805 }
806 }
807 }
808
809 void
810 instruction_scheduler::setup_liveness(cfg_t *cfg)
811 {
812 const fs_live_variables &live = s->live_analysis.require();
813
814 /* First, compute liveness on a per-GRF level using the in/out sets from
815 * liveness calculation.
816 */
817 for (int block = 0; block < cfg->num_blocks; block++) {
818 for (int i = 0; i < live.num_vars; i++) {
819 if (BITSET_TEST(live.block_data[block].livein, i)) {
820 int vgrf = live.vgrf_from_var[i];
821 if (!BITSET_TEST(livein[block], vgrf)) {
822 reg_pressure_in[block] += s->alloc.sizes[vgrf];
823 BITSET_SET(livein[block], vgrf);
824 }
825 }
826
827 if (BITSET_TEST(live.block_data[block].liveout, i))
828 BITSET_SET(liveout[block], live.vgrf_from_var[i]);
829 }
830 }
831
832 /* Now, extend the live in/live out sets for when a range crosses a block
833 * boundary, which matches what our register allocator/interference code
834 * does to account for force_writemask_all and incompatible exec_mask's.
835 */
836 for (int block = 0; block < cfg->num_blocks - 1; block++) {
837 for (int i = 0; i < grf_count; i++) {
838 if (live.vgrf_start[i] <= cfg->blocks[block]->end_ip &&
839 live.vgrf_end[i] >= cfg->blocks[block + 1]->start_ip) {
840 if (!BITSET_TEST(livein[block + 1], i)) {
841 reg_pressure_in[block + 1] += s->alloc.sizes[i];
842 BITSET_SET(livein[block + 1], i);
843 }
844
845 BITSET_SET(liveout[block], i);
846 }
847 }
848 }
849
850 int payload_last_use_ip[hw_reg_count];
851 s->calculate_payload_ranges(true, hw_reg_count, payload_last_use_ip);
852
853 for (unsigned i = 0; i < hw_reg_count; i++) {
854 if (payload_last_use_ip[i] == -1)
855 continue;
856
857 for (int block = 0; block < cfg->num_blocks; block++) {
858 if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
859 reg_pressure_in[block]++;
860
861 if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
862 BITSET_SET(hw_liveout[block], i);
863 }
864 }
865 }
866
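/**
 * Book-keeping after an instruction has been scheduled: mark its destination
 * VGRF as written and decrement the outstanding read counts of its sources.
 */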
867 void
868 instruction_scheduler::update_register_pressure(const fs_inst *inst)
869 {
870 assert(reads_remaining);
871
872 if (inst->dst.file == VGRF) {
873 written[inst->dst.nr] = true;
874 }
875
876 for (int i = 0; i < inst->sources; i++) {
877 if (is_src_duplicate(inst, i))
878 continue;
879
880 if (inst->src[i].file == VGRF) {
881 reads_remaining[inst->src[i].nr]--;
882 } else if (inst->src[i].file == FIXED_GRF &&
883 inst->src[i].nr < hw_reg_count) {
884 for (unsigned off = 0; off < regs_read(inst, i); off++)
885 hw_reads_remaining[inst->src[i].nr + off]--;
886 }
887 }
888 }
889
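/**
 * Estimate how scheduling @inst next would change register pressure, in GRFs:
 * negative when it begins a new live range (first write of a VGRF that is not
 * live-in), positive for every source register for which this is the last
 * outstanding read and which is not live-out of the block.
 */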
890 int
891 instruction_scheduler::get_register_pressure_benefit(const fs_inst *inst)
892 {
893 int benefit = 0;
894 const int block_idx = current.block->num;
895
896 if (inst->dst.file == VGRF) {
897 if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
898 !written[inst->dst.nr])
899 benefit -= s->alloc.sizes[inst->dst.nr];
900 }
901
902 for (int i = 0; i < inst->sources; i++) {
903 if (is_src_duplicate(inst, i))
904 continue;
905
906 if (inst->src[i].file == VGRF &&
907 !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
908 reads_remaining[inst->src[i].nr] == 1)
909 benefit += s->alloc.sizes[inst->src[i].nr];
910
911 if (inst->src[i].file == FIXED_GRF &&
912 inst->src[i].nr < hw_reg_count) {
913 for (unsigned off = 0; off < regs_read(inst, i); off++) {
914 int reg = inst->src[i].nr + off;
915 if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
916 hw_reads_remaining[reg] == 1) {
917 benefit++;
918 }
919 }
920 }
921 }
922
923 return benefit;
924 }
925
926 void
927 instruction_scheduler::set_current_block(bblock_t *block)
928 {
929 current.block = block;
930 current.start = nodes + block->start_ip;
931 current.len = block->end_ip - block->start_ip + 1;
932 current.end = current.start + current.len;
933 current.time = 0;
934 current.scheduled = 0;
935 current.cand_generation = 1;
936 }
937
938 /** Computation of the delay member of each node. */
939 void
940 instruction_scheduler::compute_delays()
941 {
942 for (schedule_node *n = current.end - 1; n >= current.start; n--) {
943 if (!n->children_count) {
944 n->delay = n->issue_time;
945 } else {
946 for (int i = 0; i < n->children_count; i++) {
947 assert(n->children[i].n->delay);
948 n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
949 }
950 }
951 }
952 }
953
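/** Computation of the exit member of each node (see schedule_node::exit). */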
954 void
955 instruction_scheduler::compute_exits()
956 {
957 /* Calculate a lower bound of the scheduling time of each node in the
958 * graph. This is analogous to the node's critical path but calculated
959 * from the top instead of from the bottom of the block.
960 */
961 for (schedule_node *n = current.start; n < current.end; n++) {
962 for (int i = 0; i < n->children_count; i++) {
963 schedule_node_child *child = &n->children[i];
964 child->n->initial_unblocked_time =
965 MAX2(child->n->initial_unblocked_time,
966 n->initial_unblocked_time + n->issue_time + child->effective_latency);
967 }
968 }
969
970 /* Calculate the exit of each node by induction based on the exit nodes of
971 * its children. The preferred exit of a node is the one among the exit
972 * nodes of its children which can be unblocked first according to the
973 * optimistic unblocked time estimate calculated above.
974 */
975 for (schedule_node *n = current.end - 1; n >= current.start; n--) {
976 n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL);
977
978 for (int i = 0; i < n->children_count; i++) {
979 if (exit_initial_unblocked_time(n->children[i].n) < exit_initial_unblocked_time(n))
980 n->exit = n->children[i].n->exit;
981 }
982 }
983 }
984
985 /**
986 * Add a dependency between two instruction nodes.
987 *
988 * The @after node will be scheduled after @before. We will try to
989 * schedule it @latency cycles after @before, but no guarantees there.
990 */
991 void
992 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
993 int latency)
994 {
995 if (!before || !after)
996 return;
997
998 assert(before != after);
999
1000 for (int i = 0; i < before->children_count; i++) {
1001 schedule_node_child *child = &before->children[i];
1002 if (child->n == after) {
1003 child->effective_latency = MAX2(child->effective_latency, latency);
1004 return;
1005 }
1006 }
1007
1008 if (before->children_cap <= before->children_count) {
1009 if (before->children_cap < 16)
1010 before->children_cap = 16;
1011 else
1012 before->children_cap *= 2;
1013
1014 before->children = reralloc(mem_ctx, before->children,
1015 schedule_node_child,
1016 before->children_cap);
1017 }
1018
1019 schedule_node_child *child = &before->children[before->children_count];
1020 child->n = after;
1021 child->effective_latency = latency;
1022 before->children_count++;
1023 after->initial_parent_count++;
1024 }
1025
1026 void
1027 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
1028 {
1029 if (!before)
1030 return;
1031
1032 add_dep(before, after, before->latency);
1033 }
1034
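/* Instructions that must not be reordered relative to anything else in the
 * block: control flow, anything with side effects, and the HALT target.
 */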
1035 static bool
1036 is_scheduling_barrier(const fs_inst *inst)
1037 {
1038 return inst->opcode == SHADER_OPCODE_HALT_TARGET ||
1039 inst->is_control_flow() ||
1040 inst->has_side_effects();
1041 }
1042
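/* Whether @inst may read data written by other SIMD lanes, either through one
 * of the explicitly cross-lane opcodes or through a stride-0 VGRF source.
 */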
1043 static bool
1044 has_cross_lane_access(const fs_inst *inst)
1045 {
1046 /* FINISHME:
1047 *
1048 * This function is likely incomplete in terms of identifying cross-lane
1049 * accesses.
1050 */
1051 if (inst->opcode == SHADER_OPCODE_BROADCAST ||
1052 inst->opcode == SHADER_OPCODE_CLUSTER_BROADCAST ||
1053 inst->opcode == SHADER_OPCODE_SHUFFLE ||
1054 inst->opcode == FS_OPCODE_LOAD_LIVE_CHANNELS ||
1055 inst->opcode == SHADER_OPCODE_LOAD_LIVE_CHANNELS ||
1056 inst->opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1057 inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL)
1058 return true;
1059
1060 for (unsigned s = 0; s < inst->sources; s++) {
1061 if (inst->src[s].file == VGRF) {
1062 if (inst->src[s].stride == 0)
1063 return true;
1064 }
1065 }
1066
1067 return false;
1068 }
1069
1070 /**
1071 * Some register accesses need dependencies on other instructions.
1072 */
1073 bool
1074 instruction_scheduler::register_needs_barrier(const brw_reg &reg)
1075 {
1076 if (reg.file != ARF || reg.is_null())
1077 return false;
1078
1079 /* If you look at SR register layout, there is nothing in there that
1080 * depends on other instructions. This is just fixed dispatch information.
1081 *
1082 * ATSM PRMs, Volume 9: Render Engine, State Register Fields :
1083 * sr0.0:
1084 * - 0:2 TID
1085 * - 4:13 Slice, DSS, Subslice, EU IDs
1086 * - 20:22 Priority
1087 * - 23:23 Priority class
1088 * - 24:27 FFID
1089 * sr0.1:
1090 * - 0:5 IEEE Exception
1091 * - 21:31 FFTID
1092 * sr0.2:
1093 * - 0:31 Dispatch Mask
1094 * sr0.3:
1095 * - 0:31 Vector Mask
1096 */
1097 if (reg.nr == BRW_ARF_STATE)
1098 return false;
1099
1100 return true;
1101 }
1102
1103 /**
1104 * Sometimes we really want this node to execute after everything that
1105 * was before it and before everything that follows it. This adds
1106 * the deps to do so.
1107 */
1108 void
1109 instruction_scheduler::add_barrier_deps(schedule_node *n)
1110 {
1111 for (schedule_node *prev = n - 1; prev >= current.start; prev--) {
1112 add_dep(prev, n, 0);
1113 if (is_scheduling_barrier(prev->inst))
1114 break;
1115 }
1116
1117 for (schedule_node *next = n + 1; next < current.end; next++) {
1118 add_dep(n, next, 0);
1119 if (is_scheduling_barrier(next->inst))
1120 break;
1121 }
1122 }
1123
1124 /**
1125 * Because some instructions like HALT can disable lanes, scheduling them
1126 * prior to a cross-lane access should not be allowed; otherwise we could
1127 * end up with later instructions accessing uninitialized data.
1128 */
1129 void
1130 instruction_scheduler::add_cross_lane_deps(schedule_node *n)
1131 {
1132 for (schedule_node *prev = n - 1; prev >= current.start; prev--) {
1133 if (has_cross_lane_access((fs_inst*)prev->inst))
1134 add_dep(prev, n, 0);
1135 }
1136 }
1137
1138 /* instruction scheduling needs to be aware of when an MRF write
1139 * actually writes 2 MRFs.
1140 */
1141 bool
1142 instruction_scheduler::is_compressed(const fs_inst *inst)
1143 {
1144 return inst->exec_size == 16;
1145 }
1146
1147 /* Clears last_grf_write to be ready to start calculating deps for a block
1148 * again.
1149 *
1150 * Since pre-ra grf_count scales with instructions, and instructions scale with
1151 * BBs, we don't want to memset all of last_grf_write per block or you'll end up
1152 * O(n^2) with number of blocks. For shaders using softfp64, we get a *lot* of
1153 * blocks.
1154 *
1155 * We don't bother being careful for post-ra, since then grf_count doesn't scale
1156 * with instructions.
1157 */
1158 void
1159 instruction_scheduler::clear_last_grf_write()
1160 {
1161 if (!post_reg_alloc) {
1162 for (schedule_node *n = current.start; n < current.end; n++) {
1163 fs_inst *inst = (fs_inst *)n->inst;
1164
1165 if (inst->dst.file == VGRF) {
1166 /* Don't bother being careful with regs_written(), quicker to just clear 2 cachelines. */
1167 memset(&last_grf_write[inst->dst.nr * MAX_VGRF_SIZE(s->devinfo)], 0,
1168 sizeof(*last_grf_write) * MAX_VGRF_SIZE(s->devinfo));
1169 }
1170 }
1171 } else {
1172 memset(last_grf_write, 0,
1173 sizeof(*last_grf_write) * grf_count * MAX_VGRF_SIZE(s->devinfo));
1174 }
1175 }
1176
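/* Index into last_grf_write[]: pre-RA each REG_SIZE chunk of a VGRF gets its
 * own slot (hence the MAX_VGRF_SIZE scaling); post-RA the hardware GRF number
 * is used directly.
 */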
1177 int
1178 instruction_scheduler::grf_index(const brw_reg &reg)
1179 {
1180 if (post_reg_alloc)
1181 return reg.nr;
1182 return reg.nr * MAX_VGRF_SIZE(s->devinfo) + reg.offset / REG_SIZE;
1183 }
1184
1185 void
1186 instruction_scheduler::calculate_deps()
1187 {
1188 /* Pre-register-allocation, this tracks the last write per VGRF offset.
1189 * After register allocation, reg_offsets are gone and we track individual
1190 * GRF registers.
1191 */
1192 schedule_node *last_conditional_mod[8] = {};
1193 schedule_node *last_accumulator_write = NULL;
1194 /* Fixed HW registers are assumed to be separate from the virtual
1195 * GRFs, so they can be tracked separately. We don't really write
1196 * to fixed GRFs much, so don't bother tracking them on a more
1197 * granular level.
1198 */
1199 schedule_node *last_fixed_grf_write = NULL;
1200
1201 /* top-to-bottom dependencies: RAW and WAW. */
1202 for (schedule_node *n = current.start; n < current.end; n++) {
1203 fs_inst *inst = (fs_inst *)n->inst;
1204
1205 if (is_scheduling_barrier(inst))
1206 add_barrier_deps(n);
1207
1208 if (inst->opcode == BRW_OPCODE_HALT ||
1209 inst->opcode == SHADER_OPCODE_HALT_TARGET)
1210 add_cross_lane_deps(n);
1211
1212 /* read-after-write deps. */
1213 for (int i = 0; i < inst->sources; i++) {
1214 if (inst->src[i].file == VGRF) {
1215 for (unsigned r = 0; r < regs_read(inst, i); r++)
1216 add_dep(last_grf_write[grf_index(inst->src[i]) + r], n);
1217 } else if (inst->src[i].file == FIXED_GRF) {
1218 if (post_reg_alloc) {
1219 for (unsigned r = 0; r < regs_read(inst, i); r++)
1220 add_dep(last_grf_write[inst->src[i].nr + r], n);
1221 } else {
1222 add_dep(last_fixed_grf_write, n);
1223 }
1224 } else if (inst->src[i].is_accumulator()) {
1225 add_dep(last_accumulator_write, n);
1226 } else if (register_needs_barrier(inst->src[i])) {
1227 add_barrier_deps(n);
1228 }
1229 }
1230
1231
1232 if (const unsigned mask = inst->flags_read(s->devinfo)) {
1233 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1234
1235 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1236 if (mask & (1 << i))
1237 add_dep(last_conditional_mod[i], n);
1238 }
1239 }
1240
1241 if (inst->reads_accumulator_implicitly()) {
1242 add_dep(last_accumulator_write, n);
1243 }
1244
1245 /* write-after-write deps. */
1246 if (inst->dst.file == VGRF) {
1247 int grf_idx = grf_index(inst->dst);
1248 for (unsigned r = 0; r < regs_written(inst); r++) {
1249 add_dep(last_grf_write[grf_idx + r], n);
1250 last_grf_write[grf_idx + r] = n;
1251 }
1252 } else if (inst->dst.file == FIXED_GRF) {
1253 if (post_reg_alloc) {
1254 for (unsigned r = 0; r < regs_written(inst); r++) {
1255 add_dep(last_grf_write[inst->dst.nr + r], n);
1256 last_grf_write[inst->dst.nr + r] = n;
1257 }
1258 } else {
1259 add_dep(last_fixed_grf_write, n);
1260 last_fixed_grf_write = n;
1261 }
1262 } else if (inst->dst.is_accumulator()) {
1263 add_dep(last_accumulator_write, n);
1264 last_accumulator_write = n;
1265 } else if (register_needs_barrier(inst->dst)) {
1266 add_barrier_deps(n);
1267 }
1268
1269 if (const unsigned mask = inst->flags_written(s->devinfo)) {
1270 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1271
1272 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1273 if (mask & (1 << i)) {
1274 add_dep(last_conditional_mod[i], n, 0);
1275 last_conditional_mod[i] = n;
1276 }
1277 }
1278 }
1279
1280 if (inst->writes_accumulator_implicitly(s->devinfo) &&
1281 !inst->dst.is_accumulator()) {
1282 add_dep(last_accumulator_write, n);
1283 last_accumulator_write = n;
1284 }
1285 }
1286
1287 clear_last_grf_write();
1288
1289 /* bottom-to-top dependencies: WAR */
1290 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
1291 last_accumulator_write = NULL;
1292 last_fixed_grf_write = NULL;
1293
1294 for (schedule_node *n = current.end - 1; n >= current.start; n--) {
1295 fs_inst *inst = (fs_inst *)n->inst;
1296
1297 /* write-after-read deps. */
1298 for (int i = 0; i < inst->sources; i++) {
1299 if (inst->src[i].file == VGRF) {
1300 for (unsigned r = 0; r < regs_read(inst, i); r++)
1301 add_dep(n, last_grf_write[grf_index(inst->src[i]) + r], 0);
1302 } else if (inst->src[i].file == FIXED_GRF) {
1303 if (post_reg_alloc) {
1304 for (unsigned r = 0; r < regs_read(inst, i); r++)
1305 add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
1306 } else {
1307 add_dep(n, last_fixed_grf_write, 0);
1308 }
1309 } else if (inst->src[i].is_accumulator()) {
1310 add_dep(n, last_accumulator_write, 0);
1311 } else if (register_needs_barrier(inst->src[i])) {
1312 add_barrier_deps(n);
1313 }
1314 }
1315
1316 if (const unsigned mask = inst->flags_read(s->devinfo)) {
1317 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1318
1319 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1320 if (mask & (1 << i))
1321 add_dep(n, last_conditional_mod[i]);
1322 }
1323 }
1324
1325 if (inst->reads_accumulator_implicitly()) {
1326 add_dep(n, last_accumulator_write);
1327 }
1328
1329 /* Update the things this instruction wrote, so earlier reads
1330 * can mark this as WAR dependency.
1331 */
1332 if (inst->dst.file == VGRF) {
1333 for (unsigned r = 0; r < regs_written(inst); r++)
1334 last_grf_write[grf_index(inst->dst) + r] = n;
1335 } else if (inst->dst.file == FIXED_GRF) {
1336 if (post_reg_alloc) {
1337 for (unsigned r = 0; r < regs_written(inst); r++)
1338 last_grf_write[inst->dst.nr + r] = n;
1339 } else {
1340 last_fixed_grf_write = n;
1341 }
1342 } else if (inst->dst.is_accumulator()) {
1343 last_accumulator_write = n;
1344 } else if (register_needs_barrier(inst->dst)) {
1345 add_barrier_deps(n);
1346 }
1347
1348 if (const unsigned mask = inst->flags_written(s->devinfo)) {
1349 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1350
1351 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1352 if (mask & (1 << i))
1353 last_conditional_mod[i] = n;
1354 }
1355 }
1356
1357 if (inst->writes_accumulator_implicitly(s->devinfo)) {
1358 last_accumulator_write = n;
1359 }
1360 }
1361
1362 clear_last_grf_write();
1363 }
1364
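/**
 * Pick the next node from current.available. In SCHEDULE_PRE/SCHEDULE_POST
 * mode this favors the earliest-unblocked instruction (preferring ones that
 * unblock an early exit); in the other pre-RA modes it favors reducing
 * register pressure.
 */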
1365 schedule_node *
1366 instruction_scheduler::choose_instruction_to_schedule()
1367 {
1368 schedule_node *chosen = NULL;
1369
1370 if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
1371 int chosen_time = 0;
1372
1373 /* Of the instructions ready to execute or the closest to being ready,
1374 * choose the one most likely to unblock an early program exit, or
1375 * otherwise the oldest one.
1376 */
1377 foreach_in_list(schedule_node, n, &current.available) {
1378 if (!chosen ||
1379 exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen) ||
1380 (exit_tmp_unblocked_time(n) == exit_tmp_unblocked_time(chosen) &&
1381 n->tmp.unblocked_time < chosen_time)) {
1382 chosen = n;
1383 chosen_time = n->tmp.unblocked_time;
1384 }
1385 }
1386 } else {
1387 int chosen_register_pressure_benefit = 0;
1388
1389 /* Before register allocation, we don't care about the latencies of
1390 * instructions. All we care about is reducing live intervals of
1391 * variables so that we can avoid register spilling, or get SIMD16
1392 * shaders which naturally do a better job of hiding instruction
1393 * latency.
1394 */
1395 foreach_in_list(schedule_node, n, &current.available) {
1396 if (!chosen) {
1397 chosen = n;
1398 chosen_register_pressure_benefit =
1399 get_register_pressure_benefit(chosen->inst);
1400 continue;
1401 }
1402
1403 /* Most important: If we can definitely reduce register pressure, do
1404 * so immediately.
1405 */
1406 int register_pressure_benefit = get_register_pressure_benefit(n->inst);
1407
1408 if (register_pressure_benefit > 0 &&
1409 register_pressure_benefit > chosen_register_pressure_benefit) {
1410 chosen = n;
1411 chosen_register_pressure_benefit = register_pressure_benefit;
1412 continue;
1413 } else if (chosen_register_pressure_benefit > 0 &&
1414 (register_pressure_benefit <
1415 chosen_register_pressure_benefit)) {
1416 continue;
1417 }
1418
1419 if (mode == SCHEDULE_PRE_LIFO) {
1420 /* Prefer instructions that recently became available for
1421 * scheduling. These are the things that are most likely to
1422 * (eventually) make a variable dead and reduce register pressure.
1423 * Typical register pressure estimates don't work for us because
1424 * most of our pressure comes from texturing, where no single
1425 * instruction to schedule will make a vec4 value dead.
1426 */
1427 if (n->tmp.cand_generation > chosen->tmp.cand_generation) {
1428 chosen = n;
1429 chosen_register_pressure_benefit = register_pressure_benefit;
1430 continue;
1431 } else if (n->tmp.cand_generation < chosen->tmp.cand_generation) {
1432 continue;
1433 }
1434 }
1435
1436 /* For instructions pushed on the cands list at the same time, prefer
1437 * the one with the highest delay to the end of the program. This is
1438 * most likely to have its values able to be consumed first (such as
1439 * for a large tree of lowered ubo loads, which appear reversed in
1440 * the instruction stream with respect to when they can be consumed).
1441 */
1442 if (n->delay > chosen->delay) {
1443 chosen = n;
1444 chosen_register_pressure_benefit = register_pressure_benefit;
1445 continue;
1446 } else if (n->delay < chosen->delay) {
1447 continue;
1448 }
1449
1450 /* Prefer the node most likely to unblock an early program exit.
1451 */
1452 if (exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen)) {
1453 chosen = n;
1454 chosen_register_pressure_benefit = register_pressure_benefit;
1455 continue;
1456 } else if (exit_tmp_unblocked_time(n) > exit_tmp_unblocked_time(chosen)) {
1457 continue;
1458 }
1459
1460 /* If all other metrics are equal, we prefer the first instruction in
1461 * the list (program execution).
1462 */
1463 }
1464 }
1465
1466 return chosen;
1467 }
1468
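/* Issue time in cycles: 2 for a normal instruction, 4 for a compressed
 * (SIMD16) one, plus an estimated penalty when the instruction is subject to
 * a register bank conflict (only known post-RA, once grf_used is set).
 */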
1469 int
1470 instruction_scheduler::calculate_issue_time(const fs_inst *inst)
1471 {
1472 const struct brw_isa_info *isa = &s->compiler->isa;
1473 const unsigned overhead = s->grf_used && has_bank_conflict(isa, inst) ?
1474 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE) : 0;
1475 if (is_compressed(inst))
1476 return 4 + overhead;
1477 else
1478 return 2 + overhead;
1479 }
1480
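/* Commit @chosen: remove it from the available list, append it to the block's
 * instruction stream, and advance the simulated clock past its issue time.
 */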
1481 void
1482 instruction_scheduler::schedule(schedule_node *chosen)
1483 {
1484 assert(current.scheduled < current.len);
1485 current.scheduled++;
1486
1487 assert(chosen);
1488 chosen->remove();
1489 current.block->instructions.push_tail(chosen->inst);
1490
1491 /* If we expected a delay for scheduling, then bump the clock to reflect
1492 * that. In reality, the hardware will switch to another hyperthread
1493 * and may not return to dispatching our thread for a while even after
1494 * we're unblocked. After this, we have the time when the chosen
1495 * instruction will start executing.
1496 */
1497 current.time = MAX2(current.time, chosen->tmp.unblocked_time);
1498
1499 /* Update the clock for how soon an instruction could start after the
1500 * chosen one.
1501 */
1502 current.time += chosen->issue_time;
1503
1504 if (debug) {
1505 fprintf(stderr, "clock %4d, scheduled: ", current.time);
1506 brw_print_instruction(*s, chosen->inst);
1507 }
1508 }
1509
1510 void
1511 instruction_scheduler::update_children(schedule_node *chosen)
1512 {
1513 /* Now that we've scheduled a new instruction, some of its
1514 * children can be promoted to the list of instructions ready to
1515 * be scheduled. Update the children's unblocked time for this
1516 * DAG edge as we do so.
1517 */
1518 for (int i = chosen->children_count - 1; i >= 0; i--) {
1519 schedule_node_child *child = &chosen->children[i];
1520
1521 child->n->tmp.unblocked_time = MAX2(child->n->tmp.unblocked_time,
1522 current.time + child->effective_latency);
1523
1524 if (debug) {
1525 fprintf(stderr, "\tchild %d, %d parents: ", i, child->n->tmp.parent_count);
1526 brw_print_instruction(*s, child->n->inst);
1527 }
1528
1529 child->n->tmp.cand_generation = current.cand_generation;
1530 child->n->tmp.parent_count--;
1531 if (child->n->tmp.parent_count == 0) {
1532 if (debug) {
1533 fprintf(stderr, "\t\tnow available\n");
1534 }
1535 current.available.push_head(child->n);
1536 }
1537 }
1538 current.cand_generation++;
1539 }
1540
1541 void
1542 instruction_scheduler::schedule_instructions()
1543 {
1544 if (!post_reg_alloc)
1545 reg_pressure = reg_pressure_in[current.block->num];
1546
1547 assert(current.available.is_empty());
1548 for (schedule_node *n = current.start; n < current.end; n++) {
1549 reset_node_tmp(n);
1550
1551 /* Add DAG heads to the list of available instructions. */
1552 if (n->tmp.parent_count == 0)
1553 current.available.push_tail(n);
1554 }
1555
1556 current.block->instructions.make_empty();
1557
1558 while (!current.available.is_empty()) {
1559 schedule_node *chosen = choose_instruction_to_schedule();
1560 schedule(chosen);
1561
1562 if (!post_reg_alloc) {
1563 reg_pressure -= get_register_pressure_benefit(chosen->inst);
1564 update_register_pressure(chosen->inst);
1565 if (debug)
1566 fprintf(stderr, "(register pressure %d)\n", reg_pressure);
1567 }
1568
1569 update_children(chosen);
1570 }
1571 }
1572
1573 void
1574 instruction_scheduler::run(instruction_scheduler_mode mode)
1575 {
1576 this->mode = mode;
1577
1578 if (debug && !post_reg_alloc) {
1579 fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
1580 post_reg_alloc);
1581 brw_print_instructions(*s);
1582 }
1583
1584 if (!post_reg_alloc) {
1585 memset(reads_remaining, 0, grf_count * sizeof(*reads_remaining));
1586 memset(hw_reads_remaining, 0, hw_reg_count * sizeof(*hw_reads_remaining));
1587 memset(written, 0, grf_count * sizeof(*written));
1588 }
1589
1590 foreach_block(block, s->cfg) {
1591 set_current_block(block);
1592
1593 if (!post_reg_alloc) {
1594 for (schedule_node *n = current.start; n < current.end; n++)
1595 count_reads_remaining(n->inst);
1596 }
1597
1598 schedule_instructions();
1599 }
1600
1601 if (debug && !post_reg_alloc) {
1602 fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
1603 post_reg_alloc);
1604 brw_print_instructions(*s);
1605 }
1606 }
1607
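/* Build the scheduler (dependency DAG, liveness, delay/exit estimates) once,
 * so the pre-RA scheduling passes can be re-run in different modes without
 * recomputing that data.
 */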
1608 instruction_scheduler *
1609 brw_prepare_scheduler(fs_visitor &s, void *mem_ctx)
1610 {
1611 const int grf_count = s.alloc.count;
1612
1613 instruction_scheduler *empty = rzalloc(mem_ctx, instruction_scheduler);
1614 return new (empty) instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
1615 s.cfg->num_blocks, /* post_reg_alloc */ false);
1616 }
1617
1618 void
1619 brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
1620 instruction_scheduler_mode mode)
1621 {
1622 if (mode == SCHEDULE_NONE)
1623 return;
1624
1625 sched->run(mode);
1626
1627 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1628 }
1629
1630 void
1631 brw_schedule_instructions_post_ra(fs_visitor &s)
1632 {
1633 const bool post_reg_alloc = true;
1634 const int grf_count = reg_unit(s.devinfo) * s.grf_used;
1635
1636 void *mem_ctx = ralloc_context(NULL);
1637
1638 instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
1639 s.cfg->num_blocks, post_reg_alloc);
1640 sched.run(SCHEDULE_POST);
1641
1642 ralloc_free(mem_ctx);
1643
1644 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1645 }
1646