1 /*
2 * Copyright © 2019 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file
25 *
26 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
29 * it.
30 *
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
35 *
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
39 *
40 * - sp stack pointer
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
46 * - acc2-9 special accumulator registers on TGL
47 * - mme0-7 math macro extended accumulator registers
48 *
49 * The following ARF registers don't need to be tracked here because data
50 * coherency is still provided transparently by the hardware:
51 *
52 * - f0-1 flag registers
53 * - n0 notification register
54 * - tdr0 thread dependency register
55 */
56
57 #include "brw_fs.h"
58 #include "brw_fs_builder.h"
59 #include "brw_cfg.h"
60
61 using namespace brw;
62
63 namespace {
64 /**
65 * In-order instruction accounting.
66 * @{
67 */
68
69 /**
70 * Return the RegDist pipeline the hardware will synchronize with if no
71 * pipeline information is provided in the SWSB annotation of an
72 * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
73 */
74 tgl_pipe
inferred_sync_pipe(const struct intel_device_info * devinfo,const fs_inst * inst)75 inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
76 {
77 if (devinfo->verx10 >= 125) {
78 bool has_int_src = false, has_long_src = false;
79 const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe;
80
81 if (is_send(inst))
82 return TGL_PIPE_NONE;
83
84 for (unsigned i = 0; i < inst->sources; i++) {
85 if (inst->src[i].file != BAD_FILE &&
86 !inst->is_control_source(i)) {
87 const brw_reg_type t = inst->src[i].type;
88 has_int_src |= !brw_type_is_float(t);
89 has_long_src |= brw_type_size_bytes(t) >= 8;
90 }
91 }
92
93 /* Avoid the emitting (RegDist, SWSB) annotations for long
94 * instructions on platforms where they are unordered. It's not clear
95 * what the inferred sync pipe is for them or if we are even allowed
96 * to use these annotations in this case. Return NONE, which should
97 * prevent baked_{un,}ordered_dependency_mode functions from even
98 * trying to emit these annotations.
99 */
100 if (!has_long_pipe && has_long_src)
101 return TGL_PIPE_NONE;
102
103 return has_long_src ? TGL_PIPE_LONG :
104 has_int_src ? TGL_PIPE_INT :
105 TGL_PIPE_FLOAT;
106
107 } else {
108 return TGL_PIPE_FLOAT;
109 }
110 }
111
   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      /* Integer MUL/MAD with both relevant sources at least DWord-sized.
       * On pre-Xe2 hardware (see the devinfo->ver < 20 branch below) such
       * multiplies execute on the long pipeline.
       */
      const bool is_dword_multiply = !brw_type_is_float(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(brw_type_size_bytes(inst->src[0].type),
                brw_type_size_bytes(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(brw_type_size_bytes(inst->src[1].type),
                brw_type_size_bytes(inst->src[2].type)) >= 4));

      /* NOTE: the order of these tests matters -- e.g. the special-cased
       * opcodes must be classified before the generic dst-type checks.
       */
      if (is_unordered(devinfo, inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         /* Single in-order pipe before Gfx12.5. */
         return TGL_PIPE_FLOAT;
      else if (inst->is_math() && devinfo->ver >= 20)
         /* Xe2+ has a separate in-order math pipe. */
         return TGL_PIPE_MATH;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT ||
               inst->opcode == SHADER_OPCODE_BROADCAST ||
               inst->opcode == SHADER_OPCODE_SHUFFLE)
         /* These virtual opcodes expand to integer-pipe instructions
          * regardless of their operand types.
          */
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (devinfo->ver >= 20 &&
               brw_type_size_bytes(inst->dst.type) >= 8 &&
               brw_type_is_float(inst->dst.type)) {
         /* Xe2+: only 64-bit float destinations go to the long pipe. */
         assert(devinfo->has_64bit_float);
         return TGL_PIPE_LONG;
      } else if (devinfo->ver < 20 &&
                 (brw_type_size_bytes(inst->dst.type) >= 8 ||
                  brw_type_size_bytes(t) >= 8 || is_dword_multiply)) {
         /* Pre-Xe2: any 64-bit destination or execution type, or a DWord
          * multiply, executes on the long pipe.
          */
         assert(devinfo->has_64bit_float || devinfo->has_64bit_int ||
                devinfo->has_integer_dword_mul);
         return TGL_PIPE_LONG;
      } else if (brw_type_is_float(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }
157
158 /**
159 * Index of the \p p pipeline counter in the ordered_address vector defined
160 * below.
161 */
162 #define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \
163 (abort(), ~0u))
164
165 /**
166 * Number of in-order hardware instructions for pipeline index \p contained
167 * in this IR instruction. This determines the increment applied to the
168 * RegDist counter calculated for any ordered dependency that crosses this
169 * instruction.
170 */
171 unsigned
ordered_unit(const struct intel_device_info * devinfo,const fs_inst * inst,unsigned p)172 ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
173 unsigned p)
174 {
175 switch (inst->opcode) {
176 case BRW_OPCODE_SYNC:
177 case BRW_OPCODE_DO:
178 case SHADER_OPCODE_UNDEF:
179 case SHADER_OPCODE_HALT_TARGET:
180 case FS_OPCODE_SCHEDULING_FENCE:
181 return 0;
182 default:
183 /* Note that the following is inaccurate for virtual instructions
184 * that expand to more in-order instructions than assumed here, but
185 * that can only lead to suboptimal execution ordering, data
186 * coherency won't be impacted. Providing exact RegDist counts for
187 * each virtual instruction would allow better ALU performance, but
188 * it would require keeping this switch statement in perfect sync
189 * with the generator in order to avoid data corruption. Lesson is
190 * (again) don't use virtual instructions if you want optimal
191 * scheduling.
192 */
193 if (!is_unordered(devinfo, inst) &&
194 (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
195 p == IDX(TGL_PIPE_ALL)))
196 return 1;
197 else
198 return 0;
199 }
200 }
201
   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on a
       * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL
       * is provided), in which case the vector counter will be initialized
       * with all components equal to INT_MIN (always satisfied) except for
       * component IDX(p).
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         /* Careful: the short-circuit below keeps IDX(p) from being
          * evaluated (and aborting) when p is TGL_PIPE_NONE.
          */
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      /* Per-pipeline in-order instruction counters; INT_MIN marks a
       * component that is trivially satisfied.
       */
      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };
236
237 /**
238 * Return true if the specified ordered address is trivially satisfied for
239 * all pipelines except potentially for the specified pipeline \p p.
240 */
241 bool
is_single_pipe(const ordered_address & jp,tgl_pipe p)242 is_single_pipe(const ordered_address &jp, tgl_pipe p)
243 {
244 for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
245 if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
246 return false;
247 }
248
249 return true;
250 }
251
252 /**
253 * Return the number of instructions in the program.
254 */
255 unsigned
num_instructions(const fs_visitor * shader)256 num_instructions(const fs_visitor *shader)
257 {
258 return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
259 }
260
   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    *
    * The returned array is heap-allocated with new[] and indexed by
    * instruction IP; the caller is responsible for delete[]-ing it.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      /* All pipe counters start at zero at the top of the program. */
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         /* Record the address *before* accounting for this instruction, so
          * jps[ip] is the counter state on entry to instruction ip.
          */
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }
281
   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
    * enum for additional type safety.  The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations, this is only used
    * internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,  /**< No in-order dependency. */
      TGL_REGDIST_SRC = 1,   /**< Dependency on a previous in-order read. */
      TGL_REGDIST_DST = 2    /**< Dependency on a previous in-order write. */
   };
297
298 /**
299 * Allow bitwise arithmetic of tgl_regdist_mode enums.
300 */
301 tgl_regdist_mode
operator |(tgl_regdist_mode x,tgl_regdist_mode y)302 operator|(tgl_regdist_mode x, tgl_regdist_mode y)
303 {
304 return tgl_regdist_mode(unsigned(x) | unsigned(y));
305 }
306
307 tgl_regdist_mode
operator &(tgl_regdist_mode x,tgl_regdist_mode y)308 operator&(tgl_regdist_mode x, tgl_regdist_mode y)
309 {
310 return tgl_regdist_mode(unsigned(x) & unsigned(y));
311 }
312
313 tgl_regdist_mode &
operator |=(tgl_regdist_mode & x,tgl_regdist_mode y)314 operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
315 {
316 return x = x | y;
317 }
318
319 tgl_regdist_mode &
operator &=(tgl_regdist_mode & x,tgl_regdist_mode y)320 operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
321 {
322 return x = x & y;
323 }
324
325 /** @} */
326
327 /**
328 * Representation of an equivalence relation among the set of unsigned
329 * integers.
330 *
331 * Its initial state is the identity relation '~' such that i ~ j if and
332 * only if i == j for every pair of unsigned integers i and j.
333 */
334 struct equivalence_relation {
equivalence_relation__anon548472d60111::equivalence_relation335 equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
336 {
337 for (unsigned i = 0; i < n; i++)
338 is[i] = i;
339 }
340
~equivalence_relation__anon548472d60111::equivalence_relation341 ~equivalence_relation()
342 {
343 delete[] is;
344 }
345
346 /**
347 * Return equivalence class index of the specified element. Effectively
348 * this is the numeric value of an arbitrary representative from the
349 * equivalence class.
350 *
351 * Allows the evaluation of the equivalence relation according to the
352 * rule that i ~ j if and only if lookup(i) == lookup(j).
353 */
354 unsigned
lookup__anon548472d60111::equivalence_relation355 lookup(unsigned i) const
356 {
357 if (i < n && is[i] != i)
358 return lookup(is[i]);
359 else
360 return i;
361 }
362
363 /**
364 * Create an array with the results of the lookup() method for
365 * constant-time evaluation.
366 */
367 unsigned *
flatten__anon548472d60111::equivalence_relation368 flatten() const
369 {
370 unsigned *ids = new unsigned[n];
371
372 for (unsigned i = 0; i < n; i++)
373 ids[i] = lookup(i);
374
375 return ids;
376 }
377
378 /**
379 * Mutate the existing equivalence relation minimally by imposing the
380 * additional requirement that i ~ j.
381 *
382 * The algorithm updates the internal representation recursively in
383 * order to guarantee transitivity while preserving the previously
384 * specified equivalence requirements.
385 */
386 unsigned
link__anon548472d60111::equivalence_relation387 link(unsigned i, unsigned j)
388 {
389 const unsigned k = lookup(i);
390 assign(i, k);
391 assign(j, k);
392 return k;
393 }
394
395 private:
396 equivalence_relation(const equivalence_relation &);
397
398 equivalence_relation &
399 operator=(const equivalence_relation &);
400
401 /**
402 * Assign the representative of \p from to be equivalent to \p to.
403 *
404 * At the same time the data structure is partially flattened as much as
405 * it's possible without increasing the number of recursive calls.
406 */
407 void
assign__anon548472d60111::equivalence_relation408 assign(unsigned from, unsigned to)
409 {
410 if (from != to) {
411 assert(from < n);
412
413 if (is[from] != from)
414 assign(is[from], to);
415
416 is[from] = to;
417 }
418 }
419
420 unsigned *is;
421 unsigned n;
422 };
423
   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such that
       * the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path.  It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no unordered
       * dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      /* Memberwise equality over all five fields. */
      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };
518
   /* Out-of-line definition of dependency::done: an ordered DST dependency
    * whose default-constructed ordered_address (all INT_MIN) makes it always
    * satisfied, while still counting as valid dependency information.
    */
   const dependency dependency::done =
      dependency(TGL_REGDIST_DST, ordered_address(), false);
521
522 /**
523 * Return whether \p dep contains any dependency information.
524 */
525 bool
is_valid(const dependency & dep)526 is_valid(const dependency &dep)
527 {
528 return dep.ordered || dep.unordered;
529 }
530
531 /**
532 * Combine \p dep0 and \p dep1 into a single dependency object that is only
533 * satisfied when both original dependencies are satisfied. This might
534 * involve updating the equivalence relation \p eq in order to make sure
535 * that both out-of-order dependencies are assigned the same hardware SBID
536 * as synchronization token.
537 */
538 dependency
merge(equivalence_relation & eq,const dependency & dep0,const dependency & dep1)539 merge(equivalence_relation &eq,
540 const dependency &dep0, const dependency &dep1)
541 {
542 dependency dep;
543
544 if (dep0.ordered || dep1.ordered) {
545 dep.ordered = dep0.ordered | dep1.ordered;
546 for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
547 dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
548 }
549
550 if (dep0.unordered || dep1.unordered) {
551 dep.unordered = dep0.unordered | dep1.unordered;
552 dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
553 dep1.unordered ? dep1.id : dep0.id);
554 }
555
556 dep.exec_all = dep0.exec_all || dep1.exec_all;
557
558 return dep;
559 }
560
   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    *
    * Returns \p dep1 when it is valid, otherwise keeps \p dep0 -- except for
    * the special RaR pass-through case documented inline below.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      if (dep0.ordered == TGL_REGDIST_SRC &&
          is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) &&
          !(dep1.ordered & TGL_REGDIST_DST)) {
         /* As an optimization (see dependency_for_read()),
          * instructions with a RaR dependency don't synchronize
          * against a previous in-order read, so we need to pass
          * through both ordered dependencies instead of simply
          * dropping the first one.  Otherwise we could encounter a
          * WaR data hazard between OP0 and OP2 in cases like:
          *
          *   OP0 r1:f r0:d
          *   OP1 r2:d r0:d
          *   OP2 r0:d r3:d
          *
          * since only the integer-pipeline r0 dependency from OP1
          * would be visible to OP2, even though OP0 could technically
          * execute after OP1 due to the floating-point and integer
          * pipelines being asynchronous on Gfx12.5+ platforms, so
          * synchronizing OP2 against OP1 would be insufficient.
          */
         dependency dep = dep1;

         /* Fold dep0's SRC component and counters into dep1. */
         dep.ordered |= dep0.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]);

         return dep;
      } else {
         return is_valid(dep1) ? dep1 : dep0;
      }
   }
598
599 /**
600 * Translate dependency information across the program.
601 *
602 * This returns a dependency on the same instruction translated to the
603 * ordered_address space of a different block. The correct shift for
604 * transporting a dependency across an edge of the CFG is the difference
605 * between the local ordered_address of the first instruction of the target
606 * block and the local ordered_address of the instruction immediately after
607 * the end of the origin block.
608 */
609 dependency
transport(dependency dep,int delta[IDX (TGL_PIPE_ALL)])610 transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
611 {
612 if (dep.ordered) {
613 for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
614 if (dep.jp.jp[p] > INT_MIN)
615 dep.jp.jp[p] += delta[p];
616 }
617 }
618
619 return dep;
620 }
621
622 /**
623 * Return simplified dependency removing any synchronization modes not
624 * applicable to an instruction reading the same register location.
625 */
626 dependency
dependency_for_read(dependency dep)627 dependency_for_read(dependency dep)
628 {
629 dep.ordered &= TGL_REGDIST_DST;
630 return dep;
631 }
632
   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      /* Only an in-order write whose execution pipe is the single pipe the
       * read dependency lives on can rely on the pipe's own internal
       * ordering; in every other case the SRC component must be kept.
       */
      if (!is_unordered(devinfo, inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }
649
650 /** @} */
651
   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       *
       * Registers not tracked by this scoreboard (see dep()) yield an empty
       * dependency.
       */
      dependency
      get(const brw_reg &r) const
      {
         /* const_cast is safe here: dep() doesn't modify the object, it only
          * lacks a const overload, and the returned pointer is read-only used.
          */
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       *
       * Silently a no-op for registers this scoreboard doesn't track.
       */
      void
      set(const brw_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      /* One dependency slot per GRF, plus single slots for the address and
       * accumulator ARF ranges.
       */
      dependency grf_deps[XE2_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;

      /**
       * Map register \p r to its dependency slot, or NULL if this scoreboard
       * doesn't track it (e.g. flags -- see the file header comment).
       */
      dependency *
      dep(const brw_reg &r)
      {
         /* For VGRFs index by virtual register number plus register offset;
          * everything else uses the flat hardware register offset.
          */
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                 reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                 reg < BRW_ARF_FLAG ? &accum_dep :
                 NULL);
      }
   };
777
778 /**
779 * Dependency list handling.
780 * @{
781 */
782 struct dependency_list {
dependency_list__anon548472d60111::dependency_list783 dependency_list() : deps(NULL), n(0) {}
784
~dependency_list__anon548472d60111::dependency_list785 ~dependency_list()
786 {
787 free(deps);
788 }
789
790 void
push_back__anon548472d60111::dependency_list791 push_back(const dependency &dep)
792 {
793 deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
794 deps[n++] = dep;
795 }
796
797 unsigned
size__anon548472d60111::dependency_list798 size() const
799 {
800 return n;
801 }
802
803 const dependency &
operator []__anon548472d60111::dependency_list804 operator[](unsigned i) const
805 {
806 assert(i < n);
807 return deps[i];
808 }
809
810 dependency &
operator []__anon548472d60111::dependency_list811 operator[](unsigned i)
812 {
813 assert(i < n);
814 return deps[i];
815 }
816
817 private:
818 dependency_list(const dependency_list &);
819 dependency_list &
820 operator=(const dependency_list &);
821
822 dependency *deps;
823 unsigned n;
824 };
825
   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps, folding it into an existing entry where possible in order to
    * keep the list minimally redundant.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            /* Fold the ordered component into the first ordered entry and
             * clear it from dep so it isn't appended again below.
             */
            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            /* Likewise for an unordered component with a matching SBID. */
            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }
873
   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp.  If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               /* The int64_t cast avoids signed overflow when the dependency
                * counter is INT_MIN (trivially satisfied); the resulting
                * huge unsigned distance then fails the max_dist test below
                * so the component is ignored.
                */
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               /* Maximum distance at which a dependency can still be
                * outstanding -- presumably the per-pipe in-flight depth
                * (deeper for the long pipe); TODO confirm against hardware
                * docs.
                */
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  /* Dependencies on multiple distinct pipes collapse to
                   * TGL_PIPE_ALL.
                   */
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  /* The SWSB RegDist field saturates at 7. */
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }
905
906 /**
907 * Return whether the dependency list \p deps of an instruction with
908 * ordered_address \p jp has any non-trivial ordered dependencies. If \p
909 * exec_all is false only dependencies known to be executed with channel
910 * masking applied will be considered in the calculation.
911 */
912 bool
find_ordered_dependency(const dependency_list & deps,const ordered_address & jp,bool exec_all)913 find_ordered_dependency(const dependency_list &deps,
914 const ordered_address &jp,
915 bool exec_all)
916 {
917 return ordered_dependency_swsb(deps, jp, exec_all).regdist;
918 }
919
920 /**
921 * Return the full tgl_sbid_mode bitset for the first unordered dependency
922 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
923 * no such dependency is present. If \p exec_all is false only
924 * dependencies known to be executed with channel masking applied will be
925 * considered in the calculation.
926 */
927 tgl_sbid_mode
find_unordered_dependency(const dependency_list & deps,tgl_sbid_mode unordered,bool exec_all)928 find_unordered_dependency(const dependency_list &deps,
929 tgl_sbid_mode unordered,
930 bool exec_all)
931 {
932 if (unordered) {
933 for (unsigned i = 0; i < deps.size(); i++) {
934 if ((unordered & deps[i].unordered) &&
935 exec_all >= deps[i].exec_all)
936 return deps[i].unordered;
937 }
938 }
939
940 return TGL_SBID_NULL;
941 }
942
943 /**
944 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
945 * \p deps that can be represented directly in the SWSB annotation of the
946 * instruction without additional SYNC instructions, or zero if no such
947 * dependency is present.
948 */
949 tgl_sbid_mode
baked_unordered_dependency_mode(const struct intel_device_info * devinfo,const fs_inst * inst,const dependency_list & deps,const ordered_address & jp)950 baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
951 const fs_inst *inst,
952 const dependency_list &deps,
953 const ordered_address &jp)
954 {
955 const bool exec_all = inst->force_writemask_all;
956 const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
957 const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
958 exec_all).pipe;
959
960 if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
961 return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
962 else if (has_ordered && is_unordered(devinfo, inst))
963 return TGL_SBID_NULL;
964 else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
965 (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
966 return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
967 else if (!has_ordered)
968 return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
969 else
970 return TGL_SBID_NULL;
971 }
972
973 /**
974 * Return whether an ordered dependency from the list \p deps can be
975 * represented directly in the SWSB annotation of the instruction without
976 * additional SYNC instructions.
977 */
978 bool
baked_ordered_dependency_mode(const struct intel_device_info * devinfo,const fs_inst * inst,const dependency_list & deps,const ordered_address & jp)979 baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
980 const fs_inst *inst,
981 const dependency_list &deps,
982 const ordered_address &jp)
983 {
984 const bool exec_all = inst->force_writemask_all;
985 const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
986 const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
987 exec_all).pipe;
988 const tgl_sbid_mode unordered_mode =
989 baked_unordered_dependency_mode(devinfo, inst, deps, jp);
990
991 if (!has_ordered)
992 return false;
993 else if (!unordered_mode)
994 return true;
995 else
996 return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
997 unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET :
998 TGL_SBID_DST);
999 }
1000
1001 /** @} */
1002
1003 /**
1004 * Shader instruction dependency calculation.
1005 * @{
1006 */
1007
1008 /**
1009 * Update scoreboard object \p sb to account for the execution of
1010 * instruction \p inst.
1011 */
void
update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                       const fs_inst *inst, unsigned ip, scoreboard &sb)
{
   const bool exec_all = inst->force_writemask_all;
   const struct intel_device_info *devinfo = shader->devinfo;
   /* Ordered address of this instruction in its inferred execution pipe,
    * or a null ordered_address if no in-order pipe could be inferred.
    */
   const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
   const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                              ordered_address();
   const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
   /* Pre-Gfx20 math instructions, and 64-bit float operations on platforms
    * that execute them via the math pipe, fetch their sources
    * asynchronously and therefore need SBID-based source tracking below.
    */
   const bool is_unordered_math =
      (inst->is_math() && devinfo->ver < 20) ||
      (devinfo->has_64bit_float_via_math_pipe &&
       (get_exec_type(inst) == BRW_TYPE_DF ||
        inst->dst.type == BRW_TYPE_DF));

   /* Track any source registers that may be fetched asynchronously by this
    * instruction, otherwise clear the dependency in order to avoid
    * subsequent redundant synchronization.
    */
   for (unsigned i = 0; i < inst->sources; i++) {
      /* Payload sources, DPAS sources and math-pipe sources are read
       * asynchronously (SBID-tracked); other in-order reads get a RegDist
       * source dependency; everything else marks the register as done.
       */
      const dependency rd_dep =
         (inst->is_payload(i) ||
          inst->opcode == BRW_OPCODE_DPAS ||
          is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
         dependency::done;

      for (unsigned j = 0; j < regs_read(inst, i); j++) {
         const brw_reg r = byte_offset(inst->src[i], REG_SIZE * j);
         sb.set(r, shadow(sb.get(r), rd_dep));
      }
   }

   /* Implicit accumulator reads are tracked as an ordered source
    * dependency on acc0.
    */
   if (inst->reads_accumulator_implicitly())
      sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

   /* Track any destination registers of this instruction. */
   const dependency wr_dep =
      is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
      is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
      dependency();

   if (inst->writes_accumulator_implicitly(devinfo))
      sb.set(brw_acc_reg(8), wr_dep);

   if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
       !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++)
         sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
   }
}
1064
1065 /**
1066 * Calculate scoreboard objects locally that represent any pending (and
1067 * unconditionally resolved) dependencies at the end of each block of the
1068 * program.
1069 */
1070 scoreboard *
gather_block_scoreboards(const fs_visitor * shader,const ordered_address * jps)1071 gather_block_scoreboards(const fs_visitor *shader,
1072 const ordered_address *jps)
1073 {
1074 scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
1075 unsigned ip = 0;
1076
1077 foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
1078 update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);
1079
1080 return sbs;
1081 }
1082
1083 /**
1084 * Propagate data dependencies globally through the control flow graph
1085 * until a fixed point is reached.
1086 *
1087 * Calculates the set of dependencies potentially pending at the beginning
1088 * of each block, and returns it as an array of scoreboard objects.
1089 */
1090 scoreboard *
propagate_block_scoreboards(const fs_visitor * shader,const ordered_address * jps,equivalence_relation & eq)1091 propagate_block_scoreboards(const fs_visitor *shader,
1092 const ordered_address *jps,
1093 equivalence_relation &eq)
1094 {
1095 const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
1096 scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
1097 scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];
1098
1099 for (bool progress = true; progress;) {
1100 progress = false;
1101
1102 foreach_block(block, shader->cfg) {
1103 const scoreboard sb = shadow(in_sbs[block->num],
1104 delta_sbs[block->num]);
1105
1106 if (sb != out_sbs[block->num]) {
1107 foreach_list_typed(bblock_link, child_link, link,
1108 &block->children) {
1109 scoreboard &in_sb = in_sbs[child_link->block->num];
1110 int delta[IDX(TGL_PIPE_ALL)];
1111
1112 for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
1113 delta[p] = jps[child_link->block->start_ip].jp[p]
1114 - jps[block->end_ip].jp[p]
1115 - ordered_unit(shader->devinfo,
1116 static_cast<const fs_inst *>(block->end()), p);
1117
1118 in_sb = merge(eq, in_sb, transport(sb, delta));
1119 }
1120
1121 out_sbs[block->num] = sb;
1122 progress = true;
1123 }
1124 }
1125 }
1126
1127 delete[] delta_sbs;
1128 delete[] out_sbs;
1129
1130 return in_sbs;
1131 }
1132
1133 /**
1134 * Return the list of potential dependencies of each instruction in the
1135 * shader based on the result of global dependency analysis.
1136 */
1137 dependency_list *
gather_inst_dependencies(const fs_visitor * shader,const ordered_address * jps)1138 gather_inst_dependencies(const fs_visitor *shader,
1139 const ordered_address *jps)
1140 {
1141 const struct intel_device_info *devinfo = shader->devinfo;
1142 equivalence_relation eq(num_instructions(shader));
1143 scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
1144 const unsigned *ids = eq.flatten();
1145 dependency_list *deps = new dependency_list[num_instructions(shader)];
1146 unsigned ip = 0;
1147
1148 foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
1149 const bool exec_all = inst->force_writemask_all;
1150 const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
1151 scoreboard &sb = sbs[block->num];
1152
1153 for (unsigned i = 0; i < inst->sources; i++) {
1154 for (unsigned j = 0; j < regs_read(inst, i); j++)
1155 add_dependency(ids, deps[ip], dependency_for_read(
1156 sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
1157 }
1158
1159 if (inst->reads_accumulator_implicitly()) {
1160 /* Wa_22012725308:
1161 *
1162 * "When the accumulator registers are used as source and/or
1163 * destination, hardware does not ensure prevention of write
1164 * after read hazard across execution pipes."
1165 */
1166 const dependency dep = sb.get(brw_acc_reg(8));
1167 if (dep.ordered && !is_single_pipe(dep.jp, p))
1168 add_dependency(ids, deps[ip], dep);
1169 }
1170
1171 if (is_unordered(devinfo, inst) && !inst->eot)
1172 add_dependency(ids, deps[ip],
1173 dependency(TGL_SBID_SET, ip, exec_all));
1174
1175 if (!inst->no_dd_check) {
1176 if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
1177 !inst->dst.is_accumulator()) {
1178 for (unsigned j = 0; j < regs_written(inst); j++) {
1179 add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
1180 sb.get(byte_offset(inst->dst, REG_SIZE * j))));
1181 }
1182 }
1183
1184 if (inst->writes_accumulator_implicitly(devinfo) ||
1185 inst->dst.is_accumulator()) {
1186 /* Wa_22012725308:
1187 *
1188 * "When the accumulator registers are used as source and/or
1189 * destination, hardware does not ensure prevention of write
1190 * after read hazard across execution pipes."
1191 */
1192 const dependency dep = sb.get(brw_acc_reg(8));
1193 if (dep.ordered && !is_single_pipe(dep.jp, p))
1194 add_dependency(ids, deps[ip], dep);
1195 }
1196 }
1197
1198 update_inst_scoreboard(shader, jps, inst, ip, sb);
1199 ip++;
1200 }
1201
1202 delete[] sbs;
1203 delete[] ids;
1204
1205 return deps;
1206 }
1207
1208 /** @} */
1209
1210 /**
1211 * Allocate SBID tokens to track the execution of every out-of-order
1212 * instruction of the shader.
1213 */
1214 dependency_list *
allocate_inst_dependencies(const fs_visitor * shader,const dependency_list * deps0)1215 allocate_inst_dependencies(const fs_visitor *shader,
1216 const dependency_list *deps0)
1217 {
1218 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
1219 * shaders with a large number of SEND messages.
1220 *
1221 * XXX - Use 32 SBIDs on Xe2+ while in large GRF mode.
1222 */
1223 const unsigned num_sbids = 16;
1224
1225 /* Allocate an unordered dependency ID to hardware SBID translation
1226 * table with as many entries as instructions there are in the shader,
1227 * which is the maximum number of unordered IDs we can find in the
1228 * program.
1229 */
1230 unsigned *ids = new unsigned[num_instructions(shader)];
1231 for (unsigned ip = 0; ip < num_instructions(shader); ip++)
1232 ids[ip] = ~0u;
1233
1234 dependency_list *deps1 = new dependency_list[num_instructions(shader)];
1235 unsigned next_id = 0;
1236
1237 for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
1238 for (unsigned i = 0; i < deps0[ip].size(); i++) {
1239 const dependency &dep = deps0[ip][i];
1240
1241 if (dep.unordered && ids[dep.id] == ~0u)
1242 ids[dep.id] = (next_id++) & (num_sbids - 1);
1243
1244 add_dependency(ids, deps1[ip], dep);
1245 }
1246 }
1247
1248 delete[] ids;
1249
1250 return deps1;
1251 }
1252
1253 /**
1254 * Emit dependency information provided by \p deps into the shader,
1255 * inserting additional SYNC instructions for dependencies that can't be
1256 * represented directly by annotating existing instructions.
1257 */
void
emit_inst_dependencies(fs_visitor *shader,
                       const ordered_address *jps,
                       const dependency_list *deps)
{
   const struct intel_device_info *devinfo = shader->devinfo;
   unsigned ip = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      /* Whether the ordered (resp. some unordered) dependency can be
       * represented directly in this instruction's SWSB annotation
       * without an extra SYNC.
       */
      const bool ordered_mode =
         baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
      tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
                      ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

      /* First resolve all unordered (SBID) dependencies. */
      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.unordered) {
            if (unordered_mode == dep.unordered &&
                exec_all >= dep.exec_all && !swsb.mode) {
               /* Bake unordered dependency into the instruction's SWSB if
                * possible, except in cases where the current instruction
                * isn't marked NoMask but the dependency is, since that
                * might lead to data coherency issues due to
                * Wa_1407528679.
                */
               swsb.sbid = dep.id;
               swsb.mode = dep.unordered;
            } else {
               /* Emit dependency into the SWSB of an extra SYNC
                * instruction.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
               sync->sched.sbid = dep.id;
               sync->sched.mode = dep.unordered;
               assert(!(sync->sched.mode & TGL_SBID_SET));
            }
         }
      }

      /* Then resolve any ordered (RegDist) dependency that couldn't be
       * baked into the instruction itself.
       */
      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.ordered &&
             find_ordered_dependency(deps[ip], jps[ip], true) &&
             (!ordered_mode || dep.exec_all > exec_all)) {
            /* If the current instruction is not marked NoMask but an
             * ordered dependency is, perform the synchronization as a
             * separate NoMask SYNC instruction in order to avoid data
             * coherency issues due to Wa_1407528679. The similar
             * scenario with unordered dependencies should have been
             * handled above.
             */
            const fs_builder ibld = fs_builder(shader, block, inst)
                                    .exec_all().group(1, 0);
            fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
            sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
            break;
         }
      }

      /* Update the IR. */
      inst->sched = swsb;
      inst->no_dd_check = inst->no_dd_clear = false;
      ip++;
   }
}
1330 }
1331
1332 bool
brw_fs_lower_scoreboard(fs_visitor & s)1333 brw_fs_lower_scoreboard(fs_visitor &s)
1334 {
1335 if (s.devinfo->ver >= 12) {
1336 const ordered_address *jps = ordered_inst_addresses(&s);
1337 const dependency_list *deps0 = gather_inst_dependencies(&s, jps);
1338 const dependency_list *deps1 = allocate_inst_dependencies(&s, deps0);
1339 emit_inst_dependencies(&s, jps, deps1);
1340 delete[] deps1;
1341 delete[] deps0;
1342 delete[] jps;
1343 }
1344
1345 return true;
1346 }
1347