1 /*
2  * Copyright © 2019 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file
25  *
26  * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
27  * data coherency between register reads and writes in previous generations.
28  * This lowering pass runs after register allocation in order to make up for
29  * it.
30  *
31  * It works by performing global dataflow analysis in order to determine the
32  * set of potential dependencies of every instruction in the shader, and then
33  * inserting any required SWSB annotations and additional SYNC instructions
34  * in order to guarantee data coherency, as illustrated by the sketch below.
35  *
36  * WARNING - Access of the following (rarely used) ARF registers is not
37  *           tracked here, and requires the RegDist SWSB annotation to be set
38  *           to 1 by the generator in order to avoid data races:
39  *
40  *  - sp stack pointer
41  *  - sr0 state register
42  *  - cr0 control register
43  *  - ip instruction pointer
44  *  - tm0 timestamp register
45  *  - dbg0 debug register
46  *  - acc2-9 special accumulator registers on TGL
47  *  - mme0-7 math macro extended accumulator registers
48  *
49  * The following ARF registers don't need to be tracked here because data
50  * coherency is still provided transparently by the hardware:
51  *
52  *  - f0-1 flag registers
53  *  - n0 notification register
54  *  - tdr0 thread dependency register
55  */
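
/* Illustrative sketch (not actual compiler output) of the kind of annotation
 * this pass computes, in the usual Gfx12 SWSB disassembly notation: "@N" is a
 * RegDist wait on the in-order instruction N slots back in the same pipeline,
 * "$S" allocates SBID token S for an out-of-order instruction (e.g. a SEND),
 * and "$S.dst"/"$S.src" wait on the destination/source side of that token:
 *
 *    send(16)  r10  ...            { $3 }
 *    add(16)   r20  r2   r4
 *    mul(16)   r30  r20  r10       { @1, $3.dst }
 *
 * The MUL has to wait both for the SEND to write r10 (token $3) and for the
 * ADD one in-order instruction back to write r20 (RegDist 1).
 */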
56 
57 #include "brw_fs.h"
58 #include "brw_fs_builder.h"
59 #include "brw_cfg.h"
60 
61 using namespace brw;
62 
63 namespace {
64    /**
65     * In-order instruction accounting.
66     * @{
67     */
68 
69    /**
70     * Return the RegDist pipeline the hardware will synchronize with if no
71     * pipeline information is provided in the SWSB annotation of an
72     * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
73     */
74    tgl_pipe
75    inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
76    {
77       if (devinfo->verx10 >= 125) {
78          bool has_int_src = false, has_long_src = false;
79          const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe;
80 
81          if (is_send(inst))
82             return TGL_PIPE_NONE;
83 
84          for (unsigned i = 0; i < inst->sources; i++) {
85             if (inst->src[i].file != BAD_FILE &&
86                 !inst->is_control_source(i)) {
87                const brw_reg_type t = inst->src[i].type;
88                has_int_src |= !brw_type_is_float(t);
89                has_long_src |= brw_type_size_bytes(t) >= 8;
90             }
91          }
92 
93          /* Avoid emitting (RegDist, SWSB) annotations for long
94           * instructions on platforms where they are unordered. It's not clear
95           * what the inferred sync pipe is for them or if we are even allowed
96           * to use these annotations in this case. Return NONE, which should
97           * prevent baked_{un,}ordered_dependency_mode functions from even
98           * trying to emit these annotations.
99           */
100          if (!has_long_pipe && has_long_src)
101             return TGL_PIPE_NONE;
102 
103          return has_long_src ? TGL_PIPE_LONG :
104                 has_int_src ? TGL_PIPE_INT :
105                 TGL_PIPE_FLOAT;
106 
107       } else {
108          return TGL_PIPE_FLOAT;
109       }
110    }
111 
112    /**
113     * Return the RegDist pipeline that will execute an instruction, or
114     * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
115     * RegDist synchronization mechanism.
116     */
117    tgl_pipe
118    inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
119    {
120       const brw_reg_type t = get_exec_type(inst);
121       const bool is_dword_multiply = !brw_type_is_float(t) &&
122          ((inst->opcode == BRW_OPCODE_MUL &&
123            MIN2(brw_type_size_bytes(inst->src[0].type),
124                 brw_type_size_bytes(inst->src[1].type)) >= 4) ||
125           (inst->opcode == BRW_OPCODE_MAD &&
126            MIN2(brw_type_size_bytes(inst->src[1].type),
127                 brw_type_size_bytes(inst->src[2].type)) >= 4));
128 
129       if (is_unordered(devinfo, inst))
130          return TGL_PIPE_NONE;
131       else if (devinfo->verx10 < 125)
132          return TGL_PIPE_FLOAT;
133       else if (inst->is_math() && devinfo->ver >= 20)
134          return TGL_PIPE_MATH;
135       else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT ||
136                inst->opcode == SHADER_OPCODE_BROADCAST ||
137                inst->opcode == SHADER_OPCODE_SHUFFLE)
138          return TGL_PIPE_INT;
139       else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
140          return TGL_PIPE_FLOAT;
141       else if (devinfo->ver >= 20 &&
142                brw_type_size_bytes(inst->dst.type) >= 8 &&
143                brw_type_is_float(inst->dst.type)) {
144          assert(devinfo->has_64bit_float);
145          return TGL_PIPE_LONG;
146       } else if (devinfo->ver < 20 &&
147                  (brw_type_size_bytes(inst->dst.type) >= 8 ||
148                   brw_type_size_bytes(t) >= 8 || is_dword_multiply)) {
149          assert(devinfo->has_64bit_float || devinfo->has_64bit_int ||
150                 devinfo->has_integer_dword_mul);
151          return TGL_PIPE_LONG;
152       } else if (brw_type_is_float(inst->dst.type))
153          return TGL_PIPE_FLOAT;
154       else
155          return TGL_PIPE_INT;
156    }
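
   /* Illustrative mapping (assuming a verx10 >= 125 platform that retains the
    * long pipe, i.e. !has_64bit_float_via_math_pipe, and supports the
    * relevant 64-bit operations):
    *
    *    add(16)  r10<1>:f  ...            -> TGL_PIPE_FLOAT
    *    add(16)  r10<1>:d  ...            -> TGL_PIPE_INT
    *    mul(16)  r10<1>:df ...            -> TGL_PIPE_LONG
    *    mul(16)  r10<1>:d  r2:d  r4:d     -> TGL_PIPE_LONG (dword multiply)
    *    send(16) ...                      -> TGL_PIPE_NONE (out-of-order,
    *                                         synchronized via SBIDs instead)
    */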
157 
158    /**
159     * Index of the \p p pipeline counter in the ordered_address vector defined
160     * below.
161     */
162 #define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) :    \
163                 (abort(), ~0u))
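
   /* For example, assuming the tgl_pipe enumerants order the in-order pipes
    * as FLOAT, INT, LONG, MATH immediately followed by ALL (which the array
    * sizes below rely on):
    *
    *    IDX(TGL_PIPE_FLOAT) == 0
    *    IDX(TGL_PIPE_INT)   == 1
    *    IDX(TGL_PIPE_LONG)  == 2
    *    IDX(TGL_PIPE_MATH)  == 3
    *    IDX(TGL_PIPE_ALL)   == 4, used below as the size of the per-pipe
    *                              counter vectors.
    *
    * IDX(TGL_PIPE_NONE) aborts, since there is no counter for it.
    */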
164 
165    /**
166     * Number of in-order hardware instructions for pipeline index \p p contained
167     * in this IR instruction.  This determines the increment applied to the
168     * RegDist counter calculated for any ordered dependency that crosses this
169     * instruction.
170     */
171    unsigned
172    ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
173                 unsigned p)
174    {
175       switch (inst->opcode) {
176       case BRW_OPCODE_SYNC:
177       case BRW_OPCODE_DO:
178       case SHADER_OPCODE_UNDEF:
179       case SHADER_OPCODE_HALT_TARGET:
180       case FS_OPCODE_SCHEDULING_FENCE:
181          return 0;
182       default:
183          /* Note that the following is inaccurate for virtual instructions
184           * that expand to more in-order instructions than assumed here, but
185           * that can only lead to suboptimal execution ordering; data
186           * coherency won't be impacted.  Providing exact RegDist counts for
187           * each virtual instruction would allow better ALU performance, but
188           * it would require keeping this switch statement in perfect sync
189           * with the generator in order to avoid data corruption.  The lesson
190           * is (again): don't use virtual instructions if you want optimal
191           * scheduling.
192           */
193          if (!is_unordered(devinfo, inst) &&
194              (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
195               p == IDX(TGL_PIPE_ALL)))
196             return 1;
197          else
198             return 0;
199       }
200    }
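
   /* E.g. an in-order ADD inferred to execute on the float pipe contributes
    *
    *    ordered_unit(devinfo, add, IDX(TGL_PIPE_FLOAT)) == 1
    *    ordered_unit(devinfo, add, IDX(TGL_PIPE_INT))   == 0
    *    ordered_unit(devinfo, add, IDX(TGL_PIPE_ALL))   == 1
    *
    * while a SYNC (special-cased above) or an out-of-order SEND contributes
    * zero to every counter.  (The "add" handle is purely hypothetical, for
    * illustration.)
    */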
201 
202    /**
203     * Type for an instruction counter that increments for in-order
204     * instructions only, arbitrarily denoted 'jp' throughout this lowering
205     * pass in order to distinguish it from the regular instruction counter.
206     * This is represented as a vector with an independent counter for each
207     * asynchronous ALU pipeline in the EU.
208     */
209    struct ordered_address {
210       /**
211        * Construct the ordered address of a dependency known to execute on a
212        * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL
213        * is provided), in which case the vector counter will be initialized
214        * with all components equal to INT_MIN (always satisfied) except for
215        * component IDX(p).
216        */
217       ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
218          for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
219             jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
220                      INT_MIN : jp0);
221       }
222 
223       int jp[IDX(TGL_PIPE_ALL)];
224 
225       friend bool
226       operator==(const ordered_address &jp0, const ordered_address &jp1)
227       {
228          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
229             if (jp0.jp[p] != jp1.jp[p])
230                return false;
231          }
232 
233          return true;
234       }
235    };
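
   /* Minimal usage sketch: the address of a dependency known to execute as
    * in-order instruction number 5 of the integer pipe would be built as
    *
    *    ordered_address jp(TGL_PIPE_INT, 5);
    *    // jp.jp[IDX(TGL_PIPE_INT)] == 5, every other component == INT_MIN,
    *    // i.e. trivially satisfied for the remaining pipes.
    */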
236 
237    /**
238     * Return true if the specified ordered address is trivially satisfied for
239     * all pipelines except potentially for the specified pipeline \p p.
240     */
241    bool
242    is_single_pipe(const ordered_address &jp, tgl_pipe p)
243    {
244       for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
245          if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
246             return false;
247       }
248 
249       return true;
250    }
251 
252    /**
253     * Return the number of instructions in the program.
254     */
255    unsigned
256    num_instructions(const fs_visitor *shader)
257    {
258       return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
259    }
260 
261    /**
262     * Calculate the local ordered_address instruction counter at every
263     * instruction of the shader for subsequent constant-time look-up.
264     */
265    ordered_address *
266    ordered_inst_addresses(const fs_visitor *shader)
267    {
268       ordered_address *jps = new ordered_address[num_instructions(shader)];
269       ordered_address jp(TGL_PIPE_ALL, 0);
270       unsigned ip = 0;
271 
272       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
273          jps[ip] = jp;
274          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
275             jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
276          ip++;
277       }
278 
279       return jps;
280    }
281 
282    /**
283     * Synchronization mode required for data manipulated by in-order
284     * instructions.
285     *
286     * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
287     * enum for additional type safety.  The hardware doesn't provide control
288     * over the synchronization mode for RegDist annotations, this is only used
289     * internally in this pass in order to optimize out redundant read
290     * dependencies where possible.
291     */
292    enum tgl_regdist_mode {
293       TGL_REGDIST_NULL = 0,
294       TGL_REGDIST_SRC = 1,
295       TGL_REGDIST_DST = 2
296    };
297 
298    /**
299     * Allow bitwise arithmetic of tgl_regdist_mode enums.
300     */
301    tgl_regdist_mode
302    operator|(tgl_regdist_mode x, tgl_regdist_mode y)
303    {
304       return tgl_regdist_mode(unsigned(x) | unsigned(y));
305    }
306 
307    tgl_regdist_mode
308    operator&(tgl_regdist_mode x, tgl_regdist_mode y)
309    {
310       return tgl_regdist_mode(unsigned(x) & unsigned(y));
311    }
312 
313    tgl_regdist_mode &
314    operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
315    {
316       return x = x | y;
317    }
318 
319    tgl_regdist_mode &
320    operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
321    {
322       return x = x & y;
323    }
324 
325    /** @} */
326 
327    /**
328     * Representation of an equivalence relation among the set of unsigned
329     * integers.
330     *
331     * Its initial state is the identity relation '~' such that i ~ j if and
332     * only if i == j for every pair of unsigned integers i and j.
333     */
334    struct equivalence_relation {
335       equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
336       {
337          for (unsigned i = 0; i < n; i++)
338             is[i] = i;
339       }
340 
341       ~equivalence_relation()
342       {
343          delete[] is;
344       }
345 
346       /**
347        * Return equivalence class index of the specified element.  Effectively
348        * this is the numeric value of an arbitrary representative from the
349        * equivalence class.
350        *
351        * Allows the evaluation of the equivalence relation according to the
352        * rule that i ~ j if and only if lookup(i) == lookup(j).
353        */
354       unsigned
355       lookup(unsigned i) const
356       {
357          if (i < n && is[i] != i)
358             return lookup(is[i]);
359          else
360             return i;
361       }
362 
363       /**
364        * Create an array with the results of the lookup() method for
365        * constant-time evaluation.
366        */
367       unsigned *
368       flatten() const
369       {
370          unsigned *ids = new unsigned[n];
371 
372          for (unsigned i = 0; i < n; i++)
373             ids[i] = lookup(i);
374 
375          return ids;
376       }
377 
378       /**
379        * Mutate the existing equivalence relation minimally by imposing the
380        * additional requirement that i ~ j.
381        *
382        * The algorithm updates the internal representation recursively in
383        * order to guarantee transitivity while preserving the previously
384        * specified equivalence requirements.
385        */
386       unsigned
387       link(unsigned i, unsigned j)
388       {
389          const unsigned k = lookup(i);
390          assign(i, k);
391          assign(j, k);
392          return k;
393       }
394 
395    private:
396       equivalence_relation(const equivalence_relation &);
397 
398       equivalence_relation &
399       operator=(const equivalence_relation &);
400 
401       /**
402        * Assign the representative of \p from to be equivalent to \p to.
403        *
404        * At the same time the data structure is partially flattened as much as
405        * possible without increasing the number of recursive calls.
406        */
407       void
408       assign(unsigned from, unsigned to)
409       {
410          if (from != to) {
411             assert(from < n);
412 
413             if (is[from] != from)
414                assign(is[from], to);
415 
416             is[from] = to;
417          }
418       }
419 
420       unsigned *is;
421       unsigned n;
422    };
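
   /* Usage sketch: this behaves like a union-find structure over instruction
    * indices, e.g.
    *
    *    equivalence_relation eq(16);
    *    eq.link(3, 7);
    *    eq.link(7, 11);
    *    // eq.lookup(3) == eq.lookup(7) == eq.lookup(11), so dependencies on
    *    // instructions 3, 7 and 11 will end up sharing one SBID token.
    */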
423 
424    /**
425     * Representation of a data dependency between two instructions in the
426     * program.
427     * @{
428     */
429    struct dependency {
430       /**
431        * No dependency information.
432        */
433       dependency() : ordered(TGL_REGDIST_NULL), jp(),
434                      unordered(TGL_SBID_NULL), id(0),
435                      exec_all(false) {}
436 
437       /**
438        * Construct a dependency on the in-order instruction with the provided
439        * ordered_address instruction counter.
440        */
441       dependency(tgl_regdist_mode mode, const ordered_address &jp,
442                  bool exec_all) :
443          ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
444          exec_all(exec_all) {}
445 
446       /**
447        * Construct a dependency on the out-of-order instruction with the
448        * specified synchronization token.
449        */
450       dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
451          ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
452          exec_all(exec_all) {}
453 
454       /**
455        * Synchronization mode of in-order dependency, or zero if no in-order
456        * dependency is present.
457        */
458       tgl_regdist_mode ordered;
459 
460       /**
461        * Instruction counter of in-order dependency.
462        *
463        * For a dependency part of a different block in the program, this is
464        * relative to the specific control flow path taken between the
465        * dependency and the current block: It is the ordered_address such that
466        * the difference between it and the ordered_address of the first
467        * instruction of the current block is exactly the number of in-order
468        * instructions across that control flow path.  It is not guaranteed to
469        * be equal to the local ordered_address of the generating instruction
470        * [as returned by ordered_inst_addresses()], except for block-local
471        * dependencies.
472        */
473       ordered_address jp;
474 
475       /**
476        * Synchronization mode of unordered dependency, or zero if no unordered
477        * dependency is present.
478        */
479       tgl_sbid_mode unordered;
480 
481       /** Synchronization token of out-of-order dependency. */
482       unsigned id;
483 
484       /**
485        * Whether the dependency could be run with execution masking disabled,
486        * which might lead to the unwanted execution of the generating
487        * instruction in cases where a BB is executed with all channels
488        * disabled due to hardware bug Wa_1407528679.
489        */
490       bool exec_all;
491 
492       /**
493        * Trivial in-order dependency that's always satisfied.
494        *
495        * Note that unlike a default-constructed dependency() which is also
496        * trivially satisfied, this is considered to provide dependency
497        * information and can be used to clear a previously pending dependency
498        * via shadow().
499        */
500       static const dependency done;
501 
502       friend bool
503       operator==(const dependency &dep0, const dependency &dep1)
504       {
505          return dep0.ordered == dep1.ordered &&
506                 dep0.jp == dep1.jp &&
507                 dep0.unordered == dep1.unordered &&
508                 dep0.id == dep1.id &&
509                 dep0.exec_all == dep1.exec_all;
510       }
511 
512       friend bool
513       operator!=(const dependency &dep0, const dependency &dep1)
514       {
515          return !(dep0 == dep1);
516       }
517    };
518 
519    const dependency dependency::done =
520         dependency(TGL_REGDIST_DST, ordered_address(), false);
521 
522    /**
523     * Return whether \p dep contains any dependency information.
524     */
525    bool
526    is_valid(const dependency &dep)
527    {
528       return dep.ordered || dep.unordered;
529    }
530 
531    /**
532     * Combine \p dep0 and \p dep1 into a single dependency object that is only
533     * satisfied when both original dependencies are satisfied.  This might
534     * involve updating the equivalence relation \p eq in order to make sure
535     * that both out-of-order dependencies are assigned the same hardware SBID
536     * as synchronization token.
537     */
538    dependency
539    merge(equivalence_relation &eq,
540          const dependency &dep0, const dependency &dep1)
541    {
542       dependency dep;
543 
544       if (dep0.ordered || dep1.ordered) {
545          dep.ordered = dep0.ordered | dep1.ordered;
546          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
547             dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
548       }
549 
550       if (dep0.unordered || dep1.unordered) {
551          dep.unordered = dep0.unordered | dep1.unordered;
552          dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
553                           dep1.unordered ? dep1.id : dep0.id);
554       }
555 
556       dep.exec_all = dep0.exec_all || dep1.exec_all;
557 
558       return dep;
559    }
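
   /* For instance, given an equivalence_relation eq, merging an in-order read
    * dependency with an out-of-order write dependency yields an object that
    * carries both synchronization modes:
    *
    *    dependency a(TGL_REGDIST_SRC, ordered_address(TGL_PIPE_FLOAT, 4), false);
    *    dependency b(TGL_SBID_DST, 10, false);
    *    dependency c = merge(eq, a, b);
    *    // c.ordered == TGL_REGDIST_SRC, c.jp == a.jp,
    *    // c.unordered == TGL_SBID_DST, c.id == eq.lookup(10)
    */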
560 
561    /**
562     * Override dependency information of \p dep0 with that of \p dep1.
563     */
564    dependency
565    shadow(const dependency &dep0, const dependency &dep1)
566    {
567       if (dep0.ordered == TGL_REGDIST_SRC &&
568           is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) &&
569                             !(dep1.ordered & TGL_REGDIST_DST)) {
570          /* As an optimization (see dependency_for_read()),
571           * instructions with a RaR dependency don't synchronize
572           * against a previous in-order read, so we need to pass
573           * through both ordered dependencies instead of simply
574           * dropping the first one.  Otherwise we could encounter a
575           * WaR data hazard between OP0 and OP2 in cases like:
576           *
577           *   OP0 r1:f r0:d
578           *   OP1 r2:d r0:d
579           *   OP2 r0:d r3:d
580           *
581           * since only the integer-pipeline r0 dependency from OP1
582           * would be visible to OP2, even though OP0 could technically
583           * execute after OP1 due to the floating-point and integer
584           * pipelines being asynchronous on Gfx12.5+ platforms, so
585           * synchronizing OP2 against OP1 would be insufficient.
586           */
587          dependency dep = dep1;
588 
589          dep.ordered |= dep0.ordered;
590          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
591                dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]);
592 
593          return dep;
594       } else {
595          return is_valid(dep1) ? dep1 : dep0;
596       }
597    }
598 
599    /**
600     * Translate dependency information across the program.
601     *
602     * This returns a dependency on the same instruction translated to the
603     * ordered_address space of a different block.  The correct shift for
604     * transporting a dependency across an edge of the CFG is the difference
605     * between the local ordered_address of the first instruction of the target
606     * block and the local ordered_address of the instruction immediately after
607     * the end of the origin block.
608     */
609    dependency
610    transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
611    {
612       if (dep.ordered) {
613          for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
614             if (dep.jp.jp[p] > INT_MIN)
615                dep.jp.jp[p] += delta[p];
616          }
617       }
618 
619       return dep;
620    }
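
   /* E.g. if the float-pipe counter reads 20 just past the end of the origin
    * block and 3 at the start of the target block, then
    * delta[IDX(TGL_PIPE_FLOAT)] == 3 - 20 == -17, and a dependency with
    * jp == 18 in the origin block's address space is transported to jp == 1,
    * i.e. two in-order float instructions before the first instruction of the
    * target block.  Components left at INT_MIN are not shifted and remain
    * trivially satisfied.
    */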
621 
622    /**
623     * Return simplified dependency removing any synchronization modes not
624     * applicable to an instruction reading the same register location.
625     */
626    dependency
627    dependency_for_read(dependency dep)
628    {
629       dep.ordered &= TGL_REGDIST_DST;
630       return dep;
631    }
632 
633    /**
634     * Return simplified dependency removing any synchronization modes not
635     * applicable to an instruction \p inst writing the same register location.
636     *
637     * This clears any WaR dependency for writes performed from the same
638     * pipeline as the read, since there is no possibility for a data hazard.
639     */
640    dependency
641    dependency_for_write(const struct intel_device_info *devinfo,
642                         const fs_inst *inst, dependency dep)
643    {
644       if (!is_unordered(devinfo, inst) &&
645           is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
646          dep.ordered &= TGL_REGDIST_DST;
647       return dep;
648    }
649 
650    /** @} */
651 
652    /**
653     * Scoreboard representation.  This keeps track of the data dependencies of
654     * registers with GRF granularity.
655     */
656    class scoreboard {
657    public:
658       /**
659        * Look up the most current data dependency for register \p r.
660        */
661       dependency
662       get(const brw_reg &r) const
663       {
664          if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
665             return *p;
666          else
667             return dependency();
668       }
669 
670       /**
671        * Specify the most current data dependency for register \p r.
672        */
673       void
674       set(const brw_reg &r, const dependency &d)
675       {
676          if (dependency *p = dep(r))
677             *p = d;
678       }
679 
680       /**
681        * Component-wise merge() of corresponding dependencies from two
682        * scoreboard objects.  \sa merge().
683        */
684       friend scoreboard
685       merge(equivalence_relation &eq,
686             const scoreboard &sb0, const scoreboard &sb1)
687       {
688          scoreboard sb;
689 
690          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
691             sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);
692 
693          sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
694          sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);
695 
696          return sb;
697       }
698 
699       /**
700        * Component-wise shadow() of corresponding dependencies from two
701        * scoreboard objects.  \sa shadow().
702        */
703       friend scoreboard
704       shadow(const scoreboard &sb0, const scoreboard &sb1)
705       {
706          scoreboard sb;
707 
708          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
709             sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);
710 
711          sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
712          sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);
713 
714          return sb;
715       }
716 
717       /**
718        * Component-wise transport() of dependencies from a scoreboard
719        * object.  \sa transport().
720        */
721       friend scoreboard
722       transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
723       {
724          scoreboard sb;
725 
726          for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
727             sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);
728 
729          sb.addr_dep = transport(sb0.addr_dep, delta);
730          sb.accum_dep = transport(sb0.accum_dep, delta);
731 
732          return sb;
733       }
734 
735       friend bool
736       operator==(const scoreboard &sb0, const scoreboard &sb1)
737       {
738          for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
739             if (sb0.grf_deps[i] != sb1.grf_deps[i])
740                return false;
741          }
742 
743          if (sb0.addr_dep != sb1.addr_dep)
744             return false;
745 
746          if (sb0.accum_dep != sb1.accum_dep)
747             return false;
748 
749          return true;
750       }
751 
752       friend bool
753       operator!=(const scoreboard &sb0, const scoreboard &sb1)
754       {
755          return !(sb0 == sb1);
756       }
757 
758    private:
759       dependency grf_deps[XE2_MAX_GRF];
760       dependency addr_dep;
761       dependency accum_dep;
762 
763       dependency *
764       dep(const brw_reg &r)
765       {
766          const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
767                                reg_offset(r) / REG_SIZE);
768 
769          return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
770                  r.file == ARF && reg >= BRW_ARF_ADDRESS &&
771                                   reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
772                  r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
773                                   reg < BRW_ARF_FLAG ? &accum_dep :
774                  NULL);
775       }
776    };
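
   /* Usage sketch, reusing the accumulator tracking that update_inst_scoreboard()
    * performs below:
    *
    *    scoreboard sb;
    *    sb.set(brw_acc_reg(8), dependency(TGL_SBID_DST, 42, false));
    *    dependency dep = sb.get(brw_acc_reg(8));
    *    // dep.unordered == TGL_SBID_DST, dep.id == 42
    *
    * Registers the pass doesn't track (e.g. the null register or the flag
    * ARFs) hit the NULL case of dep(): set() ignores them and get() returns
    * an empty dependency.
    */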
777 
778    /**
779     * Dependency list handling.
780     * @{
781     */
782    struct dependency_list {
783       dependency_list() : deps(NULL), n(0) {}
784 
785       ~dependency_list()
786       {
787          free(deps);
788       }
789 
790       void
791       push_back(const dependency &dep)
792       {
793          deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
794          deps[n++] = dep;
795       }
796 
797       unsigned
798       size() const
799       {
800          return n;
801       }
802 
803       const dependency &
804       operator[](unsigned i) const
805       {
806          assert(i < n);
807          return deps[i];
808       }
809 
810       dependency &
811       operator[](unsigned i)
812       {
813          assert(i < n);
814          return deps[i];
815       }
816 
817    private:
818       dependency_list(const dependency_list &);
819       dependency_list &
820       operator=(const dependency_list &);
821 
822       dependency *deps;
823       unsigned n;
824    };
825 
826    /**
827     * Add dependency \p dep to the list of dependencies of an instruction
828     * \p deps.
829     */
830    void
831    add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
832    {
833       if (is_valid(dep)) {
834          /* Translate the unordered dependency token first in order to keep
835           * the list minimally redundant.
836           */
837          if (dep.unordered)
838             dep.id = ids[dep.id];
839 
840          /* Try to combine the specified dependency with any existing ones. */
841          for (unsigned i = 0; i < deps.size(); i++) {
842             /* Don't combine otherwise matching dependencies if there is an
843              * exec_all mismatch which would cause a SET dependency to gain an
844              * exec_all flag, since that would prevent it from being baked
845              * into the instruction we want to allocate an SBID for.
846              */
847             if (deps[i].exec_all != dep.exec_all &&
848                 (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
849                 (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
850                continue;
851 
852             if (dep.ordered && deps[i].ordered) {
853                for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
854                   deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);
855 
856                deps[i].ordered |= dep.ordered;
857                deps[i].exec_all |= dep.exec_all;
858                dep.ordered = TGL_REGDIST_NULL;
859             }
860 
861             if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
862                deps[i].unordered |= dep.unordered;
863                deps[i].exec_all |= dep.exec_all;
864                dep.unordered = TGL_SBID_NULL;
865             }
866          }
867 
868          /* Add it to the end of the list if necessary. */
869          if (is_valid(dep))
870             deps.push_back(dep);
871       }
872    }
873 
874    /**
875     * Construct a tgl_swsb annotation encoding any ordered dependencies from
876     * the dependency list \p deps of an instruction with ordered_address \p
877     * jp.  If \p exec_all is false only dependencies known to be executed with
878     * channel masking applied will be considered in the calculation.
879     */
880    tgl_swsb
881    ordered_dependency_swsb(const dependency_list &deps,
882                            const ordered_address &jp,
883                            bool exec_all)
884    {
885       tgl_pipe p = TGL_PIPE_NONE;
886       unsigned min_dist = ~0u;
887 
888       for (unsigned i = 0; i < deps.size(); i++) {
889          if (deps[i].ordered && exec_all >= deps[i].exec_all) {
890             for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
891                const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
892                const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
893                assert(jp.jp[q] > deps[i].jp.jp[q]);
894                if (dist <= max_dist) {
895                   p = (p && IDX(p) != q ? TGL_PIPE_ALL :
896                        tgl_pipe(TGL_PIPE_FLOAT + q));
897                   min_dist = MIN3(min_dist, dist, 7);
898                }
899             }
900          }
901       }
902 
903       return { p ? min_dist : 0, p };
904    }
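
   /* Worked example: with a single dependency whose float-pipe component is
    * 17 and a current instruction whose float-pipe address is 20, the
    * distance is 20 - 17 == 3, which is within the 10-instruction window
    * assumed for the float pipe (14 for the long pipe), so this returns
    * { .regdist = 3, .pipe = TGL_PIPE_FLOAT }.  Distances are clamped to 7,
    * the largest value encodable in the RegDist field, and dependencies
    * farther away than the per-pipe window are dropped as already satisfied.
    * Dependencies on more than one pipe collapse to TGL_PIPE_ALL with the
    * minimum of their distances.
    */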
905 
906    /**
907     * Return whether the dependency list \p deps of an instruction with
908     * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
909     * exec_all is false only dependencies known to be executed with channel
910     * masking applied will be considered in the calculation.
911     */
912    bool
913    find_ordered_dependency(const dependency_list &deps,
914                            const ordered_address &jp,
915                            bool exec_all)
916    {
917       return ordered_dependency_swsb(deps, jp, exec_all).regdist;
918    }
919 
920    /**
921     * Return the full tgl_sbid_mode bitset for the first unordered dependency
922     * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
923     * no such dependency is present.  If \p exec_all is false only
924     * dependencies known to be executed with channel masking applied will be
925     * considered in the calculation.
926     */
927    tgl_sbid_mode
928    find_unordered_dependency(const dependency_list &deps,
929                              tgl_sbid_mode unordered,
930                              bool exec_all)
931    {
932       if (unordered) {
933          for (unsigned i = 0; i < deps.size(); i++) {
934             if ((unordered & deps[i].unordered) &&
935                 exec_all >= deps[i].exec_all)
936                return deps[i].unordered;
937          }
938       }
939 
940       return TGL_SBID_NULL;
941    }
942 
943    /**
944     * Return the tgl_sbid_mode bitset of an unordered dependency from the list
945     * \p deps that can be represented directly in the SWSB annotation of the
946     * instruction without additional SYNC instructions, or zero if no such
947     * dependency is present.
948     */
949    tgl_sbid_mode
950    baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
951                                    const fs_inst *inst,
952                                    const dependency_list &deps,
953                                    const ordered_address &jp)
954    {
955       const bool exec_all = inst->force_writemask_all;
956       const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
957       const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
958                                                             exec_all).pipe;
959 
960       if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
961          return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
962       else if (has_ordered && is_unordered(devinfo, inst))
963          return TGL_SBID_NULL;
964       else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
965                (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
966          return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
967       else if (!has_ordered)
968          return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
969       else
970          return TGL_SBID_NULL;
971    }
972 
973    /**
974     * Return whether an ordered dependency from the list \p deps can be
975     * represented directly in the SWSB annotation of the instruction without
976     * additional SYNC instructions.
977     */
978    bool
979    baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
980                                  const fs_inst *inst,
981                                  const dependency_list &deps,
982                                  const ordered_address &jp)
983    {
984       const bool exec_all = inst->force_writemask_all;
985       const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
986       const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
987                                                             exec_all).pipe;
988       const tgl_sbid_mode unordered_mode =
989          baked_unordered_dependency_mode(devinfo, inst, deps, jp);
990 
991       if (!has_ordered)
992          return false;
993       else if (!unordered_mode)
994          return true;
995       else
996          return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
997                 unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET :
998                                    TGL_SBID_DST);
999    }
1000 
1001    /** @} */
1002 
1003    /**
1004     * Shader instruction dependency calculation.
1005     * @{
1006     */
1007 
1008    /**
1009     * Update scoreboard object \p sb to account for the execution of
1010     * instruction \p inst.
1011     */
1012    void
1013    update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
1014                           const fs_inst *inst, unsigned ip, scoreboard &sb)
1015    {
1016       const bool exec_all = inst->force_writemask_all;
1017       const struct intel_device_info *devinfo = shader->devinfo;
1018       const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
1019       const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
1020                                      ordered_address();
1021       const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
1022       const bool is_unordered_math =
1023          (inst->is_math() && devinfo->ver < 20) ||
1024          (devinfo->has_64bit_float_via_math_pipe &&
1025           (get_exec_type(inst) == BRW_TYPE_DF ||
1026            inst->dst.type == BRW_TYPE_DF));
1027 
1028       /* Track any source registers that may be fetched asynchronously by this
1029        * instruction; otherwise clear the dependency in order to avoid
1030        * subsequent redundant synchronization.
1031        */
1032       for (unsigned i = 0; i < inst->sources; i++) {
1033          const dependency rd_dep =
1034             (inst->is_payload(i) ||
1035              inst->opcode == BRW_OPCODE_DPAS ||
1036              is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
1037             is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
1038             dependency::done;
1039 
1040          for (unsigned j = 0; j < regs_read(inst, i); j++) {
1041             const brw_reg r = byte_offset(inst->src[i], REG_SIZE * j);
1042             sb.set(r, shadow(sb.get(r), rd_dep));
1043          }
1044       }
1045 
1046       if (inst->reads_accumulator_implicitly())
1047          sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));
1048 
1049       /* Track any destination registers of this instruction. */
1050       const dependency wr_dep =
1051          is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
1052          is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
1053          dependency();
1054 
1055       if (inst->writes_accumulator_implicitly(devinfo))
1056          sb.set(brw_acc_reg(8), wr_dep);
1057 
1058       if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
1059           !inst->dst.is_null()) {
1060          for (unsigned j = 0; j < regs_written(inst); j++)
1061             sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
1062       }
1063    }
1064 
1065    /**
1066     * Calculate scoreboard objects locally that represent any pending (and
1067     * unconditionally resolved) dependencies at the end of each block of the
1068     * program.
1069     */
1070    scoreboard *
1071    gather_block_scoreboards(const fs_visitor *shader,
1072                             const ordered_address *jps)
1073    {
1074       scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
1075       unsigned ip = 0;
1076 
1077       foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
1078          update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);
1079 
1080       return sbs;
1081    }
1082 
1083    /**
1084     * Propagate data dependencies globally through the control flow graph
1085     * until a fixed point is reached.
1086     *
1087     * Calculates the set of dependencies potentially pending at the beginning
1088     * of each block, and returns it as an array of scoreboard objects.
1089     */
1090    scoreboard *
1091    propagate_block_scoreboards(const fs_visitor *shader,
1092                                const ordered_address *jps,
1093                                equivalence_relation &eq)
1094    {
1095       const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
1096       scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
1097       scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];
1098 
1099       for (bool progress = true; progress;) {
1100          progress = false;
1101 
1102          foreach_block(block, shader->cfg) {
1103             const scoreboard sb = shadow(in_sbs[block->num],
1104                                          delta_sbs[block->num]);
1105 
1106             if (sb != out_sbs[block->num]) {
1107                foreach_list_typed(bblock_link, child_link, link,
1108                                   &block->children) {
1109                   scoreboard &in_sb = in_sbs[child_link->block->num];
1110                   int delta[IDX(TGL_PIPE_ALL)];
1111 
1112                   for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
1113                      delta[p] = jps[child_link->block->start_ip].jp[p]
1114                         - jps[block->end_ip].jp[p]
1115                         - ordered_unit(shader->devinfo,
1116                                        static_cast<const fs_inst *>(block->end()), p);
1117 
1118                   in_sb = merge(eq, in_sb, transport(sb, delta));
1119                }
1120 
1121                out_sbs[block->num] = sb;
1122                progress = true;
1123             }
1124          }
1125       }
1126 
1127       delete[] delta_sbs;
1128       delete[] out_sbs;
1129 
1130       return in_sbs;
1131    }
1132 
1133    /**
1134     * Return the list of potential dependencies of each instruction in the
1135     * shader based on the result of global dependency analysis.
1136     */
1137    dependency_list *
1138    gather_inst_dependencies(const fs_visitor *shader,
1139                             const ordered_address *jps)
1140    {
1141       const struct intel_device_info *devinfo = shader->devinfo;
1142       equivalence_relation eq(num_instructions(shader));
1143       scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
1144       const unsigned *ids = eq.flatten();
1145       dependency_list *deps = new dependency_list[num_instructions(shader)];
1146       unsigned ip = 0;
1147 
1148       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
1149          const bool exec_all = inst->force_writemask_all;
1150          const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
1151          scoreboard &sb = sbs[block->num];
1152 
1153          for (unsigned i = 0; i < inst->sources; i++) {
1154             for (unsigned j = 0; j < regs_read(inst, i); j++)
1155                add_dependency(ids, deps[ip], dependency_for_read(
1156                   sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
1157          }
1158 
1159          if (inst->reads_accumulator_implicitly()) {
1160             /* Wa_22012725308:
1161              *
1162              * "When the accumulator registers are used as source and/or
1163              *  destination, hardware does not ensure prevention of write
1164              *  after read hazard across execution pipes."
1165              */
1166             const dependency dep = sb.get(brw_acc_reg(8));
1167             if (dep.ordered && !is_single_pipe(dep.jp, p))
1168                add_dependency(ids, deps[ip], dep);
1169          }
1170 
1171          if (is_unordered(devinfo, inst) && !inst->eot)
1172             add_dependency(ids, deps[ip],
1173                            dependency(TGL_SBID_SET, ip, exec_all));
1174 
1175          if (!inst->no_dd_check) {
1176             if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
1177                 !inst->dst.is_accumulator()) {
1178                for (unsigned j = 0; j < regs_written(inst); j++) {
1179                   add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
1180                      sb.get(byte_offset(inst->dst, REG_SIZE * j))));
1181                }
1182             }
1183 
1184             if (inst->writes_accumulator_implicitly(devinfo) ||
1185                 inst->dst.is_accumulator()) {
1186                /* Wa_22012725308:
1187                 *
1188                 * "When the accumulator registers are used as source and/or
1189                 *  destination, hardware does not ensure prevention of write
1190                 *  after read hazard across execution pipes."
1191                 */
1192                const dependency dep = sb.get(brw_acc_reg(8));
1193                if (dep.ordered && !is_single_pipe(dep.jp, p))
1194                   add_dependency(ids, deps[ip], dep);
1195             }
1196          }
1197 
1198          update_inst_scoreboard(shader, jps, inst, ip, sb);
1199          ip++;
1200       }
1201 
1202       delete[] sbs;
1203       delete[] ids;
1204 
1205       return deps;
1206    }
1207 
1208    /** @} */
1209 
1210    /**
1211     * Allocate SBID tokens to track the execution of every out-of-order
1212     * instruction of the shader.
1213     */
1214    dependency_list *
1215    allocate_inst_dependencies(const fs_visitor *shader,
1216                               const dependency_list *deps0)
1217    {
1218       /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
1219        *       shaders with a large number of SEND messages.
1220        *
1221        * XXX - Use 32 SBIDs on Xe2+ while in large GRF mode.
1222        */
1223       const unsigned num_sbids = 16;
1224 
1225       /* Allocate an unordered dependency ID to hardware SBID translation
1226        * table with as many entries as instructions there are in the shader,
1227        * which is the maximum number of unordered IDs we can find in the
1228        * program.
1229        */
1230       unsigned *ids = new unsigned[num_instructions(shader)];
1231       for (unsigned ip = 0; ip < num_instructions(shader); ip++)
1232          ids[ip] = ~0u;
1233 
1234       dependency_list *deps1 = new dependency_list[num_instructions(shader)];
1235       unsigned next_id = 0;
1236 
1237       for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
1238          for (unsigned i = 0; i < deps0[ip].size(); i++) {
1239             const dependency &dep = deps0[ip][i];
1240 
1241             if (dep.unordered && ids[dep.id] == ~0u)
1242                ids[dep.id] = (next_id++) & (num_sbids - 1);
1243 
1244             add_dependency(ids, deps1[ip], dep);
1245          }
1246       }
1247 
1248       delete[] ids;
1249 
1250       return deps1;
1251    }
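
   /* E.g. with the fixed pool of 16 SBIDs the 17th distinct out-of-order
    * instruction wraps around, since ids[dep.id] == 16 & (num_sbids - 1) == 0,
    * and therefore shares hardware token 0 with the first one.  The XXX notes
    * above point out that smarter allocation (bin packing, or 32 tokens in
    * Xe2 large-GRF mode) could reduce that sharing in SEND-heavy shaders.
    */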
1252 
1253    /**
1254     * Emit dependency information provided by \p deps into the shader,
1255     * inserting additional SYNC instructions for dependencies that can't be
1256     * represented directly by annotating existing instructions.
1257     */
1258    void
1259    emit_inst_dependencies(fs_visitor *shader,
1260                           const ordered_address *jps,
1261                           const dependency_list *deps)
1262    {
1263       const struct intel_device_info *devinfo = shader->devinfo;
1264       unsigned ip = 0;
1265 
1266       foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
1267          const bool exec_all = inst->force_writemask_all;
1268          const bool ordered_mode =
1269             baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
1270          const tgl_sbid_mode unordered_mode =
1271             baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
1272          tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
1273             ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
1274 
1275          for (unsigned i = 0; i < deps[ip].size(); i++) {
1276             const dependency &dep = deps[ip][i];
1277 
1278             if (dep.unordered) {
1279                if (unordered_mode == dep.unordered &&
1280                    exec_all >= dep.exec_all && !swsb.mode) {
1281                   /* Bake unordered dependency into the instruction's SWSB if
1282                    * possible, except in cases where the current instruction
1283                    * isn't marked NoMask but the dependency is, since that
1284                    * might lead to data coherency issues due to
1285                    * Wa_1407528679.
1286                    */
1287                   swsb.sbid = dep.id;
1288                   swsb.mode = dep.unordered;
1289                } else {
1290                   /* Emit dependency into the SWSB of an extra SYNC
1291                    * instruction.
1292                    */
1293                   const fs_builder ibld = fs_builder(shader, block, inst)
1294                                           .exec_all().group(1, 0);
1295                   fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
1296                   sync->sched.sbid = dep.id;
1297                   sync->sched.mode = dep.unordered;
1298                   assert(!(sync->sched.mode & TGL_SBID_SET));
1299                }
1300             }
1301          }
1302 
1303          for (unsigned i = 0; i < deps[ip].size(); i++) {
1304             const dependency &dep = deps[ip][i];
1305 
1306             if (dep.ordered &&
1307                 find_ordered_dependency(deps[ip], jps[ip], true) &&
1308                 (!ordered_mode || dep.exec_all > exec_all)) {
1309                /* If the current instruction is not marked NoMask but an
1310                 * ordered dependency is, perform the synchronization as a
1311                 * separate NoMask SYNC instruction in order to avoid data
1312                 * coherency issues due to Wa_1407528679.  The similar
1313                 * scenario with unordered dependencies should have been
1314                 * handled above.
1315                 */
1316                const fs_builder ibld = fs_builder(shader, block, inst)
1317                                        .exec_all().group(1, 0);
1318                fs_inst *sync = ibld.SYNC(TGL_SYNC_NOP);
1319                sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
1320                break;
1321             }
1322          }
1323 
1324          /* Update the IR. */
1325          inst->sched = swsb;
1326          inst->no_dd_check = inst->no_dd_clear = false;
1327          ip++;
1328       }
1329    }
1330 }
1331 
1332 bool
1333 brw_fs_lower_scoreboard(fs_visitor &s)
1334 {
1335    if (s.devinfo->ver >= 12) {
1336       const ordered_address *jps = ordered_inst_addresses(&s);
1337       const dependency_list *deps0 = gather_inst_dependencies(&s, jps);
1338       const dependency_list *deps1 = allocate_inst_dependencies(&s, deps0);
1339       emit_inst_dependencies(&s, jps, deps1);
1340       delete[] deps1;
1341       delete[] deps0;
1342       delete[] jps;
1343    }
1344 
1345    return true;
1346 }
1347