1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file
25  *
26  * This file drives the GLSL IR -> LIR translation, contains the
27  * optimizations on the LIR, and drives the generation of native code
28  * from the LIR.
29  */
30 
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_fs_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_private.h"
38 #include "intel_nir.h"
39 #include "shader_enums.h"
40 #include "dev/intel_debug.h"
41 #include "dev/intel_wa.h"
42 #include "compiler/glsl_types.h"
43 #include "compiler/nir/nir_builder.h"
44 #include "util/u_math.h"
45 
46 using namespace brw;
47 
48 static void
49 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);
50 
51 void
52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
53               const brw_reg *src, unsigned sources)
54 {
55    memset((void*)this, 0, sizeof(*this));
56 
57    initialize_sources(this, src, sources);
58 
59    for (unsigned i = 0; i < sources; i++)
60       this->src[i] = src[i];
61 
62    this->opcode = opcode;
63    this->dst = dst;
64    this->exec_size = exec_size;
65 
66    assert(dst.file != IMM && dst.file != UNIFORM);
67 
68    assert(this->exec_size != 0);
69 
70    this->conditional_mod = BRW_CONDITIONAL_NONE;
71 
72    /* This will be the case for almost all instructions. */
73    switch (dst.file) {
74    case VGRF:
75    case ARF:
76    case FIXED_GRF:
77    case ATTR:
78       this->size_written = dst.component_size(exec_size);
79       break;
80    case BAD_FILE:
81       this->size_written = 0;
82       break;
83    case IMM:
84    case UNIFORM:
85       unreachable("Invalid destination register file");
86    }
87 
88    this->writes_accumulator = false;
89 }
90 
91 fs_inst::fs_inst()
92 {
93    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
94 }
95 
96 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
97 {
98    init(opcode, exec_size, reg_undef, NULL, 0);
99 }
100 
101 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
102 {
103    init(opcode, exec_size, dst, NULL, 0);
104 }
105 
106 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
107                  const brw_reg &src0)
108 {
109    const brw_reg src[1] = { src0 };
110    init(opcode, exec_size, dst, src, 1);
111 }
112 
113 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
114                  const brw_reg &src0, const brw_reg &src1)
115 {
116    const brw_reg src[2] = { src0, src1 };
117    init(opcode, exec_size, dst, src, 2);
118 }
119 
120 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
121                  const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
122 {
123    const brw_reg src[3] = { src0, src1, src2 };
124    init(opcode, exec_size, dst, src, 3);
125 }
126 
127 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
128                  const brw_reg src[], unsigned sources)
129 {
130    init(opcode, exec_width, dst, src, sources);
131 }
132 
133 fs_inst::fs_inst(const fs_inst &that)
134 {
135    memcpy((void*)this, &that, sizeof(that));
136    initialize_sources(this, that.src, that.sources);
137 }
138 
139 fs_inst::~fs_inst()
140 {
141    if (this->src != this->builtin_src)
142       delete[] this->src;
143 }
144 
145 static void
146 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
147 {
148    if (num_sources > ARRAY_SIZE(inst->builtin_src))
149       inst->src = new brw_reg[num_sources];
150    else
151       inst->src = inst->builtin_src;
152 
153    for (unsigned i = 0; i < num_sources; i++)
154       inst->src[i] = src[i];
155 
156    inst->sources = num_sources;
157 }
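/* Illustrative sketch (not upstream code, and assuming
 * ARRAY_SIZE(builtin_src) >= 3): ordinary ALU instructions keep their
 * sources in the inline builtin_src array, so constructing them performs
 * no heap allocation; only opcodes with long source lists (e.g.
 * SHADER_OPCODE_LOAD_PAYLOAD with many payload sources) take the
 * `new brw_reg[]` path above.
 *
 *    brw_reg srcs[3] = { a, b, c };                 // hypothetical operands
 *    fs_inst mad(BRW_OPCODE_MAD, 8, dst, srcs, 3);  // src == builtin_src
 */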
158 
159 void
160 fs_inst::resize_sources(uint8_t num_sources)
161 {
162    if (this->sources == num_sources)
163       return;
164 
165    brw_reg *old_src = this->src;
166    brw_reg *new_src;
167 
168    const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);
169 
170    if (old_src == this->builtin_src) {
171       if (num_sources > builtin_size) {
172          new_src = new brw_reg[num_sources];
173          for (unsigned i = 0; i < this->sources; i++)
174             new_src[i] = old_src[i];
175 
176       } else {
177          new_src = old_src;
178       }
179    } else {
180       if (num_sources <= builtin_size) {
181          new_src = this->builtin_src;
182          assert(this->sources > num_sources);
183          for (unsigned i = 0; i < num_sources; i++)
184             new_src[i] = old_src[i];
185 
186       } else if (num_sources < this->sources) {
187          new_src = old_src;
188 
189       } else {
190          new_src = new brw_reg[num_sources];
191          for (unsigned i = 0; i < num_sources; i++)
192             new_src[i] = old_src[i];
193       }
194 
195       if (old_src != new_src)
196          delete[] old_src;
197    }
198 
199    this->sources = num_sources;
200    this->src = new_src;
201 }
202 
203 bool
204 fs_inst::is_send_from_grf() const
205 {
206    switch (opcode) {
207    case SHADER_OPCODE_SEND:
208    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
209    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
210    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
211    case SHADER_OPCODE_INTERLOCK:
212    case SHADER_OPCODE_MEMORY_FENCE:
213    case SHADER_OPCODE_BARRIER:
214       return true;
215    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
216       return src[1].file == VGRF;
217    default:
218       return false;
219    }
220 }
221 
222 bool
223 fs_inst::is_control_source(unsigned arg) const
224 {
225    switch (opcode) {
226    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
227       return arg == 0;
228 
229    case SHADER_OPCODE_BROADCAST:
230    case SHADER_OPCODE_SHUFFLE:
231    case SHADER_OPCODE_QUAD_SWIZZLE:
232    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
233    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
234    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
235       return arg == 1;
236 
237    case SHADER_OPCODE_MOV_INDIRECT:
238    case SHADER_OPCODE_CLUSTER_BROADCAST:
239       return arg == 1 || arg == 2;
240 
241    case SHADER_OPCODE_SEND:
242       return arg == 0 || arg == 1;
243 
244    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
245    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
246    case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
247       return arg != MEMORY_LOGICAL_BINDING &&
248              arg != MEMORY_LOGICAL_ADDRESS &&
249              arg != MEMORY_LOGICAL_DATA0 &&
250              arg != MEMORY_LOGICAL_DATA1;
251 
252    default:
253       return false;
254    }
255 }
256 
257 bool
258 fs_inst::is_payload(unsigned arg) const
259 {
260    switch (opcode) {
261    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
262    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
263    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
264    case SHADER_OPCODE_INTERLOCK:
265    case SHADER_OPCODE_MEMORY_FENCE:
266    case SHADER_OPCODE_BARRIER:
267       return arg == 0;
268 
269    case SHADER_OPCODE_SEND:
270       return arg == 2 || arg == 3;
271 
272    default:
273       return false;
274    }
275 }
276 
277 /**
278  * Returns true if this instruction's sources and destinations cannot
279  * safely be the same register.
280  *
281  * In most cases, a register can be written over safely by the same
282  * instruction that is its last use.  For a single instruction, the
283  * sources are dereferenced before writing of the destination starts
284  * (naturally).
285  *
286  * However, there are a few cases where this can be problematic:
287  *
288  * - Virtual opcodes that translate to multiple instructions in the
289  *   code generator: if src == dst and one instruction writes the
290  *   destination before a later instruction reads the source, then
291  *   src will have been clobbered.
292  *
293  * - SIMD16 compressed instructions with certain regioning (see below).
294  *
295  * The register allocator uses this information to set up conflicts between
296  * GRF sources and the destination.
297  */
298 bool
299 fs_inst::has_source_and_destination_hazard() const
300 {
301    switch (opcode) {
302    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
303       /* Multiple partial writes to the destination */
304       return true;
305    case SHADER_OPCODE_SHUFFLE:
306       /* This instruction returns an arbitrary channel from the source and
307        * gets split into smaller instructions in the generator.  It's possible
308        * that one of the instructions will read from a channel corresponding
309        * to an earlier instruction.
310        */
311    case SHADER_OPCODE_SEL_EXEC:
312       /* This is implemented as
313        *
314        * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
315        * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
316        *
317        * Because the source is only read in the second instruction, the first
318        * may stomp all over it.
319        */
320       return true;
321    case SHADER_OPCODE_QUAD_SWIZZLE:
322       switch (src[1].ud) {
323       case BRW_SWIZZLE_XXXX:
324       case BRW_SWIZZLE_YYYY:
325       case BRW_SWIZZLE_ZZZZ:
326       case BRW_SWIZZLE_WWWW:
327       case BRW_SWIZZLE_XXZZ:
328       case BRW_SWIZZLE_YYWW:
329       case BRW_SWIZZLE_XYXY:
330       case BRW_SWIZZLE_ZWZW:
331          /* These can be implemented as a single Align1 region on all
332           * platforms, so there's never a hazard between source and
333           * destination.  C.f. fs_generator::generate_quad_swizzle().
334           */
335          return false;
336       default:
337          return !is_uniform(src[0]);
338       }
339    case BRW_OPCODE_DPAS:
340       /* This is overly conservative. The actual hazard is more complicated to
341        * describe. When the repeat count is N, the single instruction behaves
342        * like N instructions with a repeat count of one, but the destination
343        * and source registers are incremented (in somewhat complex ways) for
344        * each instruction.
345        *
346        * This means the source and destination register is actually a range of
347        * registers. The hazard exists if an earlier iteration would write a
348        * register that should be read by a later iteration.
349        *
350        * There may be some advantage to properly modeling this, but for now,
351        * be overly conservative.
352        */
353       return rcount > 1;
354    default:
355       /* The SIMD16 compressed instruction
356        *
357        * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
358        *
359        * is actually decoded in hardware as:
360        *
361        * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
362        * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
363        *
364        * Which is safe.  However, if we have uniform accesses
365        * happening, we get into trouble:
366        *
367        * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
368        * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
369        *
370        * Now our destination for the first instruction overwrote the
371        * second instruction's src0, and we get garbage for those 8
372        * pixels.  There's a similar issue for the pre-gfx6
373        * pixel_x/pixel_y, which are registers of 16-bit values and thus
374        * would get stomped by the first decode as well.
375        */
376       if (exec_size == 16) {
377          for (int i = 0; i < sources; i++) {
378             if (src[i].file == VGRF && (src[i].stride == 0 ||
379                                         src[i].type == BRW_TYPE_UW ||
380                                         src[i].type == BRW_TYPE_W ||
381                                         src[i].type == BRW_TYPE_UB ||
382                                         src[i].type == BRW_TYPE_B)) {
383                return true;
384             }
385          }
386       }
387       return false;
388    }
389 }
390 
391 bool
392 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
393 {
394    if (is_send_from_grf())
395       return false;
396 
397    /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
398     *
399     * "When multiplying a DW and any lower precision integer, source modifier
400     *  is not supported."
401     */
402    if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
403                               opcode == BRW_OPCODE_MAD)) {
404       const brw_reg_type exec_type = get_exec_type(this);
405       const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
406          MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
407          MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));
408 
409       if (brw_type_is_int(exec_type) &&
410           brw_type_size_bytes(exec_type) >= 4 &&
411           brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
412          return false;
413    }
414 
415    switch (opcode) {
416    case BRW_OPCODE_ADDC:
417    case BRW_OPCODE_BFE:
418    case BRW_OPCODE_BFI1:
419    case BRW_OPCODE_BFI2:
420    case BRW_OPCODE_BFREV:
421    case BRW_OPCODE_CBIT:
422    case BRW_OPCODE_FBH:
423    case BRW_OPCODE_FBL:
424    case BRW_OPCODE_ROL:
425    case BRW_OPCODE_ROR:
426    case BRW_OPCODE_SUBB:
427    case BRW_OPCODE_DP4A:
428    case BRW_OPCODE_DPAS:
429    case SHADER_OPCODE_BROADCAST:
430    case SHADER_OPCODE_CLUSTER_BROADCAST:
431    case SHADER_OPCODE_MOV_INDIRECT:
432    case SHADER_OPCODE_SHUFFLE:
433    case SHADER_OPCODE_INT_QUOTIENT:
434    case SHADER_OPCODE_INT_REMAINDER:
435       return false;
436    default:
437       return true;
438    }
439 }
440 
441 bool
442 fs_inst::can_do_cmod() const
443 {
444    switch (opcode) {
445    case BRW_OPCODE_ADD:
446    case BRW_OPCODE_ADD3:
447    case BRW_OPCODE_ADDC:
448    case BRW_OPCODE_AND:
449    case BRW_OPCODE_ASR:
450    case BRW_OPCODE_AVG:
451    case BRW_OPCODE_CMP:
452    case BRW_OPCODE_CMPN:
453    case BRW_OPCODE_DP2:
454    case BRW_OPCODE_DP3:
455    case BRW_OPCODE_DP4:
456    case BRW_OPCODE_DPH:
457    case BRW_OPCODE_FRC:
458    case BRW_OPCODE_LINE:
459    case BRW_OPCODE_LRP:
460    case BRW_OPCODE_LZD:
461    case BRW_OPCODE_MAC:
462    case BRW_OPCODE_MACH:
463    case BRW_OPCODE_MAD:
464    case BRW_OPCODE_MOV:
465    case BRW_OPCODE_MUL:
466    case BRW_OPCODE_NOT:
467    case BRW_OPCODE_OR:
468    case BRW_OPCODE_PLN:
469    case BRW_OPCODE_RNDD:
470    case BRW_OPCODE_RNDE:
471    case BRW_OPCODE_RNDU:
472    case BRW_OPCODE_RNDZ:
473    case BRW_OPCODE_SHL:
474    case BRW_OPCODE_SHR:
475    case BRW_OPCODE_SUBB:
476    case BRW_OPCODE_XOR:
477       break;
478    default:
479       return false;
480    }
481 
482    /* The accumulator result appears to get used for the conditional modifier
483     * generation.  When negating a UD value, there is a 33rd bit generated for
484     * the sign in the accumulator value, so now you can't check, for example,
485     * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
486     */
487    for (unsigned i = 0; i < sources; i++) {
488       if (brw_type_is_uint(src[i].type) && src[i].negate)
489          return false;
490    }
491 
492    return true;
493 }
494 
495 bool
496 fs_inst::can_change_types() const
497 {
498    return dst.type == src[0].type &&
499           !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
500           (opcode == BRW_OPCODE_MOV ||
501            (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
502            (opcode == BRW_OPCODE_SEL &&
503             dst.type == src[1].type &&
504             predicate != BRW_PREDICATE_NONE &&
505             !src[1].abs && !src[1].negate && src[1].file != ATTR));
506 }
507 
508 bool
509 brw_reg::equals(const brw_reg &r) const
510 {
511    return brw_regs_equal(this, &r);
512 }
513 
514 bool
515 brw_reg::negative_equals(const brw_reg &r) const
516 {
517    return brw_regs_negative_equal(this, &r);
518 }
519 
520 bool
521 brw_reg::is_contiguous() const
522 {
523    switch (file) {
524    case ARF:
525    case FIXED_GRF:
526       return hstride == BRW_HORIZONTAL_STRIDE_1 &&
527              vstride == width + hstride;
528    case VGRF:
529    case ATTR:
530       return stride == 1;
531    case UNIFORM:
532    case IMM:
533    case BAD_FILE:
534       return true;
535    }
536 
537    unreachable("Invalid register file");
538 }
539 
540 unsigned
541 brw_reg::component_size(unsigned width) const
542 {
543    if (file == ARF || file == FIXED_GRF) {
544       const unsigned w = MIN2(width, 1u << this->width);
545       const unsigned h = width >> this->width;
546       const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
547       const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
548       assert(w > 0);
549       /* Note this rounds up to next horizontal stride to be consistent with
550        * the VGRF case below.
551        */
552       return ((MAX2(1, h) - 1) * vs + MAX2(w * hs, 1)) * brw_type_size_bytes(type);
553    } else {
554       return MAX2(width * stride, 1) * brw_type_size_bytes(type);
555    }
556 }
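/* Worked example (illustrative): a VGRF of type BRW_TYPE_F with stride 1
 * read at width 8 spans MAX2(8 * 1, 1) * 4 = 32 bytes per component.  A
 * FIXED_GRF <8;8,1>:F region read at width 16 (so w = 8, h = 2, vs = 8,
 * hs = 1) spans ((2 - 1) * 8 + 8 * 1) * 4 = 64 bytes.
 */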
557 
558 void
559 fs_visitor::vfail(const char *format, va_list va)
560 {
561    char *msg;
562 
563    if (failed)
564       return;
565 
566    failed = true;
567 
568    msg = ralloc_vasprintf(mem_ctx, format, va);
569    msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
570          dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
571 
572    this->fail_msg = msg;
573 
574    if (unlikely(debug_enabled)) {
575       fprintf(stderr, "%s",  msg);
576    }
577 }
578 
579 void
580 fs_visitor::fail(const char *format, ...)
581 {
582    va_list va;
583 
584    va_start(va, format);
585    vfail(format, va);
586    va_end(va);
587 }
588 
589 /**
590  * Mark this program as impossible to compile with dispatch width greater
591  * than n.
592  *
593  * During the SIMD8 compile (which happens first), we can detect and flag
594  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
595  * SIMD16+ compile altogether.
596  *
597  * During a compile of dispatch width greater than n (if one happens anyway),
598  * this just calls fail().
599  */
600 void
601 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
602 {
603    if (dispatch_width > n) {
604       fail("%s", msg);
605    } else {
606       max_dispatch_width = MIN2(max_dispatch_width, n);
607       brw_shader_perf_log(compiler, log_data,
608                           "Shader dispatch width limited to SIMD%d: %s\n",
609                           n, msg);
610    }
611 }
612 
613 /**
614  * Returns true if the instruction has a flag that means it won't
615  * update an entire destination register.
616  *
617  * For example, dead code elimination and live variable analysis want to know
618  * when a write to a variable screens off any preceding values that were in
619  * it.
620  */
621 bool
622 fs_inst::is_partial_write() const
623 {
624    if (this->predicate && !this->predicate_trivial &&
625        this->opcode != BRW_OPCODE_SEL)
626       return true;
627 
628    if (!this->dst.is_contiguous())
629       return true;
630 
631    if (this->dst.offset % REG_SIZE != 0)
632       return true;
633 
634    return this->size_written % REG_SIZE != 0;
635 }
636 
637 unsigned
638 fs_inst::components_read(unsigned i) const
639 {
640    /* Return zero if the source is not present. */
641    if (src[i].file == BAD_FILE)
642       return 0;
643 
644    switch (opcode) {
645    case BRW_OPCODE_PLN:
646       return i == 0 ? 1 : 2;
647 
648    case FS_OPCODE_PIXEL_X:
649    case FS_OPCODE_PIXEL_Y:
650       assert(i < 2);
651       if (i == 0)
652          return 2;
653       else
654          return 1;
655 
656    case FS_OPCODE_FB_WRITE_LOGICAL:
657       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
658       /* First/second FB write color. */
659       if (i < 2)
660          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
661       else
662          return 1;
663 
664    case SHADER_OPCODE_TEX_LOGICAL:
665    case SHADER_OPCODE_TXD_LOGICAL:
666    case SHADER_OPCODE_TXF_LOGICAL:
667    case SHADER_OPCODE_TXL_LOGICAL:
668    case SHADER_OPCODE_TXS_LOGICAL:
669    case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
670    case FS_OPCODE_TXB_LOGICAL:
671    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
672    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
673    case SHADER_OPCODE_TXF_MCS_LOGICAL:
674    case SHADER_OPCODE_LOD_LOGICAL:
675    case SHADER_OPCODE_TG4_LOGICAL:
676    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
677    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
678    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
679    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
680    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
681    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
682    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
683       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
684              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
685              src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
686       /* Texture coordinates. */
687       if (i == TEX_LOGICAL_SRC_COORDINATE)
688          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
689       /* Texture derivatives. */
690       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
691                opcode == SHADER_OPCODE_TXD_LOGICAL)
692          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
693       /* Texture offset. */
694       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
695          return 2;
696       /* MCS */
697       else if (i == TEX_LOGICAL_SRC_MCS) {
698          if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
699             return 2;
700          else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
701             return 4;
702          else
703             return 1;
704       } else
705          return 1;
706 
707    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
708       if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
709          return 0;
710       /* fallthrough */
711    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
712       if (i == MEMORY_LOGICAL_DATA1)
713          return 0;
714       /* fallthrough */
715    case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
716       if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
717          return src[MEMORY_LOGICAL_COMPONENTS].ud;
718       else if (i == MEMORY_LOGICAL_ADDRESS)
719          return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
720       else
721          return 1;
722 
723    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
724       return (i == 0 ? 2 : 1);
725 
726    case SHADER_OPCODE_URB_WRITE_LOGICAL:
727       assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
728 
729       if (i == URB_LOGICAL_SRC_DATA)
730          return src[URB_LOGICAL_SRC_COMPONENTS].ud;
731       else
732          return 1;
733 
734    case BRW_OPCODE_DPAS:
735       unreachable("Do not use components_read() for DPAS.");
736 
737    default:
738       return 1;
739    }
740 }
741 
742 unsigned
743 fs_inst::size_read(int arg) const
744 {
745    switch (opcode) {
746    case SHADER_OPCODE_SEND:
747       if (arg == 2) {
748          return mlen * REG_SIZE;
749       } else if (arg == 3) {
750          return ex_mlen * REG_SIZE;
751       }
752       break;
753 
754    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
755    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
756       if (arg == 0)
757          return mlen * REG_SIZE;
758       break;
759 
760    case BRW_OPCODE_PLN:
761       if (arg == 0)
762          return 16;
763       break;
764 
765    case SHADER_OPCODE_LOAD_PAYLOAD:
766       if (arg < this->header_size)
767          return retype(src[arg], BRW_TYPE_UD).component_size(8);
768       break;
769 
770    case SHADER_OPCODE_BARRIER:
771       return REG_SIZE;
772 
773    case SHADER_OPCODE_MOV_INDIRECT:
774       if (arg == 0) {
775          assert(src[2].file == IMM);
776          return src[2].ud;
777       }
778       break;
779 
780    case BRW_OPCODE_DPAS: {
781       /* This is a little bit sketchy. There's no way to get at devinfo from
782        * here, so the regular reg_unit() cannot be used. However, on
783        * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
784        * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
785        * coincidence, so this isn't so bad.
786        */
787       const unsigned reg_unit = this->exec_size / 8;
788 
789       switch (arg) {
790       case 0:
791          if (src[0].type == BRW_TYPE_HF) {
792             return rcount * reg_unit * REG_SIZE / 2;
793          } else {
794             return rcount * reg_unit * REG_SIZE;
795          }
796       case 1:
797          return sdepth * reg_unit * REG_SIZE;
798       case 2:
799          /* This is simpler than the formula described in the Bspec, but it
800           * covers all of the cases that we support. Each inner sdepth
801           * iteration of the DPAS consumes a single dword for int8, uint8, or
802           * float16 types. These are the only source types currently
803           * supportable through Vulkan. This is independent of reg_unit.
804           */
805          return rcount * sdepth * 4;
806       default:
807          unreachable("Invalid source number.");
808       }
809       break;
810    }
811 
812    default:
813       break;
814    }
815 
816    switch (src[arg].file) {
817    case UNIFORM:
818    case IMM:
819       return components_read(arg) * brw_type_size_bytes(src[arg].type);
820    case BAD_FILE:
821    case ARF:
822    case FIXED_GRF:
823    case VGRF:
824    case ATTR:
825       return components_read(arg) * src[arg].component_size(exec_size);
826    }
827    return 0;
828 }
829 
830 namespace {
831    unsigned
832    predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
833    {
834       if (devinfo->ver >= 20) {
835          return 1;
836       } else {
837          switch (predicate) {
838          case BRW_PREDICATE_NONE:            return 1;
839          case BRW_PREDICATE_NORMAL:          return 1;
840          case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
841          case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
842          case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
843          case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
844          case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
845          case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
846          case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
847          case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
848          case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
849          case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
850          default: unreachable("Unsupported predicate");
851          }
852       }
853    }
854 }
855 
856 unsigned
857 fs_inst::flags_read(const intel_device_info *devinfo) const
858 {
859    if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
860                              predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
861       /* The vertical predication modes combine corresponding bits from
862        * f0.0 and f1.0 on Gfx7+.
863        */
864       const unsigned shift = 4;
865       return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
866    } else if (predicate) {
867       return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
868    } else {
869       unsigned mask = 0;
870       for (int i = 0; i < sources; i++) {
871          mask |= brw_fs_flag_mask(src[i], size_read(i));
872       }
873       return mask;
874    }
875 }
876 
877 unsigned
878 fs_inst::flags_written(const intel_device_info *devinfo) const
879 {
880    if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
881                            opcode != BRW_OPCODE_CSEL &&
882                            opcode != BRW_OPCODE_IF &&
883                            opcode != BRW_OPCODE_WHILE)) {
884       return brw_fs_flag_mask(this, 1);
885    } else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
886       return brw_fs_flag_mask(this, 32);
887    } else {
888       return brw_fs_flag_mask(dst, size_written);
889    }
890 }
891 
892 bool
893 fs_inst::has_sampler_residency() const
894 {
895    switch (opcode) {
896    case SHADER_OPCODE_TEX_LOGICAL:
897    case FS_OPCODE_TXB_LOGICAL:
898    case SHADER_OPCODE_TXL_LOGICAL:
899    case SHADER_OPCODE_TXD_LOGICAL:
900    case SHADER_OPCODE_TXF_LOGICAL:
901    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
902    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
903    case SHADER_OPCODE_TXS_LOGICAL:
904    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
905    case SHADER_OPCODE_TG4_LOGICAL:
906    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
907    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
908    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
909    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
910    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
911       assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
912       return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
913    default:
914       return false;
915    }
916 }
917 
918 /* \sa inst_is_raw_move in brw_eu_validate. */
919 bool
920 fs_inst::is_raw_move() const
921 {
922    if (opcode != BRW_OPCODE_MOV)
923       return false;
924 
925    if (src[0].file == IMM) {
926       if (brw_type_is_vector_imm(src[0].type))
927          return false;
928    } else if (src[0].negate || src[0].abs) {
929       return false;
930    }
931 
932    if (saturate)
933       return false;
934 
935    return src[0].type == dst.type ||
936           (brw_type_is_int(src[0].type) &&
937            brw_type_is_int(dst.type) &&
938            brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
939 }
940 
941 /* For SIMD16, we need to reuse the uniform setup from the SIMD8 dispatch.
942  * This brings in those uniform definitions.
943  */
944 void
945 fs_visitor::import_uniforms(fs_visitor *v)
946 {
947    this->push_constant_loc = v->push_constant_loc;
948    this->uniforms = v->uniforms;
949 }
950 
951 enum brw_barycentric_mode
952 brw_barycentric_mode(const struct brw_wm_prog_key *key,
953                      nir_intrinsic_instr *intr)
954 {
955    const glsl_interp_mode mode =
956       (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
957 
958    /* Barycentric modes don't make sense for flat inputs. */
959    assert(mode != INTERP_MODE_FLAT);
960 
961    unsigned bary;
962    switch (intr->intrinsic) {
963    case nir_intrinsic_load_barycentric_pixel:
964    case nir_intrinsic_load_barycentric_at_offset:
965       /* When per sample interpolation is dynamic, assume sample
966        * interpolation. We'll dynamically remap things so that the FS thread
967        * payload is not affected.
968        */
969       bary = key->persample_interp == BRW_SOMETIMES ?
970              BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE :
971              BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
972       break;
973    case nir_intrinsic_load_barycentric_centroid:
974       bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
975       break;
976    case nir_intrinsic_load_barycentric_sample:
977    case nir_intrinsic_load_barycentric_at_sample:
978       bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
979       break;
980    default:
981       unreachable("invalid intrinsic");
982    }
983 
984    if (mode == INTERP_MODE_NOPERSPECTIVE)
985       bary += 3;
986 
987    return (enum brw_barycentric_mode) bary;
988 }
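/* Example: nir_intrinsic_load_barycentric_centroid with
 * INTERP_MODE_NOPERSPECTIVE maps to BRW_BARYCENTRIC_PERSPECTIVE_CENTROID + 3,
 * i.e. the corresponding non-perspective centroid mode.  The "+ 3" relies on
 * the enum laying out the three non-perspective modes directly after their
 * perspective counterparts.
 */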
989 
990 /**
991  * Walk backwards from the end of the program looking for a URB write that
992  * isn't in control flow, and mark it with EOT.
993  *
994  * Return true if successful or false if a separate EOT write is needed.
995  */
996 bool
mark_last_urb_write_with_eot()997 fs_visitor::mark_last_urb_write_with_eot()
998 {
999    foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1000       if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
1001          prev->eot = true;
1002 
1003          /* Delete now dead instructions. */
1004          foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1005             if (dead == prev)
1006                break;
1007             dead->remove();
1008          }
1009          return true;
1010       } else if (prev->is_control_flow() || prev->has_side_effects()) {
1011          break;
1012       }
1013    }
1014 
1015    return false;
1016 }
1017 
1018 static unsigned
1019 round_components_to_whole_registers(const intel_device_info *devinfo,
1020                                     unsigned c)
1021 {
1022    return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
1023 }
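/* Example: with reg_unit(devinfo) == 1 (32-byte GRFs), 20 dword components
 * round up to DIV_ROUND_UP(20, 8) * 1 = 3 registers; on a platform where
 * reg_unit(devinfo) == 2, the same 20 components become
 * DIV_ROUND_UP(20, 16) * 2 = 4, keeping the count a multiple of the unit.
 */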
1024 
1025 void
1026 fs_visitor::assign_curb_setup()
1027 {
1028    unsigned uniform_push_length =
1029       round_components_to_whole_registers(devinfo, prog_data->nr_params);
1030 
1031    unsigned ubo_push_length = 0;
1032    unsigned ubo_push_start[4];
1033    for (int i = 0; i < 4; i++) {
1034       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1035       ubo_push_length += prog_data->ubo_ranges[i].length;
1036 
1037       assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
1038       assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
1039    }
1040 
1041    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1042    if (stage == MESA_SHADER_FRAGMENT &&
1043        ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
1044       prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
1045 
1046    uint64_t used = 0;
1047    bool is_compute = gl_shader_stage_is_compute(stage);
1048 
1049    if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
1050       /* With COMPUTE_WALKER, we can push up to one register worth of data via
1051        * the inline data parameter in the COMPUTE_WALKER command itself.
1052        *
1053        * TODO: Support inline data and push at the same time.
1054        */
1055       assert(devinfo->verx10 >= 125);
1056       assert(uniform_push_length <= reg_unit(devinfo));
1057    } else if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
1058       assert(devinfo->has_lsc);
1059       fs_builder ubld = fs_builder(this, 1).exec_all().at(
1060          cfg->first_block(), cfg->first_block()->start());
1061 
1062       /* The base offset for our push data is passed in as R0.0[31:6]. We have
1063        * to mask off the bottom 6 bits.
1064        */
1065       brw_reg base_addr =
1066          ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
1067                   brw_imm_ud(INTEL_MASK(31, 6)));
1068 
1069       /* On Gfx12-HP we load constants at the start of the program using A32
1070        * stateless messages.
1071        */
1072       for (unsigned i = 0; i < uniform_push_length;) {
1073          /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
1074          unsigned num_regs = MIN2(uniform_push_length - i, 8);
1075          assert(num_regs > 0);
1076          num_regs = 1 << util_logbase2(num_regs);
1077 
1078          /* This pass occurs after all of the optimization passes, so don't
1079           * emit an 'ADD addr, base_addr, 0' instruction.
1080           */
1081          brw_reg addr = i == 0 ? base_addr :
1082             ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
1083 
1084          brw_reg srcs[4] = {
1085             brw_imm_ud(0), /* desc */
1086             brw_imm_ud(0), /* ex_desc */
1087             addr,          /* payload */
1088             brw_reg(),      /* payload2 */
1089          };
1090 
1091          brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
1092                               BRW_TYPE_UD);
1093          fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
1094 
1095          send->sfid = GFX12_SFID_UGM;
1096          send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1097                                    LSC_ADDR_SURFTYPE_FLAT,
1098                                    LSC_ADDR_SIZE_A32,
1099                                    LSC_DATA_SIZE_D32,
1100                                    num_regs * 8 /* num_channels */,
1101                                    true /* transpose */,
1102                                    LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1103          send->header_size = 0;
1104          send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
1105          send->size_written =
1106             lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
1107          send->send_is_volatile = true;
1108 
1109          i += num_regs;
1110       }
1111 
1112       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1113    }
1114 
1115    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1116    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1117       for (unsigned int i = 0; i < inst->sources; i++) {
1118 	 if (inst->src[i].file == UNIFORM) {
1119             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1120             int constant_nr;
1121             if (inst->src[i].nr >= UBO_START) {
1122                /* constant_nr is in 32-bit units, the rest are in bytes */
1123                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1124                              inst->src[i].offset / 4;
1125             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1126                constant_nr = push_constant_loc[uniform_nr];
1127             } else {
1128                /* Section 5.11 of the OpenGL 4.1 spec says:
1129                 * "Out-of-bounds reads return undefined values, which include
1130                 *  values from other variables of the active program or zero."
1131                 * Just return the first push constant.
1132                 */
1133                constant_nr = 0;
1134             }
1135 
1136             assert(constant_nr / 8 < 64);
1137             used |= BITFIELD64_BIT(constant_nr / 8);
1138 
1139 	    struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
1140 						  constant_nr / 8,
1141 						  constant_nr % 8);
1142             brw_reg.abs = inst->src[i].abs;
1143             brw_reg.negate = inst->src[i].negate;
1144 
1145             assert(inst->src[i].stride == 0);
1146             inst->src[i] = byte_offset(
1147                retype(brw_reg, inst->src[i].type),
1148                inst->src[i].offset % 4);
1149 	 }
1150       }
1151    }
1152 
1153    uint64_t want_zero = used & prog_data->zero_push_reg;
1154    if (want_zero) {
1155       fs_builder ubld = fs_builder(this, 8).exec_all().at(
1156          cfg->first_block(), cfg->first_block()->start());
1157 
1158       /* push_reg_mask_param is in 32-bit units */
1159       unsigned mask_param = prog_data->push_reg_mask_param;
1160       struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
1161                                                               mask_param % 8);
1162 
1163       brw_reg b32;
1164       for (unsigned i = 0; i < 64; i++) {
1165          if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1166             brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
1167             ubld.SHL(horiz_offset(shifted, 8),
1168                      byte_offset(retype(mask, BRW_TYPE_W), i / 8),
1169                      brw_imm_v(0x01234567));
1170             ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1171 
1172             fs_builder ubld16 = ubld.group(16, 0);
1173             b32 = ubld16.vgrf(BRW_TYPE_D);
1174             ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1175          }
1176 
1177          if (want_zero & BITFIELD64_BIT(i)) {
1178             assert(i < prog_data->curb_read_length);
1179             struct brw_reg push_reg =
1180                retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
1181 
1182             ubld.AND(push_reg, push_reg, component(b32, i % 16));
1183          }
1184       }
1185 
1186       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1187    }
1188 
1189    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1190    this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1191 }
1192 
1193 /*
1194  * Build up an array of indices into the urb_setup array that
1195  * references the active entries of the urb_setup array.
1196  * Used to accelerate walking the active entries of the urb_setup array
1197  * on each upload.
1198  */
1199 void
1200 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1201 {
1202    /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1203     * skip per-primitive attributes here.
1204     */
1205 
1206    /* Make sure uint8_t is sufficient */
1207    STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1208    uint8_t index = 0;
1209    for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1210       if (wm_prog_data->urb_setup[attr] >= 0) {
1211          wm_prog_data->urb_setup_attribs[index++] = attr;
1212       }
1213    }
1214    wm_prog_data->urb_setup_attribs_count = index;
1215 }
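/* Example (hypothetical urb_setup contents): if only VARYING_SLOT_POS and
 * VARYING_SLOT_VAR0 have urb_setup[] >= 0, urb_setup_attribs becomes
 * { VARYING_SLOT_POS, VARYING_SLOT_VAR0 } with urb_setup_attribs_count == 2,
 * so each upload walks two entries instead of all VARYING_SLOT_MAX slots.
 */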
1216 
1217 void
1218 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1219 {
1220    for (int i = 0; i < inst->sources; i++) {
1221       if (inst->src[i].file == ATTR) {
1222          assert(inst->src[i].nr == 0);
1223          int grf = payload().num_regs +
1224                    prog_data->curb_read_length +
1225                    inst->src[i].offset / REG_SIZE;
1226 
1227          /* As explained at brw_reg_from_fs_reg, from the Haswell PRM:
1228           *
1229           * VertStride must be used to cross GRF register boundaries. This
1230           * rule implies that elements within a 'Width' cannot cross GRF
1231           * boundaries.
1232           *
1233           * So, for registers that are large enough, we have to split the exec
1234           * size in two and trust the compression state to sort it out.
1235           */
1236          unsigned total_size = inst->exec_size *
1237                                inst->src[i].stride *
1238                                brw_type_size_bytes(inst->src[i].type);
1239 
1240          assert(total_size <= 2 * REG_SIZE);
1241          const unsigned exec_size =
1242             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1243 
1244          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1245          struct brw_reg reg =
1246             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1247                                inst->src[i].offset % REG_SIZE),
1248                    exec_size * inst->src[i].stride,
1249                    width, inst->src[i].stride);
1250          reg.abs = inst->src[i].abs;
1251          reg.negate = inst->src[i].negate;
1252 
1253          inst->src[i] = reg;
1254       }
1255    }
1256 }
1257 
1258 int
1259 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1260                                 const brw_stage_prog_data *prog_data)
1261 {
1262    if (prog_data->nr_params == 0)
1263       return -1;
1264 
1265    if (devinfo->verx10 >= 125)
1266       return -1;
1267 
1268    /* The local thread id is always the last parameter in the list */
1269    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1270    if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1271       return prog_data->nr_params - 1;
1272 
1273    return -1;
1274 }
1275 
1276 /**
1277  * Assign UNIFORM file registers to either push constants or pull constants.
1278  *
1279  * We allow a fragment shader to have more than the specified minimum
1280  * maximum number of fragment shader uniform components (64).  If
1281  * there are too many of these, they'd fill up all of register space.
1282  * So, this will push some of them out to the pull constant buffer and
1283  * update the program to load them.
1284  */
1285 void
1286 fs_visitor::assign_constant_locations()
1287 {
1288    /* Only the first compile gets to decide on locations. */
1289    if (push_constant_loc)
1290       return;
1291 
1292    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1293    for (unsigned u = 0; u < uniforms; u++)
1294       push_constant_loc[u] = u;
1295 
1296    /* Now that we know how many regular uniforms we'll push, reduce the
1297     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1298     *
1299     * If changing this value, note the limitation about total_regs in
1300     * brw_curbe.c/crocus_state.c
1301     */
1302    const unsigned max_push_length = 64;
1303    unsigned push_length =
1304       round_components_to_whole_registers(devinfo, prog_data->nr_params);
1305    for (int i = 0; i < 4; i++) {
1306       struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
1307 
1308       if (push_length + range->length > max_push_length)
1309          range->length = max_push_length - push_length;
1310 
1311       push_length += range->length;
1312 
1313       assert(push_length % (1 * reg_unit(devinfo)) == 0);
1314 
1315    }
1316    assert(push_length <= max_push_length);
1317 }
1318 
1319 bool
1320 fs_visitor::get_pull_locs(const brw_reg &src,
1321                           unsigned *out_surf_index,
1322                           unsigned *out_pull_index)
1323 {
1324    assert(src.file == UNIFORM);
1325 
1326    if (src.nr < UBO_START)
1327       return false;
1328 
1329    const struct brw_ubo_range *range =
1330       &prog_data->ubo_ranges[src.nr - UBO_START];
1331 
1332    /* If this access is in our (reduced) range, use the push data. */
1333    if (src.offset / 32 < range->length)
1334       return false;
1335 
1336    *out_surf_index = range->block;
1337    *out_pull_index = (32 * range->start + src.offset) / 4;
1338 
1339    prog_data->has_ubo_pull = true;
1340 
1341    return true;
1342 }
1343 
1344 /**
1345  * Get the mask of SIMD channels enabled during dispatch and not yet disabled
1346  * by discard.  Due to the layout of the sample mask in the fragment shader
1347  * thread payload, \p bld is required to have a dispatch_width() not greater
1348  * than 16 for fragment shaders.
1349  */
1350 brw_reg
1351 brw_sample_mask_reg(const fs_builder &bld)
1352 {
1353    const fs_visitor &s = *bld.shader;
1354 
1355    if (s.stage != MESA_SHADER_FRAGMENT) {
1356       return brw_imm_ud(0xffffffff);
1357    } else if (s.devinfo->ver >= 20 ||
1358               brw_wm_prog_data(s.prog_data)->uses_kill) {
1359       return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
1360    } else {
1361       assert(bld.dispatch_width() <= 16);
1362       assert(s.devinfo->ver < 20);
1363       return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
1364                     BRW_TYPE_UW);
1365    }
1366 }
1367 
1368 uint32_t
1369 brw_fb_write_msg_control(const fs_inst *inst,
1370                          const struct brw_wm_prog_data *prog_data)
1371 {
1372    uint32_t mctl;
1373 
1374    if (prog_data->dual_src_blend) {
1375       assert(inst->exec_size < 32);
1376 
1377       if (inst->group % 16 == 0)
1378          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
1379       else if (inst->group % 16 == 8)
1380          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
1381       else
1382          unreachable("Invalid dual-source FB write instruction group");
1383    } else {
1384       assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
1385 
1386       if (inst->exec_size == 16)
1387          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1388       else if (inst->exec_size == 8)
1389          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1390       else if (inst->exec_size == 32)
1391          mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
1392       else
1393          unreachable("Invalid FB write execution size");
1394    }
1395 
1396    return mctl;
1397 }
1398 
1399  /**
1400  * Predicate the specified instruction on the sample mask.
1401  */
1402 void
1403 brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
1404 {
1405    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1406           bld.group() == inst->group &&
1407           bld.dispatch_width() == inst->exec_size);
1408 
1409    const fs_visitor &s = *bld.shader;
1410    const brw_reg sample_mask = brw_sample_mask_reg(bld);
1411    const unsigned subreg = sample_mask_flag_subreg(s);
1412 
1413    if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
1414       assert(sample_mask.file == ARF &&
1415              sample_mask.nr == brw_flag_subreg(subreg).nr &&
1416              sample_mask.subnr == brw_flag_subreg(
1417                 subreg + inst->group / 16).subnr);
1418    } else {
1419       bld.group(1, 0).exec_all()
1420          .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
1421    }
1422 
1423    if (inst->predicate) {
1424       assert(inst->predicate == BRW_PREDICATE_NORMAL);
1425       assert(!inst->predicate_inverse);
1426       assert(inst->flag_subreg == 0);
1427       assert(s.devinfo->ver < 20);
1428       /* Combine the sample mask with the existing predicate by using a
1429        * vertical predication mode.
1430        */
1431       inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
1432    } else {
1433       inst->flag_subreg = subreg;
1434       inst->predicate = BRW_PREDICATE_NORMAL;
1435       inst->predicate_inverse = false;
1436    }
1437 }
1438 
1439 brw::register_pressure::register_pressure(const fs_visitor *v)
1440 {
1441    const fs_live_variables &live = v->live_analysis.require();
1442    const unsigned num_instructions = v->cfg->num_blocks ?
1443       v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
1444 
1445    regs_live_at_ip = new unsigned[num_instructions]();
1446 
1447    for (unsigned reg = 0; reg < v->alloc.count; reg++) {
1448       for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
1449          regs_live_at_ip[ip] += v->alloc.sizes[reg];
1450    }
1451 
1452    const unsigned payload_count = v->first_non_payload_grf;
1453 
1454    int *payload_last_use_ip = new int[payload_count];
1455    v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
1456 
1457    for (unsigned reg = 0; reg < payload_count; reg++) {
1458       for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
1459          ++regs_live_at_ip[ip];
1460    }
1461 
1462    delete[] payload_last_use_ip;
1463 }
1464 
1465 brw::register_pressure::~register_pressure()
1466 {
1467    delete[] regs_live_at_ip;
1468 }
1469 
1470 void
1471 fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
1472 {
1473    live_analysis.invalidate(c);
1474    regpressure_analysis.invalidate(c);
1475    idom_analysis.invalidate(c);
1476    def_analysis.invalidate(c);
1477 }
1478 
1479 void
1480 fs_visitor::debug_optimizer(const nir_shader *nir,
1481                             const char *pass_name,
1482                             int iteration, int pass_num) const
1483 {
1484    if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
1485       return;
1486 
1487    char *filename;
1488    int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
1489                       debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
1490                       _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
1491                       iteration, pass_num, pass_name);
1492    if (ret == -1)
1493       return;
1494    brw_print_instructions(*this, filename);
1495    free(filename);
1496 }
1497 
1498 static uint32_t
1499 brw_compute_max_register_pressure(fs_visitor &s)
1500 {
1501    const register_pressure &rp = s.regpressure_analysis.require();
1502    uint32_t ip = 0, max_pressure = 0;
1503    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1504       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
1505       ip++;
1506    }
1507    return max_pressure;
1508 }
1509 
1510 static fs_inst **
1511 save_instruction_order(const struct cfg_t *cfg)
1512 {
1513    /* Before we schedule anything, stash off the instruction order as an array
1514     * of fs_inst *.  This way, we can reset it between scheduling passes to
1515     * prevent dependencies between the different scheduling modes.
1516     */
1517    int num_insts = cfg->last_block()->end_ip + 1;
1518    fs_inst **inst_arr = new fs_inst * [num_insts];
1519 
1520    int ip = 0;
1521    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1522       assert(ip >= block->start_ip && ip <= block->end_ip);
1523       inst_arr[ip++] = inst;
1524    }
1525    assert(ip == num_insts);
1526 
1527    return inst_arr;
1528 }
1529 
1530 static void
1531 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
1532 {
1533    ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
1534 
1535    int ip = 0;
1536    foreach_block (block, cfg) {
1537       block->instructions.make_empty();
1538 
1539       assert(ip == block->start_ip);
1540       for (; ip <= block->end_ip; ip++)
1541          block->instructions.push_tail(inst_arr[ip]);
1542    }
1543    assert(ip == num_insts);
1544 }
1545 
1546 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
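/* For example, a request of 1500 bytes rounds up to 2048, and anything at or
 * below 1024 bytes gets the 1KB minimum.
 */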
1547 static inline unsigned
1548 brw_get_scratch_size(int size)
1549 {
1550    return MAX2(1024, util_next_power_of_two(size));
1551 }
1552 
1553 void
1554 brw_allocate_registers(fs_visitor &s, bool allow_spilling)
1555 {
1556    const struct intel_device_info *devinfo = s.devinfo;
1557    const nir_shader *nir = s.nir;
1558    bool allocated;
1559 
1560    static const enum instruction_scheduler_mode pre_modes[] = {
1561       SCHEDULE_PRE,
1562       SCHEDULE_PRE_NON_LIFO,
1563       SCHEDULE_NONE,
1564       SCHEDULE_PRE_LIFO,
1565    };
1566 
1567    static const char *scheduler_mode_name[] = {
1568       [SCHEDULE_PRE] = "top-down",
1569       [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
1570       [SCHEDULE_PRE_LIFO] = "lifo",
1571       [SCHEDULE_POST] = "post",
1572       [SCHEDULE_NONE] = "none",
1573    };
1574 
1575    uint32_t best_register_pressure = UINT32_MAX;
1576    enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
1577 
1578    brw_fs_opt_compact_virtual_grfs(s);
1579 
1580    if (s.needs_register_pressure)
1581       s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
1582 
1583    s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
1584 
1585    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
1586 
1587    /* Before we schedule anything, stash off the instruction order as an array
1588     * of fs_inst *.  This way, we can reset it between scheduling passes to
1589     * prevent dependencies between the different scheduling modes.
1590     */
1591    fs_inst **orig_order = save_instruction_order(s.cfg);
1592    fs_inst **best_pressure_order = NULL;
1593 
1594    void *scheduler_ctx = ralloc_context(NULL);
1595    instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
1596 
1597    /* Try each scheduling heuristic to see if it can successfully register
1598     * allocate without spilling.  They should be ordered by decreasing
1599     * performance but increasing likelihood of allocating.
1600     */
1601    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
1602       enum instruction_scheduler_mode sched_mode = pre_modes[i];
1603 
1604       brw_schedule_instructions_pre_ra(s, sched, sched_mode);
1605       s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
1606 
1607       s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
1608 
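      /* Debug escape hatch: flipping this to 1 skips the real allocator and
       * assigns registers trivially via brw_assign_regs_trivial().
       */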
1609       if (0) {
1610          brw_assign_regs_trivial(s);
1611          allocated = true;
1612          break;
1613       }
1614 
1615       /* We should only spill registers on the last scheduling. */
1616       assert(!s.spilled_any_registers);
1617 
1618       allocated = brw_assign_regs(s, false, spill_all);
1619       if (allocated)
1620          break;
1621 
1622       /* Save the maximum register pressure */
1623       uint32_t this_pressure = brw_compute_max_register_pressure(s);
1624 
1625       if (0) {
1626          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
1627                  scheduler_mode_name[sched_mode], this_pressure);
1628       }
1629 
1630       if (this_pressure < best_register_pressure) {
1631          best_register_pressure = this_pressure;
1632          best_sched = sched_mode;
1633          delete[] best_pressure_order;
1634          best_pressure_order = save_instruction_order(s.cfg);
1635       }
1636 
1637       /* Reset back to the original order before trying the next mode */
1638       restore_instruction_order(s.cfg, orig_order);
1639       s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1640    }
1641 
1642    ralloc_free(scheduler_ctx);
1643 
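   /* None of the pre-RA schedules register allocated without spilling.  Fall
    * back to the schedule with the lowest register pressure and retry with
    * spilling allowed (if the caller permits it).
    */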
1644    if (!allocated) {
1645       if (0) {
1646          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
1647                  scheduler_mode_name[best_sched]);
1648       }
1649       restore_instruction_order(s.cfg, best_pressure_order);
1650       s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
1651 
1652       allocated = brw_assign_regs(s, allow_spilling, spill_all);
1653    }
1654 
1655    delete[] orig_order;
1656    delete[] best_pressure_order;
1657 
1658    if (!allocated) {
1659       s.fail("Failed to register allocate.  Reduce the number of "
1660              "live scalar values to avoid this.");
1661    } else if (s.spilled_any_registers) {
1662       brw_shader_perf_log(s.compiler, s.log_data,
1663                           "%s shader triggered register spilling.  "
1664                           "Try reducing the number of live scalar "
1665                           "values to improve performance.\n",
1666                           _mesa_shader_stage_to_string(s.stage));
1667    }
1668 
1669    if (s.failed)
1670       return;
1671 
1672    s.debug_optimizer(nir, "post_ra_alloc", 96, 0);
1673 
1674    brw_fs_opt_bank_conflicts(s);
1675 
1676    s.debug_optimizer(nir, "bank_conflict", 96, 1);
1677 
1678    brw_schedule_instructions_post_ra(s);
1679 
1680    s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
1681 
1682    /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
1683     * of part of assign_regs since both bank conflicts optimization and post
1684     * RA scheduling take advantage of distinguishing references to registers
1685     * that were allocated from references that were already fixed.
1686     *
1687     * TODO: Change the passes above, then move this lowering to be part of
1688     * assign_regs.
1689     */
1690    brw_fs_lower_vgrfs_to_fixed_grfs(s);
1691 
1692    s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
1693 
1694    if (s.last_scratch > 0) {
1695       /* We currently only support up to 2MB of scratch space.  If we
1696        * need to support more eventually, the documentation suggests
1697        * that we could allocate a larger buffer, and partition it out
1698        * ourselves.  We'd just have to undo the hardware's address
1699        * calculation by subtracting (FFTID * Per Thread Scratch Space)
1700        * and then add FFTID * (Larger Per Thread Scratch Space).
1701        *
1702        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
1703        * Thread Group Tracking > Local Memory/Scratch Space.
1704        */
1705       if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
1706          /* Take the max of any previously compiled variant of the shader. In the
1707           * case of bindless shaders with return parts, this will also take the
1708           * max of all parts.
1709           */
1710          s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
1711                                            s.prog_data->total_scratch);
1712       } else {
1713          s.fail("Scratch space required is larger than supported");
1714       }
1715    }
1716 
1717    if (s.failed)
1718       return;
1719 
1720    brw_fs_lower_scoreboard(s);
1721 }
1722 
1723 /**
1724  * Move load_interpolated_input with simple (payload-based) barycentric modes
1725  * to the top of the program so we don't emit multiple PLNs for the same input.
1726  *
1727  * This works around CSE not being able to handle non-dominating cases
1728  * such as:
1729  *
1730  *    if (...) {
1731  *       interpolate input
1732  *    } else {
1733  *       interpolate the same exact input
1734  *    }
1735  *
1736  * This should be replaced by global value numbering someday.
1737  */
1738 bool
1739 brw_nir_move_interpolation_to_top(nir_shader *nir)
1740 {
1741    bool progress = false;
1742 
1743    nir_foreach_function_impl(impl, nir) {
1744       nir_block *top = nir_start_block(impl);
1745       nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
1746       bool impl_progress = false;
1747 
1748       for (nir_block *block = nir_block_cf_tree_next(top);
1749            block != NULL;
1750            block = nir_block_cf_tree_next(block)) {
1751 
1752          nir_foreach_instr_safe(instr, block) {
1753             if (instr->type != nir_instr_type_intrinsic)
1754                continue;
1755 
1756             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1757             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
1758                continue;
1759             nir_intrinsic_instr *bary_intrinsic =
1760                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
1761             nir_intrinsic_op op = bary_intrinsic->intrinsic;
1762 
1763             /* Leave interpolateAtSample/Offset() where they are. */
1764             if (op == nir_intrinsic_load_barycentric_at_sample ||
1765                 op == nir_intrinsic_load_barycentric_at_offset)
1766                continue;
1767 
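            /* Hoist the barycentric intrinsic, the input offset source and
             * the load itself, in that order.
             */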
1768             nir_instr *move[3] = {
1769                &bary_intrinsic->instr,
1770                intrin->src[1].ssa->parent_instr,
1771                instr
1772             };
1773 
1774             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
1775                if (move[i]->block != top) {
1776                   nir_instr_move(cursor, move[i]);
1777                   impl_progress = true;
1778                }
1779             }
1780          }
1781       }
1782 
1783       progress = progress || impl_progress;
1784 
1785       nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
1786                                                 : nir_metadata_all);
1787    }
1788 
1789    return progress;
1790 }
1791 
1792 unsigned
1793 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
1794                              unsigned threads)
1795 {
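   /* The cross-thread block is uploaded once per workgroup, while the
    * per-thread block is replicated for each of the 'threads' hardware
    * threads.
    */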
1796    assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
1797    assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
1798    return cs_prog_data->push.per_thread.size * threads +
1799           cs_prog_data->push.cross_thread.size;
1800 }
1801 
1802 static bool
1803 filter_simd(const nir_instr *instr, const void * /* options */)
1804 {
1805    if (instr->type != nir_instr_type_intrinsic)
1806       return false;
1807 
1808    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
1809    case nir_intrinsic_load_simd_width_intel:
1810    case nir_intrinsic_load_subgroup_id:
1811       return true;
1812 
1813    default:
1814       return false;
1815    }
1816 }
1817 
1818 static nir_def *
1819 lower_simd(nir_builder *b, nir_instr *instr, void *options)
1820 {
1821    uintptr_t simd_width = (uintptr_t)options;
1822 
1823    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
1824    case nir_intrinsic_load_simd_width_intel:
1825       return nir_imm_int(b, simd_width);
1826 
1827    case nir_intrinsic_load_subgroup_id:
1828       /* If the whole workgroup fits in one thread, we can lower subgroup_id
1829        * to a constant zero.
1830        */
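      /* For example, an 8x4x1 workgroup (32 invocations) dispatched at
       * SIMD32 runs as a single subgroup, so the id is always 0.
       */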
1831       if (!b->shader->info.workgroup_size_variable) {
1832          unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
1833                                          b->shader->info.workgroup_size[1] *
1834                                          b->shader->info.workgroup_size[2];
1835          if (local_workgroup_size <= simd_width)
1836             return nir_imm_int(b, 0);
1837       }
1838       return NULL;
1839 
1840    default:
1841       return NULL;
1842    }
1843 }
1844 
1845 bool
1846 brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
1847 {
1848    return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
1849                                  (void *)(uintptr_t)dispatch_width);
1850 }
1851 
1852 struct intel_cs_dispatch_info
1853 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
1854                          const struct brw_cs_prog_data *prog_data,
1855                          const unsigned *override_local_size)
1856 {
1857    struct intel_cs_dispatch_info info = {};
1858 
1859    const unsigned *sizes =
1860       override_local_size ? override_local_size :
1861                             prog_data->local_size;
1862 
1863    const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
1864    assert(simd >= 0 && simd < 3);
1865 
1866    info.group_size = sizes[0] * sizes[1] * sizes[2];
1867    info.simd_size = 8u << simd;
1868    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
1869 
1870    const uint32_t remainder = info.group_size & (info.simd_size - 1);
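   /* Channel-enable mask for the rightmost (possibly partial) thread.  For
    * example, a group of 20 invocations at SIMD16 leaves a remainder of 4,
    * giving a mask of 0xf; if the group size divides evenly, all simd_size
    * channels are enabled.
    */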
1871    if (remainder > 0)
1872       info.right_mask = ~0u >> (32 - remainder);
1873    else
1874       info.right_mask = ~0u >> (32 - info.simd_size);
1875 
1876    return info;
1877 }
1878 
1879 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
1880 {
1881    return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
1882 }
1883 
1884 namespace brw {
1885    brw_reg
1886    fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
1887                      brw_reg_type type, unsigned n)
1888    {
1889       if (!regs[0])
1890          return brw_reg();
1891 
1892       if (bld.dispatch_width() > 16) {
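         /* For SIMD32 the payload is supplied as two SIMD16 halves based at
          * GRFs regs[0] and regs[1]; gather each component's halves and
          * stitch them into a single n-component VGRF with LOAD_PAYLOAD.
          */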
1893          const brw_reg tmp = bld.vgrf(type, n);
1894          const brw::fs_builder hbld = bld.exec_all().group(16, 0);
1895          const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1896          brw_reg *const components = new brw_reg[m * n];
1897 
1898          for (unsigned c = 0; c < n; c++) {
1899             for (unsigned g = 0; g < m; g++)
1900                components[c * m + g] =
1901                   offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
1902          }
1903 
1904          hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
1905 
1906          delete[] components;
1907          return tmp;
1908 
1909       } else {
1910          return brw_reg(retype(brw_vec8_grf(regs[0], 0), type));
1911       }
1912    }
1913 
1914    brw_reg
1915    fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
1916    {
1917       if (!regs[0])
1918          return brw_reg();
1919       else if (bld.shader->devinfo->ver >= 20)
1920          return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);
1921 
1922       const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
1923       const brw::fs_builder hbld = bld.exec_all().group(8, 0);
1924       const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1925       brw_reg *const components = new brw_reg[2 * m];
1926 
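      /* Prior to Gfx20 both barycentric components of each SIMD8 half sit
       * next to each other in the payload, hence the regs[g / 2] base and
       * the (c + 2 * (g % 2)) offset; regroup them into component-major
       * order for LOAD_PAYLOAD.
       */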
1927       for (unsigned c = 0; c < 2; c++) {
1928          for (unsigned g = 0; g < m; g++)
1929             components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
1930                                            hbld, c + 2 * (g % 2));
1931       }
1932 
1933       hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
1934 
1935       delete[] components;
1936       return tmp;
1937    }
1938 
1939    void
1940    check_dynamic_msaa_flag(const fs_builder &bld,
1941                            const struct brw_wm_prog_data *wm_prog_data,
1942                            enum intel_msaa_flags flag)
1943    {
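      /* AND the dynamic MSAA flags with the requested bit, writing only the
       * flag register (null destination with a NZ conditional mod) so that a
       * following instruction can be predicated on the result.
       */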
1944       fs_inst *inst = bld.AND(bld.null_reg_ud(),
1945                               dynamic_msaa_flags(wm_prog_data),
1946                               brw_imm_ud(flag));
1947       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1948    }
1949 }
1950 
1951