xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_fs_generator.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file
25  *
26  * This file supports generating the actual native instructions
27  * from the FS LIR.
28  */
29 
30 #include "brw_eu.h"
31 #include "brw_disasm_info.h"
32 #include "brw_fs.h"
33 #include "brw_cfg.h"
34 #include "dev/intel_debug.h"
35 #include "util/mesa-sha1.h"
36 #include "util/half_float.h"
37 
38 static uint32_t
39 brw_math_function(enum opcode op)
40 {
41    switch (op) {
42    case SHADER_OPCODE_RCP:
43       return BRW_MATH_FUNCTION_INV;
44    case SHADER_OPCODE_RSQ:
45       return BRW_MATH_FUNCTION_RSQ;
46    case SHADER_OPCODE_SQRT:
47       return BRW_MATH_FUNCTION_SQRT;
48    case SHADER_OPCODE_EXP2:
49       return BRW_MATH_FUNCTION_EXP;
50    case SHADER_OPCODE_LOG2:
51       return BRW_MATH_FUNCTION_LOG;
52    case SHADER_OPCODE_POW:
53       return BRW_MATH_FUNCTION_POW;
54    case SHADER_OPCODE_SIN:
55       return BRW_MATH_FUNCTION_SIN;
56    case SHADER_OPCODE_COS:
57       return BRW_MATH_FUNCTION_COS;
58    case SHADER_OPCODE_INT_QUOTIENT:
59       return BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
60    case SHADER_OPCODE_INT_REMAINDER:
61       return BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
62    default:
63       unreachable("not reached: unknown math function");
64    }
65 }
66 
67 static struct brw_reg
68 normalize_brw_reg_for_encoding(brw_reg *reg)
69 {
70    struct brw_reg brw_reg;
71 
72    switch (reg->file) {
73    case ARF:
74    case FIXED_GRF:
75    case IMM:
76       assert(reg->offset == 0);
77       brw_reg = *reg;
78       break;
79    case BAD_FILE:
80       /* Probably unused. */
81       brw_reg = brw_null_reg();
82       break;
83    case VGRF:
84    case ATTR:
85    case UNIFORM:
86       unreachable("not reached");
87    }
88 
89    return brw_reg;
90 }
91 
92 fs_generator::fs_generator(const struct brw_compiler *compiler,
93                            const struct brw_compile_params *params,
94                            struct brw_stage_prog_data *prog_data,
95                            gl_shader_stage stage)
96 
97    : compiler(compiler), params(params),
98      devinfo(compiler->devinfo),
99      prog_data(prog_data), dispatch_width(0),
100      debug_flag(false),
101      shader_name(NULL), stage(stage), mem_ctx(params->mem_ctx)
102 {
103    p = rzalloc(mem_ctx, struct brw_codegen);
104    brw_init_codegen(&compiler->isa, p, mem_ctx);
105 }
106 
107 fs_generator::~fs_generator()
108 {
109 }
110 
111 class ip_record : public exec_node {
112 public:
113    DECLARE_RALLOC_CXX_OPERATORS(ip_record)
114 
115    ip_record(int ip)
116    {
117       this->ip = ip;
118    }
119 
120    int ip;
121 };
122 
123 bool
124 fs_generator::patch_halt_jumps()
125 {
126    if (this->discard_halt_patches.is_empty())
127       return false;
128 
129    int scale = brw_jump_scale(p->devinfo);
130 
131    /* There is a somewhat strange undocumented requirement of using
132     * HALT, according to the simulator.  If some channel has HALTed to
133     * a particular UIP, then by the end of the program, every channel
134     * must have HALTed to that UIP.  Furthermore, the tracking is a
135     * stack, so you can't do the final halt of a UIP after starting
136     * halting to a new UIP.
137     *
138     * Symptoms of not emitting this instruction on actual hardware
139     * included GPU hangs and sparkly rendering on the piglit discard
140     * tests.
141     */
142    brw_inst *last_halt = brw_HALT(p);
143    brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
144    brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
145 
146    int ip = p->nr_insn;
147 
148    foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
149       brw_inst *patch = &p->store[patch_ip->ip];
150 
151       assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT);
152       /* HALT takes a half-instruction distance from the pre-incremented IP. */
153       brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
154    }
155 
156    this->discard_halt_patches.make_empty();
157 
158    return true;
159 }
160 
161 void
162 fs_generator::generate_send(fs_inst *inst,
163                             struct brw_reg dst,
164                             struct brw_reg desc,
165                             struct brw_reg ex_desc,
166                             struct brw_reg payload,
167                             struct brw_reg payload2)
168 {
169    const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
170 
171    uint32_t desc_imm = inst->desc |
172       brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
173 
174    uint32_t ex_desc_imm = inst->ex_desc |
175       brw_message_ex_desc(devinfo, inst->ex_mlen);
176 
177    if (ex_desc.file != IMM || ex_desc.ud || ex_desc_imm ||
178        inst->send_ex_desc_scratch) {
179       /* If we have any sort of extended descriptor, then we need SENDS.  This
180        * also covers the dual-payload case because ex_mlen goes in ex_desc.
181        */
182       brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
183                                       desc, desc_imm, ex_desc, ex_desc_imm,
184                                       inst->send_ex_desc_scratch,
185                                       inst->send_ex_bso, inst->eot);
186       if (inst->check_tdr)
187          brw_inst_set_opcode(p->isa, brw_last_inst,
188                              devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
189    } else {
190       brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
191                                    inst->eot);
192       if (inst->check_tdr)
193          brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
194    }
195 }
196 
197 void
198 fs_generator::generate_mov_indirect(fs_inst *inst,
199                                     struct brw_reg dst,
200                                     struct brw_reg reg,
201                                     struct brw_reg indirect_byte_offset)
202 {
203    assert(indirect_byte_offset.type == BRW_TYPE_UD);
204    assert(indirect_byte_offset.file == FIXED_GRF);
205    assert(!reg.abs && !reg.negate);
206 
207    /* Gen12.5 adds the following region restriction:
208     *
209     *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
210     *    and Quad-Word data must not be used."
211     *
212     * We require the source and destination types to match so stomp to an
213     * unsigned integer type.
214     */
215    assert(reg.type == dst.type);
216    reg.type = dst.type =
217       brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(reg.type));
218 
219    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
220 
221    if (indirect_byte_offset.file == IMM) {
222       imm_byte_offset += indirect_byte_offset.ud;
223 
224       reg.nr = imm_byte_offset / REG_SIZE;
225       reg.subnr = imm_byte_offset % REG_SIZE;
226       if (brw_type_size_bytes(reg.type) > 4 && !devinfo->has_64bit_int) {
227          brw_MOV(p, subscript(dst, BRW_TYPE_D, 0),
228                     subscript(reg, BRW_TYPE_D, 0));
229          brw_set_default_swsb(p, tgl_swsb_null());
230          brw_MOV(p, subscript(dst, BRW_TYPE_D, 1),
231                     subscript(reg, BRW_TYPE_D, 1));
232       } else {
233          brw_MOV(p, dst, reg);
234       }
235    } else {
236       /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
237       struct brw_reg addr = vec8(brw_address_reg(0));
238 
239       /* Whether we can use destination dependency control without running the
240        * risk of a hang if an instruction gets shot down.
241        */
242       const bool use_dep_ctrl = !inst->predicate &&
243                                 inst->exec_size == dispatch_width;
244       brw_inst *insn;
245 
246       /* The destination stride of an instruction (in bytes) must be greater
247        * than or equal to the size of the rest of the instruction.  Since the
248        * address register is of type UW, we can't use a D-type instruction.
249        * In order to get around this, we retype to UW and use a stride.
250        */
251       indirect_byte_offset =
252          retype(spread(indirect_byte_offset, 2), BRW_TYPE_UW);
253 
254       /* There are a number of reasons why we don't use the base offset here.
255        * One reason is that the field is only 9 bits which means we can only
256        * use it to access the first 16 GRFs.  Also, from the Haswell PRM
257        * section "Register Region Restrictions":
258        *
259        *    "The lower bits of the AddressImmediate must not overflow to
260        *    change the register address.  The lower 5 bits of Address
261        *    Immediate when added to lower 5 bits of address register gives
262        *    the sub-register offset. The upper bits of Address Immediate
263        *    when added to upper bits of address register gives the register
264        *    address. Any overflow from sub-register offset is dropped."
265        *
266        * Since the indirect may cause us to cross a register boundary, this
267        * makes the base offset almost useless.  We could try and do something
268        * clever where we use an actual base offset if base_offset % 32 == 0 but
269        * that would mean we were generating different code depending on the
270        * base offset.  Instead, for the sake of consistency, we'll just do the
271        * add ourselves.  This restriction is only listed in the Haswell PRM
272        * but empirical testing indicates that it applies on all older
273        * generations and is lifted on Broadwell.
274        *
275        * In the end, while base_offset is nice to look at in the generated
276        * code, using it saves us 0 instructions and would require quite a bit
277        * of case-by-case work.  It's just not worth it.
278        *
279        * Due to a hardware bug some platforms (particularly Gfx11+) seem to
280        * require the address components of all channels to be valid whether or
281        * not they're active, which causes issues if we use VxH addressing
282        * under non-uniform control-flow.  We can easily work around that by
283        * initializing the whole address register with a pipelined NoMask MOV
284        * instruction.
285        */
286       insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
287       brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
288       brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
289       if (devinfo->ver >= 12)
290          brw_set_default_swsb(p, tgl_swsb_null());
291       else
292          brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
293 
294       insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
295       if (devinfo->ver >= 12)
296          brw_set_default_swsb(p, tgl_swsb_regdist(1));
297       else
298          brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
299 
300       if (brw_type_size_bytes(reg.type) > 4 &&
301           (intel_device_info_is_9lp(devinfo) || !devinfo->has_64bit_int)) {
302          /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
303           *
304           *   "When source or destination datatype is 64b or operation is
305           *    integer DWord multiply, indirect addressing must not be used."
306           *
307           * We may also not support Q/UQ types.
308           *
309           * To work around both of these, we do two integer MOVs instead
310           * of one 64-bit MOV.  Because no double value should ever cross
311           * a register boundary, it's safe to use the immediate offset in
312           * the indirect here to handle adding 4 bytes to the offset and
313           * avoid the extra ADD to the register file.
314           */
315          brw_MOV(p, subscript(dst, BRW_TYPE_D, 0),
316                     retype(brw_VxH_indirect(0, 0), BRW_TYPE_D));
317          brw_set_default_swsb(p, tgl_swsb_null());
318          brw_MOV(p, subscript(dst, BRW_TYPE_D, 1),
319                     retype(brw_VxH_indirect(0, 4), BRW_TYPE_D));
320       } else {
321          struct brw_reg ind_src = brw_VxH_indirect(0, 0);
322 
323          brw_MOV(p, dst, retype(ind_src, reg.type));
324       }
325    }
326 }
327 
328 void
329 fs_generator::generate_shuffle(fs_inst *inst,
330                                struct brw_reg dst,
331                                struct brw_reg src,
332                                struct brw_reg idx)
333 {
334    assert(src.file == FIXED_GRF);
335    assert(!src.abs && !src.negate);
336 
337    /* Ivy bridge has some strange behavior that makes this a real pain to
338     * implement for 64-bit values so we just don't bother.
339     */
340    assert(devinfo->has_64bit_float || brw_type_size_bytes(src.type) <= 4);
341 
342    /* Gen12.5 adds the following region restriction:
343     *
344     *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
345     *    and Quad-Word data must not be used."
346     *
347     * We require the source and destination types to match so stomp to an
348     * unsigned integer type.
349     */
350    assert(src.type == dst.type);
351    src.type = dst.type =
352       brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(src.type));
353 
354    /* Because we're using the address register, we're limited to 16-wide
355     * by the address register file and 8-wide for 64-bit types.  We could try
356     * and make this instruction splittable higher up in the compiler but that
357     * gets weird because it reads all of the channels regardless of execution
358     * size.  It's easier just to split it here.
359     */
360    unsigned lower_width = MIN2(16, inst->exec_size);
361    if (devinfo->ver < 20 && (element_sz(src) > 4 || element_sz(dst) > 4)) {
362       lower_width = 8;
363    }
364 
365    brw_set_default_exec_size(p, cvt(lower_width) - 1);
366    for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
367       brw_set_default_group(p, group);
368 
369       if ((src.vstride == 0 && src.hstride == 0) ||
370           idx.file == IMM) {
371          /* Trivial, the source is already uniform or the index is a constant.
372           * We will typically not get here if the optimizer is doing its job,
373           * but asserting would be mean.
374           */
375          const unsigned i = idx.file == IMM ? idx.ud : 0;
376          struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
377          struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1));
378          brw_MOV(p, group_dst, group_src);
379       } else {
380          /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
381          struct brw_reg addr = vec8(brw_address_reg(0));
382 
383          struct brw_reg group_idx = suboffset(idx, group);
384 
385          if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
386             /* Things get grumpy if the register is too wide. */
387             group_idx.width--;
388             group_idx.vstride--;
389          }
390 
391          assert(brw_type_size_bytes(group_idx.type) <= 4);
392          if (brw_type_size_bytes(group_idx.type) == 4) {
393             /* The destination stride of an instruction (in bytes) must be
394              * greater than or equal to the size of the rest of the
395              * instruction.  Since the address register is of type UW, we
396              * can't use a D-type instruction.  In order to get around this,
397              * re retype to UW and use a stride.
398              * we retype to UW and use a stride.
399             group_idx = retype(spread(group_idx, 2), BRW_TYPE_W);
400          }
401 
402          uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;
403 
404          /* From the Haswell PRM:
405           *
406           *    "When a sequence of NoDDChk and NoDDClr are used, the last
407           *    instruction that completes the scoreboard clear must have a
408           *    non-zero execution mask. This means, if any kind of predication
409           *    can change the execution mask or channel enable of the last
410           *    instruction, the optimization must be avoided.  This is to
411           *    avoid instructions being shot down the pipeline when no writes
412           *    are required."
413           *
414           * Whenever predication is enabled or the instructions being emitted
415           * aren't the full width, it's possible that it will be run with zero
416           * channels enabled so we can't use dependency control without
417           * running the risk of a hang if an instruction gets shot down.
418           */
419          const bool use_dep_ctrl = !inst->predicate &&
420                                    lower_width == dispatch_width;
421          brw_inst *insn;
422 
423          /* Due to a hardware bug some platforms (particularly Gfx11+) seem
424           * to require the address components of all channels to be valid
425           * whether or not they're active, which causes issues if we use VxH
426           * addressing under non-uniform control-flow.  We can easily work
427           * around that by initializing the whole address register with a
428           * pipelined NoMask MOV instruction.
429           */
430          insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
431          brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
432          brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
433          if (devinfo->ver >= 12)
434             brw_set_default_swsb(p, tgl_swsb_null());
435          else
436             brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
437 
438          /* Take into account the component size and horizontal stride. */
439          assert(src.vstride == src.hstride + src.width);
440          insn = brw_SHL(p, addr, group_idx,
441                         brw_imm_uw(util_logbase2(brw_type_size_bytes(src.type)) +
442                                    src.hstride - 1));
443          if (devinfo->ver >= 12)
444             brw_set_default_swsb(p, tgl_swsb_regdist(1));
445          else
446             brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
447 
448          /* Add on the register start offset */
449          brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));
450          brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)),
451                  retype(brw_VxH_indirect(0, 0), src.type));
452       }
453 
454       brw_set_default_swsb(p, tgl_swsb_null());
455    }
456 }
457 
458 void
459 fs_generator::generate_quad_swizzle(const fs_inst *inst,
460                                     struct brw_reg dst, struct brw_reg src,
461                                     unsigned swiz)
462 {
463    /* Requires a quad. */
464    assert(inst->exec_size >= 4);
465 
466    if (src.file == IMM ||
467        has_scalar_region(src)) {
468       /* The value is uniform across all channels */
469       brw_MOV(p, dst, src);
470 
471    } else if (devinfo->ver < 11 && brw_type_size_bytes(src.type) == 4) {
472       /* This only works on 8-wide 32-bit values */
473       assert(inst->exec_size == 8);
474       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
475       assert(src.vstride == src.width + 1);
476       brw_set_default_access_mode(p, BRW_ALIGN_16);
477       struct brw_reg swiz_src = stride(src, 4, 4, 1);
478       swiz_src.swizzle = swiz;
479       brw_MOV(p, dst, swiz_src);
480 
481    } else {
482       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
483       assert(src.vstride == src.width + 1);
484       const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
485 
486       switch (swiz) {
487       case BRW_SWIZZLE_XXXX:
488       case BRW_SWIZZLE_YYYY:
489       case BRW_SWIZZLE_ZZZZ:
490       case BRW_SWIZZLE_WWWW:
491          brw_MOV(p, dst, stride(src_0, 4, 4, 0));
492          break;
493 
494       case BRW_SWIZZLE_XXZZ:
495       case BRW_SWIZZLE_YYWW:
496          brw_MOV(p, dst, stride(src_0, 2, 2, 0));
497          break;
498 
499       case BRW_SWIZZLE_XYXY:
500       case BRW_SWIZZLE_ZWZW:
501          assert(inst->exec_size == 4);
502          brw_MOV(p, dst, stride(src_0, 0, 2, 1));
503          break;
504 
505       default:
506          assert(inst->force_writemask_all);
507          brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
508 
509          for (unsigned c = 0; c < 4; c++) {
510             brw_inst *insn = brw_MOV(
511                p, stride(suboffset(dst, c),
512                          4 * inst->dst.stride, 1, 4 * inst->dst.stride),
513                stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
514 
515             if (devinfo->ver < 12) {
516                brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
517                brw_inst_set_no_dd_check(devinfo, insn, c > 0);
518             }
519 
520             brw_set_default_swsb(p, tgl_swsb_null());
521          }
522 
523          break;
524       }
525    }
526 }
527 
528 void
529 fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
530 {
531    brw_barrier(p, src);
532    if (devinfo->ver >= 12) {
533       brw_set_default_swsb(p, tgl_swsb_null());
534       brw_SYNC(p, TGL_SYNC_BAR);
535    } else {
536       brw_WAIT(p);
537    }
538 }
539 
540 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
541  * looking like:
542  *
543  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
544  *
545  * Ideally, we want to produce:
546  *
547  *           DDX                     DDY
548  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
549  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
550  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
551  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
552  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
553  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
554  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
555  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
556  *
557  * and add another set of two more subspans if in 16-pixel dispatch mode.
558  *
559  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
560  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
561  * pair.  But the ideal approximation may impose a huge performance cost on
562  * sample_d.  On at least Haswell, sample_d instruction does some
563  * sample_d.  On at least Haswell, the sample_d instruction does some
564  *
565  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
566  * appropriate swizzling.
567  */
568 void
569 fs_generator::generate_ddx(const fs_inst *inst,
570                            struct brw_reg dst, struct brw_reg src)
571 {
572    unsigned vstride, width;
573 
574    if (inst->opcode == FS_OPCODE_DDX_FINE) {
575       /* produce accurate derivatives */
576       vstride = BRW_VERTICAL_STRIDE_2;
577       width = BRW_WIDTH_2;
578    } else {
579       /* replicate the derivative at the top-left pixel to other pixels */
580       vstride = BRW_VERTICAL_STRIDE_4;
581       width = BRW_WIDTH_4;
582    }
583 
584    struct brw_reg src0 = byte_offset(src, brw_type_size_bytes(src.type));
585    struct brw_reg src1 = src;
586 
587    src0.vstride = vstride;
588    src0.width   = width;
589    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
590    src1.vstride = vstride;
591    src1.width   = width;
592    src1.hstride = BRW_HORIZONTAL_STRIDE_0;
593 
594    brw_ADD(p, dst, src0, negate(src1));
595 }
596 
597 /* The negate_value boolean is used to negate the derivative computation for
598  * FBOs, since they place the origin at the upper left instead of the lower
599  * left.
600  */
601 void
602 fs_generator::generate_ddy(const fs_inst *inst,
603                            struct brw_reg dst, struct brw_reg src)
604 {
605    const uint32_t type_size = brw_type_size_bytes(src.type);
606 
607    if (inst->opcode == FS_OPCODE_DDY_FINE) {
608       /* produce accurate derivatives.
609        *
610        * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
611        * "Register Region Restrictions", Section "1. Special Restrictions":
612        *
613        *    "In Align16 mode, the channel selects and channel enables apply to
614        *     a pair of half-floats, because these parameters are defined for
615        *     DWord elements ONLY. This is applicable when both source and
616        *     destination are half-floats."
617        *
618        * So for half-float operations we use the Gfx11+ Align1 path. CHV
619        * inherits its FP16 hardware from SKL, so it is not affected.
620        */
621       if (devinfo->ver >= 11) {
622          src = stride(src, 0, 2, 1);
623 
624          brw_push_insn_state(p);
625          brw_set_default_exec_size(p, BRW_EXECUTE_4);
626          for (uint32_t g = 0; g < inst->exec_size; g += 4) {
627             brw_set_default_group(p, inst->group + g);
628             brw_ADD(p, byte_offset(dst, g * type_size),
629                        negate(byte_offset(src,  g * type_size)),
630                        byte_offset(src, (g + 2) * type_size));
631             brw_set_default_swsb(p, tgl_swsb_null());
632          }
633          brw_pop_insn_state(p);
634       } else {
635          struct brw_reg src0 = stride(src, 4, 4, 1);
636          struct brw_reg src1 = stride(src, 4, 4, 1);
637          src0.swizzle = BRW_SWIZZLE_XYXY;
638          src1.swizzle = BRW_SWIZZLE_ZWZW;
639 
640          brw_push_insn_state(p);
641          brw_set_default_access_mode(p, BRW_ALIGN_16);
642          brw_ADD(p, dst, negate(src0), src1);
643          brw_pop_insn_state(p);
644       }
645    } else {
646       /* replicate the derivative at the top-left pixel to other pixels */
647       struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
648       struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
649 
650       brw_ADD(p, dst, negate(src0), src1);
651    }
652 }
653 
654 void
655 fs_generator::generate_halt(fs_inst *)
656 {
657    /* This HALT will be patched up at FB write time to point UIP at the end of
658     * the program, and at brw_uip_jip() JIP will be set to the end of the
659     * current block (or the program).
660     */
661    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
662    brw_HALT(p);
663 }
664 
665 /* The A32 messages take a buffer base address in header.5:[31:0] (See
666  * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
667  * and OWord block messages in the SKL PRM Vol. 2d for more details.)
668  * Unfortunately, there are a number of subtle differences:
669  *
670  * For the block read/write messages:
671  *
672  *   - We always stomp header.2 to fill in the actual scratch address (in
673  *     units of OWORDs) so we don't care what's in there.
674  *
675  *   - They rely on per-thread scratch space value in header.3[3:0] to do
676  *     bounds checking so that needs to be valid.  The upper bits of
677  *     header.3 are ignored, though, so we can copy all of g0.3.
678  *
679  *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
680  *
681  *
682  * For the byte/dword scattered read/write messages:
683  *
684  *   - We want header.2 to be zero because that gets added to the per-channel
685  *     offset in the non-header portion of the message.
686  *
687  *   - Contrary to what the docs claim, they don't do any bounds checking so
688  *     the value of header.3[3:0] doesn't matter.
689  *
690  *   - They consider all of header.5 for the base address and header.5[9:0]
691  *     are not ignored.  This means that we can't copy g0.5 verbatim because
692  *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
693  *     use an AND to mask off the bottom 10 bits.
694  *
695  *
696  * For block messages, just copying g0 gives a valid header because all the
697  * garbage gets ignored except for header.2 which we stomp as part of message
698  * setup.  For byte/dword scattered messages, we can just zero out the header
699  * and copy over the bits we need from g0.5.  This opcode, however, tries to
700  * satisfy the requirements of both by starting with 0 and filling out the
701  * information required by either set of opcodes.
702  */
703 void
704 fs_generator::generate_scratch_header(fs_inst *inst,
705                                       struct brw_reg dst,
706                                       struct brw_reg src)
707 {
708    assert(inst->exec_size == 8 && inst->force_writemask_all);
709    assert(dst.file == FIXED_GRF);
710    assert(src.file == FIXED_GRF);
711    assert(src.type == BRW_TYPE_UD);
712 
713    dst.type = BRW_TYPE_UD;
714 
715    brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
716    if (devinfo->ver >= 12)
717       brw_set_default_swsb(p, tgl_swsb_null());
718    else
719       brw_inst_set_no_dd_clear(p->devinfo, insn, true);
720 
721    /* Copy the per-thread scratch space size from g0.3[3:0] */
722    brw_set_default_exec_size(p, BRW_EXECUTE_1);
723    insn = brw_AND(p, suboffset(dst, 3), component(src, 3),
724                      brw_imm_ud(INTEL_MASK(3, 0)));
725    if (devinfo->ver < 12) {
726       brw_inst_set_no_dd_clear(p->devinfo, insn, true);
727       brw_inst_set_no_dd_check(p->devinfo, insn, true);
728    }
729 
730    /* Copy the scratch base address from g0.5[31:10] */
731    insn = brw_AND(p, suboffset(dst, 5), component(src, 5),
732                      brw_imm_ud(INTEL_MASK(31, 10)));
733    if (devinfo->ver < 12)
734       brw_inst_set_no_dd_check(p->devinfo, insn, true);
735 }
736 
737 void
738 fs_generator::enable_debug(const char *shader_name)
739 {
740    debug_flag = true;
741    this->shader_name = shader_name;
742 }
743 
744 static gfx12_systolic_depth
745 translate_systolic_depth(unsigned d)
746 {
747    /* Could also return (ffs(d) - 1) & 3. */
748    switch (d) {
749    case 2:  return BRW_SYSTOLIC_DEPTH_2;
750    case 4:  return BRW_SYSTOLIC_DEPTH_4;
751    case 8:  return BRW_SYSTOLIC_DEPTH_8;
752    case 16: return BRW_SYSTOLIC_DEPTH_16;
753    default: unreachable("Invalid systolic depth.");
754    }
755 }
756 
757 int
758 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
759                             struct shader_stats shader_stats,
760                             const brw::performance &perf,
761                             struct brw_compile_stats *stats,
762                             unsigned max_polygons)
763 {
764    /* align to 64 byte boundary. */
765    brw_realign(p, 64);
766 
767    this->dispatch_width = dispatch_width;
768 
769    int start_offset = p->next_insn_offset;
770 
771    int loop_count = 0, send_count = 0, nop_count = 0, sync_nop_count = 0;
772    bool is_accum_used = false;
773 
774    struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg);
775 
776    foreach_block_and_inst (block, fs_inst, inst, cfg) {
777       if (inst->opcode == SHADER_OPCODE_UNDEF)
778          continue;
779 
780       struct brw_reg src[4], dst;
781       unsigned int last_insn_offset = p->next_insn_offset;
782       bool multiple_instructions_emitted = false;
783       tgl_swsb swsb = inst->sched;
784 
785       /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
786        * "Register Region Restrictions" section: for BDW, SKL:
787        *
788        *    "A POW/FDIV operation must not be followed by an instruction
789        *     that requires two destination registers."
790        *
791        * The documentation is often lacking annotations for Atom parts,
792        * and empirically this affects CHV as well.
793        */
794       if (devinfo->ver <= 9 &&
795           p->nr_insn > 1 &&
796           brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH &&
797           brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
798           inst->dst.component_size(inst->exec_size) > REG_SIZE) {
799          brw_NOP(p);
800          last_insn_offset = p->next_insn_offset;
801 
802          /* In order to avoid spurious instruction count differences when the
803           * instruction schedule changes, keep track of the number of inserted
804           * NOPs.
805           */
806          nop_count++;
807       }
808 
809       /* Wa_14010017096:
810        *
811        * Clear accumulator register before end of thread.
812        */
813       if (inst->eot && is_accum_used &&
814           intel_needs_workaround(devinfo, 14010017096)) {
815          brw_set_default_exec_size(p, BRW_EXECUTE_16);
816          brw_set_default_group(p, 0);
817          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
818          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
819          brw_set_default_flag_reg(p, 0, 0);
820          brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
821          brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
822          last_insn_offset = p->next_insn_offset;
823          swsb = tgl_swsb_dst_dep(swsb, 1);
824       }
825 
826       if (!is_accum_used && !inst->eot) {
827          is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
828                          inst->dst.is_accumulator();
829       }
830 
831       /* Wa_14013672992:
832        *
833        * Always use @1 SWSB for EOT.
834        */
835       if (inst->eot && intel_needs_workaround(devinfo, 14013672992)) {
836          if (tgl_swsb_src_dep(swsb).mode) {
837             brw_set_default_exec_size(p, BRW_EXECUTE_1);
838             brw_set_default_mask_control(p, BRW_MASK_DISABLE);
839             brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
840             brw_set_default_flag_reg(p, 0, 0);
841             brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
842             brw_SYNC(p, TGL_SYNC_NOP);
843             last_insn_offset = p->next_insn_offset;
844          }
845 
846          swsb = tgl_swsb_dst_dep(swsb, 1);
847       }
848 
849       if (unlikely(debug_flag))
850          disasm_annotate(disasm_info, inst, p->next_insn_offset);
851 
852       if (devinfo->ver >= 20 && inst->group % 8 != 0) {
853          assert(inst->force_writemask_all);
854          assert(!inst->predicate && !inst->conditional_mod);
855          assert(!inst->writes_accumulator_implicitly(devinfo) &&
856                 !inst->reads_accumulator_implicitly());
857          assert(inst->opcode != SHADER_OPCODE_SEL_EXEC);
858          brw_set_default_group(p, 0);
859       } else {
860          brw_set_default_group(p, inst->group);
861       }
862 
863       for (unsigned int i = 0; i < inst->sources; i++) {
864          src[i] = normalize_brw_reg_for_encoding(&inst->src[i]);
865 	 /* The accumulator result appears to get used for the
866 	  * conditional modifier generation.  When negating a UD
867 	  * value, there is a 33rd bit generated for the sign in the
868 	  * accumulator value, so now you can't check, for example,
869 	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
870 	  */
871 	 assert(!inst->conditional_mod ||
872 		inst->src[i].type != BRW_TYPE_UD ||
873 		!inst->src[i].negate);
874       }
875       dst = normalize_brw_reg_for_encoding(&inst->dst);
876 
877       brw_set_default_access_mode(p, BRW_ALIGN_1);
878       brw_set_default_predicate_control(p, inst->predicate);
879       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
880       /* On gfx7 and above, hardware automatically adds the group onto the
881        * flag subregister number.
882        */
883       const unsigned flag_subreg = inst->flag_subreg;
884       brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
885       brw_set_default_saturate(p, inst->saturate);
886       brw_set_default_mask_control(p, inst->force_writemask_all);
887       if (devinfo->ver >= 20 && inst->writes_accumulator) {
888          assert(inst->dst.is_accumulator() ||
889                 inst->opcode == BRW_OPCODE_ADDC ||
890                 inst->opcode == BRW_OPCODE_MACH ||
891                 inst->opcode == BRW_OPCODE_SUBB);
892       } else {
893          brw_set_default_acc_write_control(p, inst->writes_accumulator);
894       }
895       brw_set_default_swsb(p, swsb);
896 
897       unsigned exec_size = inst->exec_size;
898 
899       brw_set_default_exec_size(p, cvt(exec_size) - 1);
900 
901       assert(inst->force_writemask_all || inst->exec_size >= 4);
902       assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
903       assert(inst->mlen <= BRW_MAX_MSG_LENGTH * reg_unit(devinfo));
904 
905       switch (inst->opcode) {
906       case BRW_OPCODE_NOP:
907          brw_NOP(p);
908          break;
909       case BRW_OPCODE_SYNC:
910          assert(src[0].file == IMM);
911          brw_SYNC(p, tgl_sync_function(src[0].ud));
912 
913          if (tgl_sync_function(src[0].ud) == TGL_SYNC_NOP)
914             ++sync_nop_count;
915 
916          break;
917       case BRW_OPCODE_MOV:
918 	 brw_MOV(p, dst, src[0]);
919 	 break;
920       case BRW_OPCODE_ADD:
921 	 brw_ADD(p, dst, src[0], src[1]);
922 	 break;
923       case BRW_OPCODE_MUL:
924 	 brw_MUL(p, dst, src[0], src[1]);
925 	 break;
926       case BRW_OPCODE_AVG:
927 	 brw_AVG(p, dst, src[0], src[1]);
928 	 break;
929       case BRW_OPCODE_MACH:
930 	 brw_MACH(p, dst, src[0], src[1]);
931 	 break;
932 
933       case BRW_OPCODE_DP4A:
934          assert(devinfo->ver >= 12);
935          brw_DP4A(p, dst, src[0], src[1], src[2]);
936          break;
937 
938       case BRW_OPCODE_LINE:
939          brw_LINE(p, dst, src[0], src[1]);
940          break;
941 
942       case BRW_OPCODE_DPAS:
943          assert(devinfo->verx10 >= 125);
944          brw_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount,
945                   dst, src[0], src[1], src[2]);
946          break;
947 
948       case BRW_OPCODE_MAD:
949          if (devinfo->ver < 10)
950             brw_set_default_access_mode(p, BRW_ALIGN_16);
951          brw_MAD(p, dst, src[0], src[1], src[2]);
952 	 break;
953 
954       case BRW_OPCODE_LRP:
955          assert(devinfo->ver <= 10);
956          if (devinfo->ver < 10)
957             brw_set_default_access_mode(p, BRW_ALIGN_16);
958          brw_LRP(p, dst, src[0], src[1], src[2]);
959 	 break;
960 
961       case BRW_OPCODE_ADD3:
962          assert(devinfo->verx10 >= 125);
963          brw_ADD3(p, dst, src[0], src[1], src[2]);
964          break;
965 
966       case BRW_OPCODE_FRC:
967 	 brw_FRC(p, dst, src[0]);
968 	 break;
969       case BRW_OPCODE_RNDD:
970 	 brw_RNDD(p, dst, src[0]);
971 	 break;
972       case BRW_OPCODE_RNDE:
973 	 brw_RNDE(p, dst, src[0]);
974 	 break;
975       case BRW_OPCODE_RNDZ:
976 	 brw_RNDZ(p, dst, src[0]);
977 	 break;
978 
979       case BRW_OPCODE_AND:
980 	 brw_AND(p, dst, src[0], src[1]);
981 	 break;
982       case BRW_OPCODE_OR:
983 	 brw_OR(p, dst, src[0], src[1]);
984 	 break;
985       case BRW_OPCODE_XOR:
986 	 brw_XOR(p, dst, src[0], src[1]);
987 	 break;
988       case BRW_OPCODE_NOT:
989 	 brw_NOT(p, dst, src[0]);
990 	 break;
991       case BRW_OPCODE_ASR:
992 	 brw_ASR(p, dst, src[0], src[1]);
993 	 break;
994       case BRW_OPCODE_SHR:
995 	 brw_SHR(p, dst, src[0], src[1]);
996 	 break;
997       case BRW_OPCODE_SHL:
998 	 brw_SHL(p, dst, src[0], src[1]);
999 	 break;
1000       case BRW_OPCODE_ROL:
1001 	 assert(devinfo->ver >= 11);
1002 	 assert(src[0].type == dst.type);
1003 	 brw_ROL(p, dst, src[0], src[1]);
1004 	 break;
1005       case BRW_OPCODE_ROR:
1006 	 assert(devinfo->ver >= 11);
1007 	 assert(src[0].type == dst.type);
1008 	 brw_ROR(p, dst, src[0], src[1]);
1009 	 break;
1010       case BRW_OPCODE_CMP:
1011          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1012 	 break;
1013       case BRW_OPCODE_CMPN:
1014          brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
1015          break;
1016       case BRW_OPCODE_SEL:
1017 	 brw_SEL(p, dst, src[0], src[1]);
1018 	 break;
1019       case BRW_OPCODE_CSEL:
1020          if (devinfo->ver < 10)
1021             brw_set_default_access_mode(p, BRW_ALIGN_16);
1022          brw_CSEL(p, dst, src[0], src[1], src[2]);
1023          break;
1024       case BRW_OPCODE_BFREV:
1025          brw_BFREV(p, retype(dst, BRW_TYPE_UD), retype(src[0], BRW_TYPE_UD));
1026          break;
1027       case BRW_OPCODE_FBH:
1028          brw_FBH(p, retype(dst, src[0].type), src[0]);
1029          break;
1030       case BRW_OPCODE_FBL:
1031          brw_FBL(p, retype(dst, BRW_TYPE_UD), retype(src[0], BRW_TYPE_UD));
1032          break;
1033       case BRW_OPCODE_LZD:
1034          brw_LZD(p, dst, src[0]);
1035          break;
1036       case BRW_OPCODE_CBIT:
1037          brw_CBIT(p, retype(dst, BRW_TYPE_UD), retype(src[0], BRW_TYPE_UD));
1038          break;
1039       case BRW_OPCODE_ADDC:
1040          brw_ADDC(p, dst, src[0], src[1]);
1041          break;
1042       case BRW_OPCODE_SUBB:
1043          brw_SUBB(p, dst, src[0], src[1]);
1044          break;
1045       case BRW_OPCODE_MAC:
1046          brw_MAC(p, dst, src[0], src[1]);
1047          break;
1048 
1049       case BRW_OPCODE_BFE:
1050          if (devinfo->ver < 10)
1051             brw_set_default_access_mode(p, BRW_ALIGN_16);
1052          brw_BFE(p, dst, src[0], src[1], src[2]);
1053          break;
1054 
1055       case BRW_OPCODE_BFI1:
1056          brw_BFI1(p, dst, src[0], src[1]);
1057          break;
1058       case BRW_OPCODE_BFI2:
1059          if (devinfo->ver < 10)
1060             brw_set_default_access_mode(p, BRW_ALIGN_16);
1061          brw_BFI2(p, dst, src[0], src[1], src[2]);
1062          break;
1063 
1064       case BRW_OPCODE_IF:
1065          brw_IF(p, brw_get_default_exec_size(p));
1066 	 break;
1067 
1068       case BRW_OPCODE_ELSE:
1069 	 brw_ELSE(p);
1070 	 break;
1071       case BRW_OPCODE_ENDIF:
1072 	 brw_ENDIF(p);
1073 	 break;
1074 
1075       case BRW_OPCODE_DO:
1076 	 brw_DO(p, brw_get_default_exec_size(p));
1077 	 break;
1078 
1079       case BRW_OPCODE_BREAK:
1080 	 brw_BREAK(p);
1081 	 break;
1082       case BRW_OPCODE_CONTINUE:
1083          brw_CONT(p);
1084 	 break;
1085 
1086       case BRW_OPCODE_WHILE:
1087 	 brw_WHILE(p);
1088          loop_count++;
1089 	 break;
1090 
1091       case SHADER_OPCODE_RCP:
1092       case SHADER_OPCODE_RSQ:
1093       case SHADER_OPCODE_SQRT:
1094       case SHADER_OPCODE_EXP2:
1095       case SHADER_OPCODE_LOG2:
1096       case SHADER_OPCODE_SIN:
1097       case SHADER_OPCODE_COS:
1098          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1099          assert(inst->mlen == 0);
1100          gfx6_math(p, dst, brw_math_function(inst->opcode),
1101                    src[0], retype(brw_null_reg(), src[0].type));
1102 	 break;
1103       case SHADER_OPCODE_INT_QUOTIENT:
1104       case SHADER_OPCODE_INT_REMAINDER:
1105       case SHADER_OPCODE_POW:
1106          assert(devinfo->verx10 < 125);
1107          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1108          assert(inst->mlen == 0);
1109          assert(inst->opcode == SHADER_OPCODE_POW || inst->exec_size == 8);
1110          gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1111 	 break;
1112       case BRW_OPCODE_PLN:
1113          /* PLN reads:
1114           *                      /   in SIMD16   \
1115           *    -----------------------------------
1116           *   | src1+0 | src1+1 | src1+2 | src1+3 |
1117           *   |-----------------------------------|
1118           *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
1119           *    -----------------------------------
1120           */
1121          brw_PLN(p, dst, src[0], src[1]);
1122 	 break;
1123       case FS_OPCODE_PIXEL_X:
1124          assert(src[0].type == BRW_TYPE_UW);
1125          assert(src[1].type == BRW_TYPE_UW);
1126          src[0].subnr = 0 * brw_type_size_bytes(src[0].type);
1127          if (src[1].file == IMM) {
1128             assert(src[1].ud == 0);
1129             brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1130          } else {
1131             /* Coarse pixel case */
1132             brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
1133          }
1134          break;
1135       case FS_OPCODE_PIXEL_Y:
1136          assert(src[0].type == BRW_TYPE_UW);
1137          assert(src[1].type == BRW_TYPE_UW);
1138          src[0].subnr = 4 * brw_type_size_bytes(src[0].type);
1139          if (src[1].file == IMM) {
1140             assert(src[1].ud == 0);
1141             brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1142          } else {
1143             /* Coarse pixel case */
1144             brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
1145          }
1146          break;
1147 
1148       case SHADER_OPCODE_SEND:
1149          generate_send(inst, dst, src[0], src[1], src[2],
1150                        inst->ex_mlen > 0 ? src[3] : brw_null_reg());
1151          send_count++;
1152          break;
1153 
1154       case FS_OPCODE_DDX_COARSE:
1155       case FS_OPCODE_DDX_FINE:
1156          generate_ddx(inst, dst, src[0]);
1157          break;
1158       case FS_OPCODE_DDY_COARSE:
1159       case FS_OPCODE_DDY_FINE:
1160          generate_ddy(inst, dst, src[0]);
1161 	 break;
1162 
1163       case SHADER_OPCODE_SCRATCH_HEADER:
1164          generate_scratch_header(inst, dst, src[0]);
1165          break;
1166 
1167       case SHADER_OPCODE_MOV_INDIRECT:
1168          generate_mov_indirect(inst, dst, src[0], src[1]);
1169          break;
1170 
1171       case SHADER_OPCODE_MOV_RELOC_IMM:
1172          assert(src[0].file == IMM);
1173          assert(src[1].file == IMM);
1174          brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud, src[1].ud);
1175          break;
1176 
1177       case BRW_OPCODE_HALT:
1178          generate_halt(inst);
1179          break;
1180 
1181       case SHADER_OPCODE_INTERLOCK:
1182       case SHADER_OPCODE_MEMORY_FENCE: {
1183          assert(src[1].file == IMM);
1184          assert(src[2].file == IMM);
1185 
1186          const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
1187             BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
1188 
1189          brw_memory_fence(p, dst, src[0], send_op,
1190                           brw_message_target(inst->sfid),
1191                           inst->desc,
1192                           /* commit_enable */ src[1].ud,
1193                           /* bti */ src[2].ud);
1194          send_count++;
1195          break;
1196       }
1197 
1198       case FS_OPCODE_SCHEDULING_FENCE:
1199          if (inst->sources == 0 && swsb.regdist == 0 &&
1200                                    swsb.mode == TGL_SBID_NULL) {
1201             if (unlikely(debug_flag))
1202                disasm_info->use_tail = true;
1203             break;
1204          }
1205 
1206          if (devinfo->ver >= 12) {
1207             /* Use the available SWSB information to stall.  A single SYNC is
1208              * sufficient since if there were multiple dependencies, the
1209              * scoreboard algorithm already injected other SYNCs before this
1210              * instruction.
1211              */
1212             brw_SYNC(p, TGL_SYNC_NOP);
1213          } else {
1214             for (unsigned i = 0; i < inst->sources; i++) {
1215                /* Emit a MOV to force a stall until the instruction producing the
1216                 * registers finishes.
1217                 */
1218                brw_MOV(p, retype(brw_null_reg(), BRW_TYPE_UW),
1219                        retype(src[i], BRW_TYPE_UW));
1220             }
1221 
1222             if (inst->sources > 1)
1223                multiple_instructions_emitted = true;
1224          }
1225 
1226          break;
1227 
1228       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1229       case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
1230       case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
1231          unreachable("Should be lowered by lower_find_live_channel()");
1232          break;
1233 
1234       case FS_OPCODE_LOAD_LIVE_CHANNELS: {
1235          assert(inst->force_writemask_all && inst->group == 0);
1236          assert(inst->dst.file == BAD_FILE);
1237          brw_set_default_exec_size(p, BRW_EXECUTE_1);
1238          brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), BRW_TYPE_UD),
1239                  retype(brw_mask_reg(0), BRW_TYPE_UD));
1240          break;
1241       }
1242       case SHADER_OPCODE_BROADCAST:
1243          assert(inst->force_writemask_all);
1244          brw_broadcast(p, dst, src[0], src[1]);
1245          break;
1246 
1247       case SHADER_OPCODE_SHUFFLE:
1248          generate_shuffle(inst, dst, src[0], src[1]);
1249          break;
1250 
1251       case SHADER_OPCODE_SEL_EXEC:
1252          assert(inst->force_writemask_all);
1253          assert(devinfo->has_64bit_float || brw_type_size_bytes(dst.type) <= 4);
1254          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1255          brw_MOV(p, dst, src[1]);
1256          brw_set_default_mask_control(p, BRW_MASK_ENABLE);
1257          brw_set_default_swsb(p, tgl_swsb_null());
1258          brw_MOV(p, dst, src[0]);
1259          break;
1260 
1261       case SHADER_OPCODE_QUAD_SWIZZLE:
1262          assert(src[1].file == IMM);
1263          assert(src[1].type == BRW_TYPE_UD);
1264          generate_quad_swizzle(inst, dst, src[0], src[1].ud);
1265          break;
1266 
1267       case SHADER_OPCODE_CLUSTER_BROADCAST: {
1268          assert((!intel_device_info_is_9lp(devinfo) &&
1269                  devinfo->has_64bit_float) || brw_type_size_bytes(src[0].type) <= 4);
1270          assert(!src[0].negate && !src[0].abs);
1271          assert(src[1].file == IMM);
1272          assert(src[1].type == BRW_TYPE_UD);
1273          assert(src[2].file == IMM);
1274          assert(src[2].type == BRW_TYPE_UD);
1275          const unsigned component = src[1].ud;
1276          const unsigned cluster_size = src[2].ud;
1277          assert(inst->src[0].file != ARF);
1278 
1279          unsigned s;
1280          if (inst->src[0].file == FIXED_GRF) {
1281             s = inst->src[0].hstride ? 1 << (inst->src[0].hstride - 1) : 0;
1282          } else {
1283             s = inst->src[0].stride;
1284          }
1285          unsigned vstride = cluster_size * s;
1286          unsigned width = cluster_size;
1287 
1288          /* The maximum exec_size is 32, but the maximum width is only 16. */
1289          if (inst->exec_size == width) {
1290             vstride = 0;
1291             width = 1;
1292          }
1293 
1294          struct brw_reg strided = stride(suboffset(src[0], component * s),
1295                                          vstride, width, 0);
1296          brw_MOV(p, dst, strided);
1297          break;
1298       }
1299 
1300       case SHADER_OPCODE_HALT_TARGET:
1301          /* This is the place where the final HALT needs to be inserted if
1302           * we've emitted any discards.  If not, this will emit no code.
1303           */
1304          if (!patch_halt_jumps()) {
1305             if (unlikely(debug_flag)) {
1306                disasm_info->use_tail = true;
1307             }
1308          }
1309          break;
1310 
1311       case SHADER_OPCODE_BARRIER:
1312 	 generate_barrier(inst, src[0]);
1313          send_count++;
1314 	 break;
1315 
1316       case SHADER_OPCODE_RND_MODE: {
1317          assert(src[0].file == IMM);
1318          /*
1319           * Changes the floating-point rounding mode by updating the control
1320           * register field defined at cr0.0 bits [5:6].
1321           */
1322          enum brw_rnd_mode mode =
1323             (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
1324          brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
1325       }
1326          break;
1327 
1328       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
1329          assert(src[0].file == IMM);
1330          assert(src[1].file == IMM);
1331          brw_float_controls_mode(p, src[0].d, src[1].d);
1332          break;
1333 
1334       case SHADER_OPCODE_READ_ARCH_REG:
1335          if (devinfo->ver >= 12) {
1336             /* There is a SWSB restriction that requires that any time sr0 is
1337              * accessed both the instruction doing the access and the next one
1338              * have SWSB set to RegDist(1).
1339              */
1340             if (brw_get_default_swsb(p).mode != TGL_SBID_NULL)
1341                brw_SYNC(p, TGL_SYNC_NOP);
1342             brw_set_default_swsb(p, tgl_swsb_regdist(1));
1343             brw_MOV(p, dst, src[0]);
1344             brw_set_default_swsb(p, tgl_swsb_regdist(1));
1345             brw_AND(p, dst, dst, brw_imm_ud(0xffffffff));
1346          } else {
1347             brw_MOV(p, dst, src[0]);
1348          }
1349          break;
1350 
1351       default:
1352          unreachable("Unsupported opcode");
1353 
1354       case SHADER_OPCODE_LOAD_PAYLOAD:
1355          unreachable("Should be lowered by lower_load_payload()");
1356       }
1357 
1358       if (multiple_instructions_emitted)
1359          continue;
1360 
1361       if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
1362          assert(p->next_insn_offset == last_insn_offset + 16 ||
1363                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
1364                  "emitting more than 1 instruction");
1365 
1366          brw_inst *last = &p->store[last_insn_offset / 16];
1367 
1368          if (inst->conditional_mod)
1369             brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
1370          if (devinfo->ver < 12) {
1371             brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
1372             brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
1373          }
1374       }
1375 
1376       /* When enabled, insert a sync NOP after every instruction and make sure
1377        * that the current instruction depends on the previous one.
1378        */
1379       if (INTEL_DEBUG(DEBUG_SWSB_STALL) && devinfo->ver >= 12) {
1380          brw_set_default_swsb(p, tgl_swsb_regdist(1));
1381          brw_SYNC(p, TGL_SYNC_NOP);
1382       }
1383    }
1384 
1385    brw_set_uip_jip(p, start_offset);
1386 
1387    /* end of program sentinel */
1388    disasm_new_inst_group(disasm_info, p->next_insn_offset);
1389 
1390    /* `send_count` explicitly does not include spills or fills, as we'd
1391     * like to use it as a metric for intentional memory access or other
1392     * shared function use.  Otherwise, subtle changes to scheduling or
1393     * register allocation could cause it to fluctuate wildly - and that
1394     * effect is already counted in spill/fill counts.
1395     */
1396    send_count -= shader_stats.spill_count;
1397    send_count -= shader_stats.fill_count;
1398 
1399 #ifndef NDEBUG
1400    bool validated =
1401 #else
1402    if (unlikely(debug_flag))
1403 #endif
1404       brw_validate_instructions(&compiler->isa, p->store,
1405                                 start_offset,
1406                                 p->next_insn_offset,
1407                                 disasm_info);
1408 
1409    int before_size = p->next_insn_offset - start_offset;
1410    brw_compact_instructions(p, start_offset, disasm_info);
1411    int after_size = p->next_insn_offset - start_offset;
1412 
1413    bool dump_shader_bin = brw_should_dump_shader_bin();
1414    unsigned char sha1[21];
1415    char sha1buf[41];
1416 
1417    if (unlikely(debug_flag || dump_shader_bin)) {
1418       _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
1419                          after_size, sha1);
1420       _mesa_sha1_format(sha1buf, sha1);
1421    }
1422 
1423    if (unlikely(dump_shader_bin))
1424       brw_dump_shader_bin(p->store, start_offset, p->next_insn_offset,
1425                           sha1buf);
1426 
1427    if (unlikely(debug_flag)) {
1428       fprintf(stderr, "Native code for %s (src_hash 0x%08x) (sha1 %s)\n"
1429               "SIMD%d shader: %d instructions. %d loops. %u cycles. "
1430               "%d:%d spills:fills, %u sends, "
1431               "scheduled with mode %s. "
1432               "Promoted %u constants. "
1433               "Compacted %d to %d bytes (%.0f%%)\n",
1434               shader_name, params->source_hash, sha1buf,
1435               dispatch_width, before_size / 16,
1436               loop_count, perf.latency,
1437               shader_stats.spill_count,
1438               shader_stats.fill_count,
1439               send_count,
1440               shader_stats.scheduler_mode,
1441               shader_stats.promoted_constants,
1442               before_size, after_size,
1443               100.0f * (before_size - after_size) / before_size);
1444 
1445       /* overriding the shader makes disasm_info invalid */
1446       if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
1447          dump_assembly(p->store, start_offset, p->next_insn_offset,
1448                        disasm_info, perf.block_latency);
1449       } else {
1450          fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
1451       }
1452    }
1453    ralloc_free(disasm_info);
1454 #ifndef NDEBUG
1455    if (!validated && !debug_flag) {
1456       fprintf(stderr,
1457             "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n");
1458    }
1459 #endif
1460    assert(validated);
1461 
1462    brw_shader_debug_log(compiler, params->log_data,
1463                         "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
1464                         "%d:%d spills:fills, %u sends, "
1465                         "scheduled with mode %s, "
1466                         "Promoted %u constants, "
1467                         "compacted %d to %d bytes.\n",
1468                         _mesa_shader_stage_to_abbrev(stage),
1469                         dispatch_width,
1470                         before_size / 16 - nop_count - sync_nop_count,
1471                         loop_count, perf.latency,
1472                         shader_stats.spill_count,
1473                         shader_stats.fill_count,
1474                         send_count,
1475                         shader_stats.scheduler_mode,
1476                         shader_stats.promoted_constants,
1477                         before_size, after_size);
1478    if (stats) {
1479       stats->dispatch_width = dispatch_width;
1480       stats->max_polygons = max_polygons;
1481       stats->max_dispatch_width = dispatch_width;
1482       stats->instructions = before_size / 16 - nop_count - sync_nop_count;
1483       stats->sends = send_count;
1484       stats->loops = loop_count;
1485       stats->cycles = perf.latency;
1486       stats->spills = shader_stats.spill_count;
1487       stats->fills = shader_stats.fill_count;
1488       stats->max_live_registers = shader_stats.max_register_pressure;
1489    }
1490 
1491    return start_offset;
1492 }
1493 
1494 void
1495 fs_generator::add_const_data(void *data, unsigned size)
1496 {
1497    assert(prog_data->const_data_size == 0);
1498    if (size > 0) {
1499       prog_data->const_data_size = size;
1500       prog_data->const_data_offset = brw_append_data(p, data, size, 32);
1501    }
1502 }
1503 
1504 void
1505 fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt)
1506 {
1507    assert(brw_shader_stage_is_bindless(stage));
1508    struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data);
1509    if (num_resume_shaders > 0) {
1510       bs_prog_data->resume_sbt_offset =
1511          brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32);
1512       for (unsigned i = 0; i < num_resume_shaders; i++) {
1513          size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt);
1514          assert(offset <= UINT32_MAX);
1515          brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET,
1516                        BRW_SHADER_RELOC_TYPE_U32,
1517                        (uint32_t)offset, (uint32_t)sbt[i]);
1518       }
1519    }
1520 }
1521 
1522 const unsigned *
1523 fs_generator::get_assembly()
1524 {
1525    prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs);
1526 
1527    return brw_get_program(p, &prog_data->program_size);
1528 }
1529