1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file
25 *
26 * This file supports generating code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 #include "brw_eu.h"
31 #include "brw_disasm_info.h"
32 #include "brw_fs.h"
33 #include "brw_cfg.h"
34 #include "dev/intel_debug.h"
35 #include "util/mesa-sha1.h"
36 #include "util/half_float.h"
37
38 static uint32_t
brw_math_function(enum opcode op)
40 {
41 switch (op) {
42 case SHADER_OPCODE_RCP:
43 return BRW_MATH_FUNCTION_INV;
44 case SHADER_OPCODE_RSQ:
45 return BRW_MATH_FUNCTION_RSQ;
46 case SHADER_OPCODE_SQRT:
47 return BRW_MATH_FUNCTION_SQRT;
48 case SHADER_OPCODE_EXP2:
49 return BRW_MATH_FUNCTION_EXP;
50 case SHADER_OPCODE_LOG2:
51 return BRW_MATH_FUNCTION_LOG;
52 case SHADER_OPCODE_POW:
53 return BRW_MATH_FUNCTION_POW;
54 case SHADER_OPCODE_SIN:
55 return BRW_MATH_FUNCTION_SIN;
56 case SHADER_OPCODE_COS:
57 return BRW_MATH_FUNCTION_COS;
58 case SHADER_OPCODE_INT_QUOTIENT:
59 return BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
60 case SHADER_OPCODE_INT_REMAINDER:
61 return BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
62 default:
63 unreachable("not reached: unknown math function");
64 }
65 }
66
67 static struct brw_reg
normalize_brw_reg_for_encoding(brw_reg *reg)
69 {
70 struct brw_reg brw_reg;
71
72 switch (reg->file) {
73 case ARF:
74 case FIXED_GRF:
75 case IMM:
76 assert(reg->offset == 0);
77 brw_reg = *reg;
78 break;
79 case BAD_FILE:
80 /* Probably unused. */
81 brw_reg = brw_null_reg();
82 break;
83 case VGRF:
84 case ATTR:
85 case UNIFORM:
86 unreachable("not reached");
87 }
88
89 return brw_reg;
90 }
91
fs_generator::fs_generator(const struct brw_compiler *compiler,
93 const struct brw_compile_params *params,
94 struct brw_stage_prog_data *prog_data,
95 gl_shader_stage stage)
96
97 : compiler(compiler), params(params),
98 devinfo(compiler->devinfo),
99 prog_data(prog_data), dispatch_width(0),
100 debug_flag(false),
101 shader_name(NULL), stage(stage), mem_ctx(params->mem_ctx)
102 {
103 p = rzalloc(mem_ctx, struct brw_codegen);
104 brw_init_codegen(&compiler->isa, p, mem_ctx);
105 }
106
fs_generator::~fs_generator()
108 {
109 }
110
111 class ip_record : public exec_node {
112 public:
113 DECLARE_RALLOC_CXX_OPERATORS(ip_record)
114
   ip_record(int ip)
116 {
117 this->ip = ip;
118 }
119
120 int ip;
121 };
122
123 bool
fs_generator::patch_halt_jumps()
125 {
126 if (this->discard_halt_patches.is_empty())
127 return false;
128
129 int scale = brw_jump_scale(p->devinfo);
130
131 /* There is a somewhat strange undocumented requirement of using
132 * HALT, according to the simulator. If some channel has HALTed to
133 * a particular UIP, then by the end of the program, every channel
134 * must have HALTed to that UIP. Furthermore, the tracking is a
135 * stack, so you can't do the final halt of a UIP after starting
136 * halting to a new UIP.
137 *
138 * Symptoms of not emitting this instruction on actual hardware
139 * included GPU hangs and sparkly rendering on the piglit discard
140 * tests.
141 */
142 brw_inst *last_halt = brw_HALT(p);
143 brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
144 brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
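   /* Setting both distances to a single instruction means this final HALT
    * simply falls through to whatever follows it (normally the end of the
    * program); it exists only to balance the HALT stack described above.
    */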
145
146 int ip = p->nr_insn;
147
148 foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
149 brw_inst *patch = &p->store[patch_ip->ip];
150
151 assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT);
152 /* HALT takes a half-instruction distance from the pre-incremented IP. */
153 brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
154 }
155
156 this->discard_halt_patches.make_empty();
157
158 return true;
159 }
160
161 void
fs_generator::generate_send(fs_inst *inst,
163 struct brw_reg dst,
164 struct brw_reg desc,
165 struct brw_reg ex_desc,
166 struct brw_reg payload,
167 struct brw_reg payload2)
168 {
169 const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
170
171 uint32_t desc_imm = inst->desc |
172 brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
173
174 uint32_t ex_desc_imm = inst->ex_desc |
175 brw_message_ex_desc(devinfo, inst->ex_mlen);
176
177 if (ex_desc.file != IMM || ex_desc.ud || ex_desc_imm ||
178 inst->send_ex_desc_scratch) {
179 /* If we have any sort of extended descriptor, then we need SENDS. This
180 * also covers the dual-payload case because ex_mlen goes in ex_desc.
181 */
182 brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
183 desc, desc_imm, ex_desc, ex_desc_imm,
184 inst->send_ex_desc_scratch,
185 inst->send_ex_bso, inst->eot);
186 if (inst->check_tdr)
187 brw_inst_set_opcode(p->isa, brw_last_inst,
188 devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
189 } else {
190 brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
191 inst->eot);
192 if (inst->check_tdr)
193 brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
194 }
195 }
196
197 void
fs_generator::generate_mov_indirect(fs_inst *inst,
199 struct brw_reg dst,
200 struct brw_reg reg,
201 struct brw_reg indirect_byte_offset)
202 {
203 assert(indirect_byte_offset.type == BRW_TYPE_UD);
204 assert(indirect_byte_offset.file == FIXED_GRF);
205 assert(!reg.abs && !reg.negate);
206
207 /* Gen12.5 adds the following region restriction:
208 *
209 * "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
210 * and Quad-Word data must not be used."
211 *
212 * We require the source and destination types to match so stomp to an
213 * unsigned integer type.
214 */
215 assert(reg.type == dst.type);
216 reg.type = dst.type =
217 brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(reg.type));
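   /* For example, an F source and destination both become UD here, while
    * 64-bit types become UQ and, on hardware without 64-bit integer
    * support, are split into two dword MOVs below.
    */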
218
219 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
220
221 if (indirect_byte_offset.file == IMM) {
222 imm_byte_offset += indirect_byte_offset.ud;
223
224 reg.nr = imm_byte_offset / REG_SIZE;
225 reg.subnr = imm_byte_offset % REG_SIZE;
226 if (brw_type_size_bytes(reg.type) > 4 && !devinfo->has_64bit_int) {
227 brw_MOV(p, subscript(dst, BRW_TYPE_D, 0),
228 subscript(reg, BRW_TYPE_D, 0));
229 brw_set_default_swsb(p, tgl_swsb_null());
230 brw_MOV(p, subscript(dst, BRW_TYPE_D, 1),
231 subscript(reg, BRW_TYPE_D, 1));
232 } else {
233 brw_MOV(p, dst, reg);
234 }
235 } else {
236 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
237 struct brw_reg addr = vec8(brw_address_reg(0));
238
239 /* Whether we can use destination dependency control without running the
240 * risk of a hang if an instruction gets shot down.
241 */
242 const bool use_dep_ctrl = !inst->predicate &&
243 inst->exec_size == dispatch_width;
244 brw_inst *insn;
245
246 /* The destination stride of an instruction (in bytes) must be greater
247 * than or equal to the size of the rest of the instruction. Since the
248 * address register is of type UW, we can't use a D-type instruction.
* In order to get around this, we retype to UW and use a stride.
250 */
251 indirect_byte_offset =
252 retype(spread(indirect_byte_offset, 2), BRW_TYPE_UW);
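      /* A concrete way to read the region above (assuming the usual
       * little-endian dword layout): the UW retype with a stride of 2 picks
       * up only the low 16 bits of each dword, which is sufficient because
       * a byte offset within the register file always fits in 16 bits.
       */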
253
254 /* There are a number of reasons why we don't use the base offset here.
255 * One reason is that the field is only 9 bits which means we can only
256 * use it to access the first 16 GRFs. Also, from the Haswell PRM
257 * section "Register Region Restrictions":
258 *
259 * "The lower bits of the AddressImmediate must not overflow to
260 * change the register address. The lower 5 bits of Address
261 * Immediate when added to lower 5 bits of address register gives
262 * the sub-register offset. The upper bits of Address Immediate
263 * when added to upper bits of address register gives the register
264 * address. Any overflow from sub-register offset is dropped."
265 *
266 * Since the indirect may cause us to cross a register boundary, this
* makes the base offset almost useless. We could try to do something
* clever where we use an actual base offset if base_offset % 32 == 0 but
269 * that would mean we were generating different code depending on the
270 * base offset. Instead, for the sake of consistency, we'll just do the
271 * add ourselves. This restriction is only listed in the Haswell PRM
272 * but empirical testing indicates that it applies on all older
273 * generations and is lifted on Broadwell.
274 *
275 * In the end, while base_offset is nice to look at in the generated
276 * code, using it saves us 0 instructions and would require quite a bit
277 * of case-by-case work. It's just not worth it.
278 *
279 * Due to a hardware bug some platforms (particularly Gfx11+) seem to
280 * require the address components of all channels to be valid whether or
281 * not they're active, which causes issues if we use VxH addressing
282 * under non-uniform control-flow. We can easily work around that by
283 * initializing the whole address register with a pipelined NoMask MOV
284 * instruction.
285 */
286 insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
287 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
288 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
289 if (devinfo->ver >= 12)
290 brw_set_default_swsb(p, tgl_swsb_null());
291 else
292 brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
293
294 insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
295 if (devinfo->ver >= 12)
296 brw_set_default_swsb(p, tgl_swsb_regdist(1));
297 else
298 brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
299
300 if (brw_type_size_bytes(reg.type) > 4 &&
301 (intel_device_info_is_9lp(devinfo) || !devinfo->has_64bit_int)) {
302 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
303 *
304 * "When source or destination datatype is 64b or operation is
305 * integer DWord multiply, indirect addressing must not be used."
306 *
307 * We may also not support Q/UQ types.
308 *
309 * To work around both of these, we do two integer MOVs instead
310 * of one 64-bit MOV. Because no double value should ever cross
311 * a register boundary, it's safe to use the immediate offset in
312 * the indirect here to handle adding 4 bytes to the offset and
313 * avoid the extra ADD to the register file.
314 */
315 brw_MOV(p, subscript(dst, BRW_TYPE_D, 0),
316 retype(brw_VxH_indirect(0, 0), BRW_TYPE_D));
317 brw_set_default_swsb(p, tgl_swsb_null());
318 brw_MOV(p, subscript(dst, BRW_TYPE_D, 1),
319 retype(brw_VxH_indirect(0, 4), BRW_TYPE_D));
320 } else {
321 struct brw_reg ind_src = brw_VxH_indirect(0, 0);
322
323 brw_MOV(p, dst, retype(ind_src, reg.type));
324 }
325 }
326 }
327
328 void
fs_generator::generate_shuffle(fs_inst *inst,
330 struct brw_reg dst,
331 struct brw_reg src,
332 struct brw_reg idx)
333 {
334 assert(src.file == FIXED_GRF);
335 assert(!src.abs && !src.negate);
336
337 /* Ivy bridge has some strange behavior that makes this a real pain to
338 * implement for 64-bit values so we just don't bother.
339 */
340 assert(devinfo->has_64bit_float || brw_type_size_bytes(src.type) <= 4);
341
342 /* Gen12.5 adds the following region restriction:
343 *
344 * "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
345 * and Quad-Word data must not be used."
346 *
347 * We require the source and destination types to match so stomp to an
348 * unsigned integer type.
349 */
350 assert(src.type == dst.type);
351 src.type = dst.type =
352 brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(src.type));
353
354 /* Because we're using the address register, we're limited to 16-wide
355 * by the address register file and 8-wide for 64-bit types. We could try
356 * and make this instruction splittable higher up in the compiler but that
357 * gets weird because it reads all of the channels regardless of execution
358 * size. It's easier just to split it here.
359 */
360 unsigned lower_width = MIN2(16, inst->exec_size);
361 if (devinfo->ver < 20 && (element_sz(src) > 4 || element_sz(dst) > 4)) {
362 lower_width = 8;
363 }
364
365 brw_set_default_exec_size(p, cvt(lower_width) - 1);
366 for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
367 brw_set_default_group(p, group);
368
369 if ((src.vstride == 0 && src.hstride == 0) ||
370 idx.file == IMM) {
371 /* Trivial, the source is already uniform or the index is a constant.
372 * We will typically not get here if the optimizer is doing its job,
373 * but asserting would be mean.
374 */
375 const unsigned i = idx.file == IMM ? idx.ud : 0;
376 struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
377 struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1));
378 brw_MOV(p, group_dst, group_src);
379 } else {
380 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
381 struct brw_reg addr = vec8(brw_address_reg(0));
382
383 struct brw_reg group_idx = suboffset(idx, group);
384
385 if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
386 /* Things get grumpy if the register is too wide. */
387 group_idx.width--;
388 group_idx.vstride--;
389 }
390
391 assert(brw_type_size_bytes(group_idx.type) <= 4);
392 if (brw_type_size_bytes(group_idx.type) == 4) {
393 /* The destination stride of an instruction (in bytes) must be
394 * greater than or equal to the size of the rest of the
395 * instruction. Since the address register is of type UW, we
396 * can't use a D-type instruction. In order to get around this,
* we retype to UW and use a stride.
398 */
399 group_idx = retype(spread(group_idx, 2), BRW_TYPE_W);
400 }
401
402 uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;
403
404 /* From the Haswell PRM:
405 *
406 * "When a sequence of NoDDChk and NoDDClr are used, the last
407 * instruction that completes the scoreboard clear must have a
408 * non-zero execution mask. This means, if any kind of predication
409 * can change the execution mask or channel enable of the last
410 * instruction, the optimization must be avoided. This is to
411 * avoid instructions being shot down the pipeline when no writes
412 * are required."
413 *
414 * Whenever predication is enabled or the instructions being emitted
415 * aren't the full width, it's possible that it will be run with zero
416 * channels enabled so we can't use dependency control without
417 * running the risk of a hang if an instruction gets shot down.
418 */
419 const bool use_dep_ctrl = !inst->predicate &&
420 lower_width == dispatch_width;
421 brw_inst *insn;
422
423 /* Due to a hardware bug some platforms (particularly Gfx11+) seem
424 * to require the address components of all channels to be valid
425 * whether or not they're active, which causes issues if we use VxH
426 * addressing under non-uniform control-flow. We can easily work
427 * around that by initializing the whole address register with a
428 * pipelined NoMask MOV instruction.
429 */
430 insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
431 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
432 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
433 if (devinfo->ver >= 12)
434 brw_set_default_swsb(p, tgl_swsb_null());
435 else
436 brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
437
438 /* Take into account the component size and horizontal stride. */
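         /* As a representative example: for a packed D-type source
          * (4-byte elements, horizontal stride 1) the immediate below is
          * util_logbase2(4) + 1 - 1 = 2, so channel index i becomes the
          * byte offset i * 4 that the VxH indirect read expects.
          */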
439 assert(src.vstride == src.hstride + src.width);
440 insn = brw_SHL(p, addr, group_idx,
441 brw_imm_uw(util_logbase2(brw_type_size_bytes(src.type)) +
442 src.hstride - 1));
443 if (devinfo->ver >= 12)
444 brw_set_default_swsb(p, tgl_swsb_regdist(1));
445 else
446 brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
447
448 /* Add on the register start offset */
449 brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));
450 brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)),
451 retype(brw_VxH_indirect(0, 0), src.type));
452 }
453
454 brw_set_default_swsb(p, tgl_swsb_null());
455 }
456 }
457
458 void
fs_generator::generate_quad_swizzle(const fs_inst *inst,
460 struct brw_reg dst, struct brw_reg src,
461 unsigned swiz)
462 {
463 /* Requires a quad. */
464 assert(inst->exec_size >= 4);
465
466 if (src.file == IMM ||
467 has_scalar_region(src)) {
468 /* The value is uniform across all channels */
469 brw_MOV(p, dst, src);
470
471 } else if (devinfo->ver < 11 && brw_type_size_bytes(src.type) == 4) {
472 /* This only works on 8-wide 32-bit values */
473 assert(inst->exec_size == 8);
474 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
475 assert(src.vstride == src.width + 1);
476 brw_set_default_access_mode(p, BRW_ALIGN_16);
477 struct brw_reg swiz_src = stride(src, 4, 4, 1);
478 swiz_src.swizzle = swiz;
479 brw_MOV(p, dst, swiz_src);
480
481 } else {
482 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
483 assert(src.vstride == src.width + 1);
484 const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
485
486 switch (swiz) {
487 case BRW_SWIZZLE_XXXX:
488 case BRW_SWIZZLE_YYYY:
489 case BRW_SWIZZLE_ZZZZ:
490 case BRW_SWIZZLE_WWWW:
491 brw_MOV(p, dst, stride(src_0, 4, 4, 0));
492 break;
493
494 case BRW_SWIZZLE_XXZZ:
495 case BRW_SWIZZLE_YYWW:
496 brw_MOV(p, dst, stride(src_0, 2, 2, 0));
497 break;
498
499 case BRW_SWIZZLE_XYXY:
500 case BRW_SWIZZLE_ZWZW:
501 assert(inst->exec_size == 4);
502 brw_MOV(p, dst, stride(src_0, 0, 2, 1));
503 break;
504
505 default:
506 assert(inst->force_writemask_all);
507 brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
508
509 for (unsigned c = 0; c < 4; c++) {
510 brw_inst *insn = brw_MOV(
511 p, stride(suboffset(dst, c),
512 4 * inst->dst.stride, 1, 4 * inst->dst.stride),
513 stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
514
515 if (devinfo->ver < 12) {
516 brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
517 brw_inst_set_no_dd_check(devinfo, insn, c > 0);
518 }
519
520 brw_set_default_swsb(p, tgl_swsb_null());
521 }
522
523 break;
524 }
525 }
526 }
527
528 void
fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
530 {
531 brw_barrier(p, src);
532 if (devinfo->ver >= 12) {
533 brw_set_default_swsb(p, tgl_swsb_null());
534 brw_SYNC(p, TGL_SYNC_BAR);
535 } else {
536 brw_WAIT(p);
537 }
538 }
539
540 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
541 * looking like:
542 *
543 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
544 *
545 * Ideally, we want to produce:
546 *
547 * DDX DDY
548 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
549 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
550 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
551 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
552 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
553 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
554 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
555 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
556 *
557 * and add another set of two more subspans if in 16-pixel dispatch mode.
558 *
559 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
560 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
561 * pair. But the ideal approximation may impose a huge performance cost on
* sample_d. On at least Haswell, the sample_d instruction applies some
* optimizations if the same LOD is used for all pixels in the subspan.
564 *
565 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
566 * appropriate swizzling.
567 */
568 void
fs_generator::generate_ddx(const fs_inst *inst,
570 struct brw_reg dst, struct brw_reg src)
571 {
572 unsigned vstride, width;
573
574 if (inst->opcode == FS_OPCODE_DDX_FINE) {
575 /* produce accurate derivatives */
576 vstride = BRW_VERTICAL_STRIDE_2;
577 width = BRW_WIDTH_2;
578 } else {
579 /* replicate the derivative at the top-left pixel to other pixels */
580 vstride = BRW_VERTICAL_STRIDE_4;
581 width = BRW_WIDTH_4;
582 }
583
   struct brw_reg src0 = byte_offset(src, brw_type_size_bytes(src.type));
585 struct brw_reg src1 = src;
586
587 src0.vstride = vstride;
588 src0.width = width;
589 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
590 src1.vstride = vstride;
591 src1.width = width;
592 src1.hstride = BRW_HORIZONTAL_STRIDE_0;
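   /* As a concrete illustration of the fine case: the <2;2,0> regions make
    * src1 read (tl, tl, bl, bl, ...) and src0 read (tr, tr, br, br, ...),
    * so the ADD below yields (tr - tl) for the top pixel pair and
    * (br - bl) for the bottom pair, matching the table in the comment
    * before generate_ddx().
    */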
593
594 brw_ADD(p, dst, src0, negate(src1));
595 }
596
597 /* The negate_value boolean is used to negate the derivative computation for
598 * FBOs, since they place the origin at the upper left instead of the lower
599 * left.
600 */
601 void
fs_generator::generate_ddy(const fs_inst *inst,
603 struct brw_reg dst, struct brw_reg src)
604 {
605 const uint32_t type_size = brw_type_size_bytes(src.type);
606
607 if (inst->opcode == FS_OPCODE_DDY_FINE) {
608 /* produce accurate derivatives.
609 *
610 * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
611 * "Register Region Restrictions", Section "1. Special Restrictions":
612 *
613 * "In Align16 mode, the channel selects and channel enables apply to
614 * a pair of half-floats, because these parameters are defined for
615 * DWord elements ONLY. This is applicable when both source and
616 * destination are half-floats."
617 *
618 * So for half-float operations we use the Gfx11+ Align1 path. CHV
619 * inherits its FP16 hardware from SKL, so it is not affected.
620 */
621 if (devinfo->ver >= 11) {
622 src = stride(src, 0, 2, 1);
623
624 brw_push_insn_state(p);
625 brw_set_default_exec_size(p, BRW_EXECUTE_4);
626 for (uint32_t g = 0; g < inst->exec_size; g += 4) {
627 brw_set_default_group(p, inst->group + g);
628 brw_ADD(p, byte_offset(dst, g * type_size),
629 negate(byte_offset(src, g * type_size)),
630 byte_offset(src, (g + 2) * type_size));
631 brw_set_default_swsb(p, tgl_swsb_null());
632 }
633 brw_pop_insn_state(p);
634 } else {
635 struct brw_reg src0 = stride(src, 4, 4, 1);
636 struct brw_reg src1 = stride(src, 4, 4, 1);
637 src0.swizzle = BRW_SWIZZLE_XYXY;
638 src1.swizzle = BRW_SWIZZLE_ZWZW;
639
640 brw_push_insn_state(p);
641 brw_set_default_access_mode(p, BRW_ALIGN_16);
642 brw_ADD(p, dst, negate(src0), src1);
643 brw_pop_insn_state(p);
644 }
645 } else {
646 /* replicate the derivative at the top-left pixel to other pixels */
647 struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
648 struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
649
650 brw_ADD(p, dst, negate(src0), src1);
651 }
652 }
653
654 void
fs_generator::generate_halt(fs_inst *)
656 {
657 /* This HALT will be patched up at FB write time to point UIP at the end of
658 * the program, and at brw_uip_jip() JIP will be set to the end of the
659 * current block (or the program).
660 */
661 this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
662 brw_HALT(p);
663 }
664
665 /* The A32 messages take a buffer base address in header.5:[31:0] (See
666 * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
667 * and OWord block messages in the SKL PRM Vol. 2d for more details.)
668 * Unfortunately, there are a number of subtle differences:
669 *
670 * For the block read/write messages:
671 *
672 * - We always stomp header.2 to fill in the actual scratch address (in
673 * units of OWORDs) so we don't care what's in there.
674 *
675 * - They rely on per-thread scratch space value in header.3[3:0] to do
676 * bounds checking so that needs to be valid. The upper bits of
677 * header.3 are ignored, though, so we can copy all of g0.3.
678 *
* - They ignore header.5[9:0] and assume the address is 1KB aligned.
680 *
681 *
682 * For the byte/dword scattered read/write messages:
683 *
684 * - We want header.2 to be zero because that gets added to the per-channel
685 * offset in the non-header portion of the message.
686 *
687 * - Contrary to what the docs claim, they don't do any bounds checking so
688 * the value of header.3[3:0] doesn't matter.
689 *
690 * - They consider all of header.5 for the base address and header.5[9:0]
691 * are not ignored. This means that we can't copy g0.5 verbatim because
692 * g0.5[9:0] contains the FFTID on most platforms. Instead, we have to
693 * use an AND to mask off the bottom 10 bits.
694 *
695 *
696 * For block messages, just copying g0 gives a valid header because all the
697 * garbage gets ignored except for header.2 which we stomp as part of message
698 * setup. For byte/dword scattered messages, we can just zero out the header
699 * and copy over the bits we need from g0.5. This opcode, however, tries to
700 * satisfy the requirements of both by starting with 0 and filling out the
701 * information required by either set of opcodes.
702 */
703 void
fs_generator::generate_scratch_header(fs_inst *inst,
705 struct brw_reg dst,
706 struct brw_reg src)
707 {
708 assert(inst->exec_size == 8 && inst->force_writemask_all);
709 assert(dst.file == FIXED_GRF);
710 assert(src.file == FIXED_GRF);
711 assert(src.type == BRW_TYPE_UD);
712
713 dst.type = BRW_TYPE_UD;
714
715 brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
716 if (devinfo->ver >= 12)
717 brw_set_default_swsb(p, tgl_swsb_null());
718 else
719 brw_inst_set_no_dd_clear(p->devinfo, insn, true);
720
721 /* Copy the per-thread scratch space size from g0.3[3:0] */
722 brw_set_default_exec_size(p, BRW_EXECUTE_1);
723 insn = brw_AND(p, suboffset(dst, 3), component(src, 3),
724 brw_imm_ud(INTEL_MASK(3, 0)));
725 if (devinfo->ver < 12) {
726 brw_inst_set_no_dd_clear(p->devinfo, insn, true);
727 brw_inst_set_no_dd_check(p->devinfo, insn, true);
728 }
729
730 /* Copy the scratch base address from g0.5[31:10] */
731 insn = brw_AND(p, suboffset(dst, 5), component(src, 5),
732 brw_imm_ud(INTEL_MASK(31, 10)));
733 if (devinfo->ver < 12)
734 brw_inst_set_no_dd_check(p->devinfo, insn, true);
735 }
736
737 void
fs_generator::enable_debug(const char *shader_name)
739 {
740 debug_flag = true;
741 this->shader_name = shader_name;
742 }
743
744 static gfx12_systolic_depth
translate_systolic_depth(unsigned d)
746 {
747 /* Could also return (ffs(d) - 1) & 3. */
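   /* For instance, d == 8 gives ffs(8) - 1 == 3, and d == 16 gives
    * ffs(16) - 1 == 4, which the "& 3" would wrap back to 0 (presumably
    * the encoding used for BRW_SYSTOLIC_DEPTH_16).
    */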
748 switch (d) {
749 case 2: return BRW_SYSTOLIC_DEPTH_2;
750 case 4: return BRW_SYSTOLIC_DEPTH_4;
751 case 8: return BRW_SYSTOLIC_DEPTH_8;
752 case 16: return BRW_SYSTOLIC_DEPTH_16;
753 default: unreachable("Invalid systolic depth.");
754 }
755 }
756
757 int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
759 struct shader_stats shader_stats,
760 const brw::performance &perf,
761 struct brw_compile_stats *stats,
762 unsigned max_polygons)
763 {
764 /* align to 64 byte boundary. */
765 brw_realign(p, 64);
766
767 this->dispatch_width = dispatch_width;
768
769 int start_offset = p->next_insn_offset;
770
771 int loop_count = 0, send_count = 0, nop_count = 0, sync_nop_count = 0;
772 bool is_accum_used = false;
773
774 struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg);
775
776 foreach_block_and_inst (block, fs_inst, inst, cfg) {
777 if (inst->opcode == SHADER_OPCODE_UNDEF)
778 continue;
779
780 struct brw_reg src[4], dst;
781 unsigned int last_insn_offset = p->next_insn_offset;
782 bool multiple_instructions_emitted = false;
783 tgl_swsb swsb = inst->sched;
784
785 /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
786 * "Register Region Restrictions" section: for BDW, SKL:
787 *
788 * "A POW/FDIV operation must not be followed by an instruction
789 * that requires two destination registers."
790 *
791 * The documentation is often lacking annotations for Atom parts,
792 * and empirically this affects CHV as well.
793 */
794 if (devinfo->ver <= 9 &&
795 p->nr_insn > 1 &&
796 brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH &&
797 brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
798 inst->dst.component_size(inst->exec_size) > REG_SIZE) {
799 brw_NOP(p);
800 last_insn_offset = p->next_insn_offset;
801
802 /* In order to avoid spurious instruction count differences when the
803 * instruction schedule changes, keep track of the number of inserted
804 * NOPs.
805 */
806 nop_count++;
807 }
808
809 /* Wa_14010017096:
810 *
811 * Clear accumulator register before end of thread.
812 */
813 if (inst->eot && is_accum_used &&
814 intel_needs_workaround(devinfo, 14010017096)) {
815 brw_set_default_exec_size(p, BRW_EXECUTE_16);
816 brw_set_default_group(p, 0);
817 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
818 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
819 brw_set_default_flag_reg(p, 0, 0);
820 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
821 brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
822 last_insn_offset = p->next_insn_offset;
823 swsb = tgl_swsb_dst_dep(swsb, 1);
824 }
825
826 if (!is_accum_used && !inst->eot) {
827 is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
828 inst->dst.is_accumulator();
829 }
830
831 /* Wa_14013672992:
832 *
833 * Always use @1 SWSB for EOT.
834 */
835 if (inst->eot && intel_needs_workaround(devinfo, 14013672992)) {
836 if (tgl_swsb_src_dep(swsb).mode) {
837 brw_set_default_exec_size(p, BRW_EXECUTE_1);
838 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
839 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
840 brw_set_default_flag_reg(p, 0, 0);
841 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
842 brw_SYNC(p, TGL_SYNC_NOP);
843 last_insn_offset = p->next_insn_offset;
844 }
845
846 swsb = tgl_swsb_dst_dep(swsb, 1);
847 }
848
849 if (unlikely(debug_flag))
850 disasm_annotate(disasm_info, inst, p->next_insn_offset);
851
852 if (devinfo->ver >= 20 && inst->group % 8 != 0) {
853 assert(inst->force_writemask_all);
854 assert(!inst->predicate && !inst->conditional_mod);
855 assert(!inst->writes_accumulator_implicitly(devinfo) &&
856 !inst->reads_accumulator_implicitly());
857 assert(inst->opcode != SHADER_OPCODE_SEL_EXEC);
858 brw_set_default_group(p, 0);
859 } else {
860 brw_set_default_group(p, inst->group);
861 }
862
863 for (unsigned int i = 0; i < inst->sources; i++) {
864 src[i] = normalize_brw_reg_for_encoding(&inst->src[i]);
865 /* The accumulator result appears to get used for the
866 * conditional modifier generation. When negating a UD
867 * value, there is a 33rd bit generated for the sign in the
868 * accumulator value, so now you can't check, for example,
869 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
870 */
871 assert(!inst->conditional_mod ||
872 inst->src[i].type != BRW_TYPE_UD ||
873 !inst->src[i].negate);
874 }
875 dst = normalize_brw_reg_for_encoding(&inst->dst);
876
877 brw_set_default_access_mode(p, BRW_ALIGN_1);
878 brw_set_default_predicate_control(p, inst->predicate);
879 brw_set_default_predicate_inverse(p, inst->predicate_inverse);
880 /* On gfx7 and above, hardware automatically adds the group onto the
881 * flag subregister number.
882 */
883 const unsigned flag_subreg = inst->flag_subreg;
884 brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
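      /* For example, flag_subreg == 3 selects f1.1, i.e. the second 16-bit
       * half of the second flag register.
       */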
885 brw_set_default_saturate(p, inst->saturate);
886 brw_set_default_mask_control(p, inst->force_writemask_all);
887 if (devinfo->ver >= 20 && inst->writes_accumulator) {
888 assert(inst->dst.is_accumulator() ||
889 inst->opcode == BRW_OPCODE_ADDC ||
890 inst->opcode == BRW_OPCODE_MACH ||
891 inst->opcode == BRW_OPCODE_SUBB);
892 } else {
893 brw_set_default_acc_write_control(p, inst->writes_accumulator);
894 }
895 brw_set_default_swsb(p, swsb);
896
897 unsigned exec_size = inst->exec_size;
898
899 brw_set_default_exec_size(p, cvt(exec_size) - 1);
900
901 assert(inst->force_writemask_all || inst->exec_size >= 4);
902 assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
903 assert(inst->mlen <= BRW_MAX_MSG_LENGTH * reg_unit(devinfo));
904
905 switch (inst->opcode) {
906 case BRW_OPCODE_NOP:
907 brw_NOP(p);
908 break;
909 case BRW_OPCODE_SYNC:
910 assert(src[0].file == IMM);
911 brw_SYNC(p, tgl_sync_function(src[0].ud));
912
913 if (tgl_sync_function(src[0].ud) == TGL_SYNC_NOP)
914 ++sync_nop_count;
915
916 break;
917 case BRW_OPCODE_MOV:
918 brw_MOV(p, dst, src[0]);
919 break;
920 case BRW_OPCODE_ADD:
921 brw_ADD(p, dst, src[0], src[1]);
922 break;
923 case BRW_OPCODE_MUL:
924 brw_MUL(p, dst, src[0], src[1]);
925 break;
926 case BRW_OPCODE_AVG:
927 brw_AVG(p, dst, src[0], src[1]);
928 break;
929 case BRW_OPCODE_MACH:
930 brw_MACH(p, dst, src[0], src[1]);
931 break;
932
933 case BRW_OPCODE_DP4A:
934 assert(devinfo->ver >= 12);
935 brw_DP4A(p, dst, src[0], src[1], src[2]);
936 break;
937
938 case BRW_OPCODE_LINE:
939 brw_LINE(p, dst, src[0], src[1]);
940 break;
941
942 case BRW_OPCODE_DPAS:
943 assert(devinfo->verx10 >= 125);
944 brw_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount,
945 dst, src[0], src[1], src[2]);
946 break;
947
948 case BRW_OPCODE_MAD:
949 if (devinfo->ver < 10)
950 brw_set_default_access_mode(p, BRW_ALIGN_16);
951 brw_MAD(p, dst, src[0], src[1], src[2]);
952 break;
953
954 case BRW_OPCODE_LRP:
955 assert(devinfo->ver <= 10);
956 if (devinfo->ver < 10)
957 brw_set_default_access_mode(p, BRW_ALIGN_16);
958 brw_LRP(p, dst, src[0], src[1], src[2]);
959 break;
960
961 case BRW_OPCODE_ADD3:
962 assert(devinfo->verx10 >= 125);
963 brw_ADD3(p, dst, src[0], src[1], src[2]);
964 break;
965
966 case BRW_OPCODE_FRC:
967 brw_FRC(p, dst, src[0]);
968 break;
969 case BRW_OPCODE_RNDD:
970 brw_RNDD(p, dst, src[0]);
971 break;
972 case BRW_OPCODE_RNDE:
973 brw_RNDE(p, dst, src[0]);
974 break;
975 case BRW_OPCODE_RNDZ:
976 brw_RNDZ(p, dst, src[0]);
977 break;
978
979 case BRW_OPCODE_AND:
980 brw_AND(p, dst, src[0], src[1]);
981 break;
982 case BRW_OPCODE_OR:
983 brw_OR(p, dst, src[0], src[1]);
984 break;
985 case BRW_OPCODE_XOR:
986 brw_XOR(p, dst, src[0], src[1]);
987 break;
988 case BRW_OPCODE_NOT:
989 brw_NOT(p, dst, src[0]);
990 break;
991 case BRW_OPCODE_ASR:
992 brw_ASR(p, dst, src[0], src[1]);
993 break;
994 case BRW_OPCODE_SHR:
995 brw_SHR(p, dst, src[0], src[1]);
996 break;
997 case BRW_OPCODE_SHL:
998 brw_SHL(p, dst, src[0], src[1]);
999 break;
1000 case BRW_OPCODE_ROL:
1001 assert(devinfo->ver >= 11);
1002 assert(src[0].type == dst.type);
1003 brw_ROL(p, dst, src[0], src[1]);
1004 break;
1005 case BRW_OPCODE_ROR:
1006 assert(devinfo->ver >= 11);
1007 assert(src[0].type == dst.type);
1008 brw_ROR(p, dst, src[0], src[1]);
1009 break;
1010 case BRW_OPCODE_CMP:
1011 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1012 break;
1013 case BRW_OPCODE_CMPN:
1014 brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
1015 break;
1016 case BRW_OPCODE_SEL:
1017 brw_SEL(p, dst, src[0], src[1]);
1018 break;
1019 case BRW_OPCODE_CSEL:
1020 if (devinfo->ver < 10)
1021 brw_set_default_access_mode(p, BRW_ALIGN_16);
1022 brw_CSEL(p, dst, src[0], src[1], src[2]);
1023 break;
1024 case BRW_OPCODE_BFREV:
1025 brw_BFREV(p, retype(dst, BRW_TYPE_UD), retype(src[0], BRW_TYPE_UD));
1026 break;
1027 case BRW_OPCODE_FBH:
1028 brw_FBH(p, retype(dst, src[0].type), src[0]);
1029 break;
1030 case BRW_OPCODE_FBL:
1031 brw_FBL(p, retype(dst, BRW_TYPE_UD), retype(src[0], BRW_TYPE_UD));
1032 break;
1033 case BRW_OPCODE_LZD:
1034 brw_LZD(p, dst, src[0]);
1035 break;
1036 case BRW_OPCODE_CBIT:
1037 brw_CBIT(p, retype(dst, BRW_TYPE_UD), retype(src[0], BRW_TYPE_UD));
1038 break;
1039 case BRW_OPCODE_ADDC:
1040 brw_ADDC(p, dst, src[0], src[1]);
1041 break;
1042 case BRW_OPCODE_SUBB:
1043 brw_SUBB(p, dst, src[0], src[1]);
1044 break;
1045 case BRW_OPCODE_MAC:
1046 brw_MAC(p, dst, src[0], src[1]);
1047 break;
1048
1049 case BRW_OPCODE_BFE:
1050 if (devinfo->ver < 10)
1051 brw_set_default_access_mode(p, BRW_ALIGN_16);
1052 brw_BFE(p, dst, src[0], src[1], src[2]);
1053 break;
1054
1055 case BRW_OPCODE_BFI1:
1056 brw_BFI1(p, dst, src[0], src[1]);
1057 break;
1058 case BRW_OPCODE_BFI2:
1059 if (devinfo->ver < 10)
1060 brw_set_default_access_mode(p, BRW_ALIGN_16);
1061 brw_BFI2(p, dst, src[0], src[1], src[2]);
1062 break;
1063
1064 case BRW_OPCODE_IF:
1065 brw_IF(p, brw_get_default_exec_size(p));
1066 break;
1067
1068 case BRW_OPCODE_ELSE:
1069 brw_ELSE(p);
1070 break;
1071 case BRW_OPCODE_ENDIF:
1072 brw_ENDIF(p);
1073 break;
1074
1075 case BRW_OPCODE_DO:
1076 brw_DO(p, brw_get_default_exec_size(p));
1077 break;
1078
1079 case BRW_OPCODE_BREAK:
1080 brw_BREAK(p);
1081 break;
1082 case BRW_OPCODE_CONTINUE:
1083 brw_CONT(p);
1084 break;
1085
1086 case BRW_OPCODE_WHILE:
1087 brw_WHILE(p);
1088 loop_count++;
1089 break;
1090
1091 case SHADER_OPCODE_RCP:
1092 case SHADER_OPCODE_RSQ:
1093 case SHADER_OPCODE_SQRT:
1094 case SHADER_OPCODE_EXP2:
1095 case SHADER_OPCODE_LOG2:
1096 case SHADER_OPCODE_SIN:
1097 case SHADER_OPCODE_COS:
1098 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1099 assert(inst->mlen == 0);
1100 gfx6_math(p, dst, brw_math_function(inst->opcode),
1101 src[0], retype(brw_null_reg(), src[0].type));
1102 break;
1103 case SHADER_OPCODE_INT_QUOTIENT:
1104 case SHADER_OPCODE_INT_REMAINDER:
1105 case SHADER_OPCODE_POW:
1106 assert(devinfo->verx10 < 125);
1107 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1108 assert(inst->mlen == 0);
1109 assert(inst->opcode == SHADER_OPCODE_POW || inst->exec_size == 8);
1110 gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1111 break;
1112 case BRW_OPCODE_PLN:
1113 /* PLN reads:
1114 * / in SIMD16 \
1115 * -----------------------------------
1116 * | src1+0 | src1+1 | src1+2 | src1+3 |
1117 * |-----------------------------------|
1118 * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
1119 * -----------------------------------
1120 */
1121 brw_PLN(p, dst, src[0], src[1]);
1122 break;
1123 case FS_OPCODE_PIXEL_X:
1124 assert(src[0].type == BRW_TYPE_UW);
1125 assert(src[1].type == BRW_TYPE_UW);
1126 src[0].subnr = 0 * brw_type_size_bytes(src[0].type);
1127 if (src[1].file == IMM) {
1128 assert(src[1].ud == 0);
1129 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1130 } else {
1131 /* Coarse pixel case */
1132 brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
1133 }
1134 break;
1135 case FS_OPCODE_PIXEL_Y:
1136 assert(src[0].type == BRW_TYPE_UW);
1137 assert(src[1].type == BRW_TYPE_UW);
1138 src[0].subnr = 4 * brw_type_size_bytes(src[0].type);
1139 if (src[1].file == IMM) {
1140 assert(src[1].ud == 0);
1141 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1142 } else {
1143 /* Coarse pixel case */
1144 brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
1145 }
1146 break;
1147
1148 case SHADER_OPCODE_SEND:
1149 generate_send(inst, dst, src[0], src[1], src[2],
1150 inst->ex_mlen > 0 ? src[3] : brw_null_reg());
1151 send_count++;
1152 break;
1153
1154 case FS_OPCODE_DDX_COARSE:
1155 case FS_OPCODE_DDX_FINE:
1156 generate_ddx(inst, dst, src[0]);
1157 break;
1158 case FS_OPCODE_DDY_COARSE:
1159 case FS_OPCODE_DDY_FINE:
1160 generate_ddy(inst, dst, src[0]);
1161 break;
1162
1163 case SHADER_OPCODE_SCRATCH_HEADER:
1164 generate_scratch_header(inst, dst, src[0]);
1165 break;
1166
1167 case SHADER_OPCODE_MOV_INDIRECT:
1168 generate_mov_indirect(inst, dst, src[0], src[1]);
1169 break;
1170
1171 case SHADER_OPCODE_MOV_RELOC_IMM:
1172 assert(src[0].file == IMM);
1173 assert(src[1].file == IMM);
1174 brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud, src[1].ud);
1175 break;
1176
1177 case BRW_OPCODE_HALT:
1178 generate_halt(inst);
1179 break;
1180
1181 case SHADER_OPCODE_INTERLOCK:
1182 case SHADER_OPCODE_MEMORY_FENCE: {
1183 assert(src[1].file == IMM);
1184 assert(src[2].file == IMM);
1185
1186 const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
1187 BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
1188
1189 brw_memory_fence(p, dst, src[0], send_op,
1190 brw_message_target(inst->sfid),
1191 inst->desc,
1192 /* commit_enable */ src[1].ud,
1193 /* bti */ src[2].ud);
1194 send_count++;
1195 break;
1196 }
1197
1198 case FS_OPCODE_SCHEDULING_FENCE:
1199 if (inst->sources == 0 && swsb.regdist == 0 &&
1200 swsb.mode == TGL_SBID_NULL) {
1201 if (unlikely(debug_flag))
1202 disasm_info->use_tail = true;
1203 break;
1204 }
1205
1206 if (devinfo->ver >= 12) {
1207 /* Use the available SWSB information to stall. A single SYNC is
1208 * sufficient since if there were multiple dependencies, the
1209 * scoreboard algorithm already injected other SYNCs before this
1210 * instruction.
1211 */
1212 brw_SYNC(p, TGL_SYNC_NOP);
1213 } else {
1214 for (unsigned i = 0; i < inst->sources; i++) {
1215 /* Emit a MOV to force a stall until the instruction producing the
1216 * registers finishes.
1217 */
1218 brw_MOV(p, retype(brw_null_reg(), BRW_TYPE_UW),
1219 retype(src[i], BRW_TYPE_UW));
1220 }
1221
1222 if (inst->sources > 1)
1223 multiple_instructions_emitted = true;
1224 }
1225
1226 break;
1227
1228 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1229 case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
1230 case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
1231 unreachable("Should be lowered by lower_find_live_channel()");
1232 break;
1233
1234 case FS_OPCODE_LOAD_LIVE_CHANNELS: {
1235 assert(inst->force_writemask_all && inst->group == 0);
1236 assert(inst->dst.file == BAD_FILE);
1237 brw_set_default_exec_size(p, BRW_EXECUTE_1);
1238 brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), BRW_TYPE_UD),
1239 retype(brw_mask_reg(0), BRW_TYPE_UD));
1240 break;
1241 }
1242 case SHADER_OPCODE_BROADCAST:
1243 assert(inst->force_writemask_all);
1244 brw_broadcast(p, dst, src[0], src[1]);
1245 break;
1246
1247 case SHADER_OPCODE_SHUFFLE:
1248 generate_shuffle(inst, dst, src[0], src[1]);
1249 break;
1250
1251 case SHADER_OPCODE_SEL_EXEC:
1252 assert(inst->force_writemask_all);
1253 assert(devinfo->has_64bit_float || brw_type_size_bytes(dst.type) <= 4);
1254 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1255 brw_MOV(p, dst, src[1]);
1256 brw_set_default_mask_control(p, BRW_MASK_ENABLE);
1257 brw_set_default_swsb(p, tgl_swsb_null());
1258 brw_MOV(p, dst, src[0]);
1259 break;
1260
1261 case SHADER_OPCODE_QUAD_SWIZZLE:
1262 assert(src[1].file == IMM);
1263 assert(src[1].type == BRW_TYPE_UD);
1264 generate_quad_swizzle(inst, dst, src[0], src[1].ud);
1265 break;
1266
1267 case SHADER_OPCODE_CLUSTER_BROADCAST: {
1268 assert((!intel_device_info_is_9lp(devinfo) &&
1269 devinfo->has_64bit_float) || brw_type_size_bytes(src[0].type) <= 4);
1270 assert(!src[0].negate && !src[0].abs);
1271 assert(src[1].file == IMM);
1272 assert(src[1].type == BRW_TYPE_UD);
1273 assert(src[2].file == IMM);
1274 assert(src[2].type == BRW_TYPE_UD);
1275 const unsigned component = src[1].ud;
1276 const unsigned cluster_size = src[2].ud;
1277 assert(inst->src[0].file != ARF);
1278
1279 unsigned s;
1280 if (inst->src[0].file == FIXED_GRF) {
1281 s = inst->src[0].hstride ? 1 << (inst->src[0].hstride - 1) : 0;
1282 } else {
1283 s = inst->src[0].stride;
1284 }
1285 unsigned vstride = cluster_size * s;
1286 unsigned width = cluster_size;
1287
1288 /* The maximum exec_size is 32, but the maximum width is only 16. */
1289 if (inst->exec_size == width) {
1290 vstride = 0;
1291 width = 1;
1292 }
1293
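      /* A worked example for one plausible case: exec_size 16,
       * cluster_size 4, a packed dword source (s == 1) and component 1
       * give the region <4;4,0> starting one element in, so every channel
       * of a cluster reads element 1 of its own cluster.
       */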
1294 struct brw_reg strided = stride(suboffset(src[0], component * s),
1295 vstride, width, 0);
1296 brw_MOV(p, dst, strided);
1297 break;
1298 }
1299
1300 case SHADER_OPCODE_HALT_TARGET:
1301 /* This is the place where the final HALT needs to be inserted if
1302 * we've emitted any discards. If not, this will emit no code.
1303 */
1304 if (!patch_halt_jumps()) {
1305 if (unlikely(debug_flag)) {
1306 disasm_info->use_tail = true;
1307 }
1308 }
1309 break;
1310
1311 case SHADER_OPCODE_BARRIER:
1312 generate_barrier(inst, src[0]);
1313 send_count++;
1314 break;
1315
1316 case SHADER_OPCODE_RND_MODE: {
1317 assert(src[0].file == IMM);
1318 /*
* Change the floating-point rounding mode by updating the control
* register field defined at cr0.0 bits [5:6].
1321 */
1322 enum brw_rnd_mode mode =
1323 (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
1324 brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
1325 }
1326 break;
1327
1328 case SHADER_OPCODE_FLOAT_CONTROL_MODE:
1329 assert(src[0].file == IMM);
1330 assert(src[1].file == IMM);
1331 brw_float_controls_mode(p, src[0].d, src[1].d);
1332 break;
1333
1334 case SHADER_OPCODE_READ_ARCH_REG:
1335 if (devinfo->ver >= 12) {
1336 /* There is a SWSB restriction that requires that any time sr0 is
1337 * accessed both the instruction doing the access and the next one
1338 * have SWSB set to RegDist(1).
1339 */
1340 if (brw_get_default_swsb(p).mode != TGL_SBID_NULL)
1341 brw_SYNC(p, TGL_SYNC_NOP);
1342 brw_set_default_swsb(p, tgl_swsb_regdist(1));
1343 brw_MOV(p, dst, src[0]);
1344 brw_set_default_swsb(p, tgl_swsb_regdist(1));
1345 brw_AND(p, dst, dst, brw_imm_ud(0xffffffff));
1346 } else {
1347 brw_MOV(p, dst, src[0]);
1348 }
1349 break;
1350
1351 default:
1352 unreachable("Unsupported opcode");
1353
1354 case SHADER_OPCODE_LOAD_PAYLOAD:
1355 unreachable("Should be lowered by lower_load_payload()");
1356 }
1357
1358 if (multiple_instructions_emitted)
1359 continue;
1360
1361 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
1362 assert(p->next_insn_offset == last_insn_offset + 16 ||
1363 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
1364 "emitting more than 1 instruction");
1365
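      /* Instructions have not been compacted yet at this point, so the
       * store can still be indexed in fixed 16-byte units to locate the
       * instruction that was just emitted.
       */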
1366 brw_inst *last = &p->store[last_insn_offset / 16];
1367
1368 if (inst->conditional_mod)
1369 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
1370 if (devinfo->ver < 12) {
1371 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
1372 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
1373 }
1374 }
1375
   /* When enabled, insert a sync NOP after every instruction and make sure
    * that the current instruction depends on the previous instruction.
1378 */
1379 if (INTEL_DEBUG(DEBUG_SWSB_STALL) && devinfo->ver >= 12) {
1380 brw_set_default_swsb(p, tgl_swsb_regdist(1));
1381 brw_SYNC(p, TGL_SYNC_NOP);
1382 }
1383 }
1384
1385 brw_set_uip_jip(p, start_offset);
1386
1387 /* end of program sentinel */
1388 disasm_new_inst_group(disasm_info, p->next_insn_offset);
1389
1390 /* `send_count` explicitly does not include spills or fills, as we'd
1391 * like to use it as a metric for intentional memory access or other
1392 * shared function use. Otherwise, subtle changes to scheduling or
1393 * register allocation could cause it to fluctuate wildly - and that
1394 * effect is already counted in spill/fill counts.
1395 */
1396 send_count -= shader_stats.spill_count;
1397 send_count -= shader_stats.fill_count;
1398
1399 #ifndef NDEBUG
1400 bool validated =
1401 #else
1402 if (unlikely(debug_flag))
1403 #endif
1404 brw_validate_instructions(&compiler->isa, p->store,
1405 start_offset,
1406 p->next_insn_offset,
1407 disasm_info);
1408
1409 int before_size = p->next_insn_offset - start_offset;
1410 brw_compact_instructions(p, start_offset, disasm_info);
1411 int after_size = p->next_insn_offset - start_offset;
1412
1413 bool dump_shader_bin = brw_should_dump_shader_bin();
1414 unsigned char sha1[21];
1415 char sha1buf[41];
1416
1417 if (unlikely(debug_flag || dump_shader_bin)) {
1418 _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
1419 after_size, sha1);
1420 _mesa_sha1_format(sha1buf, sha1);
1421 }
1422
1423 if (unlikely(dump_shader_bin))
1424 brw_dump_shader_bin(p->store, start_offset, p->next_insn_offset,
1425 sha1buf);
1426
1427 if (unlikely(debug_flag)) {
1428 fprintf(stderr, "Native code for %s (src_hash 0x%08x) (sha1 %s)\n"
1429 "SIMD%d shader: %d instructions. %d loops. %u cycles. "
1430 "%d:%d spills:fills, %u sends, "
1431 "scheduled with mode %s. "
1432 "Promoted %u constants. "
1433 "Compacted %d to %d bytes (%.0f%%)\n",
1434 shader_name, params->source_hash, sha1buf,
1435 dispatch_width, before_size / 16,
1436 loop_count, perf.latency,
1437 shader_stats.spill_count,
1438 shader_stats.fill_count,
1439 send_count,
1440 shader_stats.scheduler_mode,
1441 shader_stats.promoted_constants,
1442 before_size, after_size,
1443 100.0f * (before_size - after_size) / before_size);
1444
1445 /* overriding the shader makes disasm_info invalid */
1446 if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
1447 dump_assembly(p->store, start_offset, p->next_insn_offset,
1448 disasm_info, perf.block_latency);
1449 } else {
1450 fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
1451 }
1452 }
1453 ralloc_free(disasm_info);
1454 #ifndef NDEBUG
1455 if (!validated && !debug_flag) {
1456 fprintf(stderr,
1457 "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n");
1458 }
1459 #endif
1460 assert(validated);
1461
1462 brw_shader_debug_log(compiler, params->log_data,
1463 "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
1464 "%d:%d spills:fills, %u sends, "
1465 "scheduled with mode %s, "
1466 "Promoted %u constants, "
1467 "compacted %d to %d bytes.\n",
1468 _mesa_shader_stage_to_abbrev(stage),
1469 dispatch_width,
1470 before_size / 16 - nop_count - sync_nop_count,
1471 loop_count, perf.latency,
1472 shader_stats.spill_count,
1473 shader_stats.fill_count,
1474 send_count,
1475 shader_stats.scheduler_mode,
1476 shader_stats.promoted_constants,
1477 before_size, after_size);
1478 if (stats) {
1479 stats->dispatch_width = dispatch_width;
1480 stats->max_polygons = max_polygons;
1481 stats->max_dispatch_width = dispatch_width;
1482 stats->instructions = before_size / 16 - nop_count - sync_nop_count;
1483 stats->sends = send_count;
1484 stats->loops = loop_count;
1485 stats->cycles = perf.latency;
1486 stats->spills = shader_stats.spill_count;
1487 stats->fills = shader_stats.fill_count;
1488 stats->max_live_registers = shader_stats.max_register_pressure;
1489 }
1490
1491 return start_offset;
1492 }
1493
1494 void
fs_generator::add_const_data(void *data, unsigned size)
1496 {
1497 assert(prog_data->const_data_size == 0);
1498 if (size > 0) {
1499 prog_data->const_data_size = size;
1500 prog_data->const_data_offset = brw_append_data(p, data, size, 32);
1501 }
1502 }
1503
1504 void
fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt)
1506 {
1507 assert(brw_shader_stage_is_bindless(stage));
1508 struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data);
1509 if (num_resume_shaders > 0) {
1510 bs_prog_data->resume_sbt_offset =
1511 brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32);
1512 for (unsigned i = 0; i < num_resume_shaders; i++) {
1513 size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt);
1514 assert(offset <= UINT32_MAX);
1515 brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET,
1516 BRW_SHADER_RELOC_TYPE_U32,
1517 (uint32_t)offset, (uint32_t)sbt[i]);
1518 }
1519 }
1520 }
1521
1522 const unsigned *
fs_generator::get_assembly()
1524 {
1525 prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs);
1526
1527 return brw_get_program(p, &prog_data->program_size);
1528 }
1529