1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_fs_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_private.h"
38 #include "intel_nir.h"
39 #include "shader_enums.h"
40 #include "dev/intel_debug.h"
41 #include "dev/intel_wa.h"
42 #include "compiler/glsl_types.h"
43 #include "compiler/nir/nir_builder.h"
44 #include "util/u_math.h"
45
46 using namespace brw;
47
48 static void
49 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);
50
51 void
52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
53 const brw_reg *src, unsigned sources)
54 {
55 memset((void*)this, 0, sizeof(*this));
56
57 initialize_sources(this, src, sources);
58
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->exec_size = exec_size;
65
66 assert(dst.file != IMM && dst.file != UNIFORM);
67
68 assert(this->exec_size != 0);
69
70 this->conditional_mod = BRW_CONDITIONAL_NONE;
71
72 /* This will be the case for almost all instructions. */
73 switch (dst.file) {
74 case VGRF:
75 case ARF:
76 case FIXED_GRF:
77 case ATTR:
78 this->size_written = dst.component_size(exec_size);
79 break;
80 case BAD_FILE:
81 this->size_written = 0;
82 break;
83 case IMM:
84 case UNIFORM:
85 unreachable("Invalid destination register file");
86 }
87
88 this->writes_accumulator = false;
89 }
90
91 fs_inst::fs_inst()
92 {
93 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
94 }
95
96 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
97 {
98 init(opcode, exec_size, reg_undef, NULL, 0);
99 }
100
101 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
102 {
103 init(opcode, exec_size, dst, NULL, 0);
104 }
105
106 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
107 const brw_reg &src0)
108 {
109 const brw_reg src[1] = { src0 };
110 init(opcode, exec_size, dst, src, 1);
111 }
112
113 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
114 const brw_reg &src0, const brw_reg &src1)
115 {
116 const brw_reg src[2] = { src0, src1 };
117 init(opcode, exec_size, dst, src, 2);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
121 const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
122 {
123 const brw_reg src[3] = { src0, src1, src2 };
124 init(opcode, exec_size, dst, src, 3);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
128 const brw_reg src[], unsigned sources)
129 {
130 init(opcode, exec_width, dst, src, sources);
131 }
132
133 fs_inst::fs_inst(const fs_inst &that)
134 {
135 memcpy((void*)this, &that, sizeof(that));
136 initialize_sources(this, that.src, that.sources);
137 }
138
139 fs_inst::~fs_inst()
140 {
141 if (this->src != this->builtin_src)
142 delete[] this->src;
143 }
144
145 static void
146 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
147 {
148 if (num_sources > ARRAY_SIZE(inst->builtin_src))
149 inst->src = new brw_reg[num_sources];
150 else
151 inst->src = inst->builtin_src;
152
153 for (unsigned i = 0; i < num_sources; i++)
154 inst->src[i] = src[i];
155
156 inst->sources = num_sources;
157 }
158
159 void
160 fs_inst::resize_sources(uint8_t num_sources)
161 {
162 if (this->sources == num_sources)
163 return;
164
165 brw_reg *old_src = this->src;
166 brw_reg *new_src;
167
168 const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);
169
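   /* Sources live either in the inline builtin_src storage or in a separately
    * heap-allocated array.  The branches below cover growing within the
    * inline storage, growing onto the heap, shrinking back into the inline
    * storage, and shrinking within an existing heap allocation.
    */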
170 if (old_src == this->builtin_src) {
171 if (num_sources > builtin_size) {
172 new_src = new brw_reg[num_sources];
173 for (unsigned i = 0; i < this->sources; i++)
174 new_src[i] = old_src[i];
175
176 } else {
177 new_src = old_src;
178 }
179 } else {
180 if (num_sources <= builtin_size) {
181 new_src = this->builtin_src;
182 assert(this->sources > num_sources);
183 for (unsigned i = 0; i < num_sources; i++)
184 new_src[i] = old_src[i];
185
186 } else if (num_sources < this->sources) {
187 new_src = old_src;
188
189 } else {
190 new_src = new brw_reg[num_sources];
191 for (unsigned i = 0; i < num_sources; i++)
192 new_src[i] = old_src[i];
193 }
194
195 if (old_src != new_src)
196 delete[] old_src;
197 }
198
199 this->sources = num_sources;
200 this->src = new_src;
201 }
202
203 bool
204 fs_inst::is_send_from_grf() const
205 {
206 switch (opcode) {
207 case SHADER_OPCODE_SEND:
208 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
209 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
210 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
211 case SHADER_OPCODE_INTERLOCK:
212 case SHADER_OPCODE_MEMORY_FENCE:
213 case SHADER_OPCODE_BARRIER:
214 return true;
215 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
216 return src[1].file == VGRF;
217 default:
218 return false;
219 }
220 }
221
222 bool
223 fs_inst::is_control_source(unsigned arg) const
224 {
225 switch (opcode) {
226 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
227 return arg == 0;
228
229 case SHADER_OPCODE_BROADCAST:
230 case SHADER_OPCODE_SHUFFLE:
231 case SHADER_OPCODE_QUAD_SWIZZLE:
232 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
233 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
234 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
235 return arg == 1;
236
237 case SHADER_OPCODE_MOV_INDIRECT:
238 case SHADER_OPCODE_CLUSTER_BROADCAST:
239 return arg == 1 || arg == 2;
240
241 case SHADER_OPCODE_SEND:
242 return arg == 0 || arg == 1;
243
244 case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
245 case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
246 case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
247 return arg != MEMORY_LOGICAL_BINDING &&
248 arg != MEMORY_LOGICAL_ADDRESS &&
249 arg != MEMORY_LOGICAL_DATA0 &&
250 arg != MEMORY_LOGICAL_DATA1;
251
252 default:
253 return false;
254 }
255 }
256
257 bool
258 fs_inst::is_payload(unsigned arg) const
259 {
260 switch (opcode) {
261 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
262 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
263 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
264 case SHADER_OPCODE_INTERLOCK:
265 case SHADER_OPCODE_MEMORY_FENCE:
266 case SHADER_OPCODE_BARRIER:
267 return arg == 0;
268
269 case SHADER_OPCODE_SEND:
270 return arg == 2 || arg == 3;
271
272 default:
273 return false;
274 }
275 }
276
277 /**
278 * Returns true if this instruction's sources and destinations cannot
279 * safely be the same register.
280 *
281 * In most cases, a register can be written over safely by the same
282 * instruction that is its last use. For a single instruction, the
283 * sources are dereferenced before writing of the destination starts
284 * (naturally).
285 *
286 * However, there are a few cases where this can be problematic:
287 *
288 * - Virtual opcodes that translate to multiple instructions in the
289 * code generator: if src == dst and one instruction writes the
290 * destination before a later instruction reads the source, then
291 * src will have been clobbered.
292 *
293 * - SIMD16 compressed instructions with certain regioning (see below).
294 *
295 * The register allocator uses this information to set up conflicts between
296 * GRF sources and the destination.
297 */
298 bool
299 fs_inst::has_source_and_destination_hazard() const
300 {
301 switch (opcode) {
302 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
303 /* Multiple partial writes to the destination */
304 return true;
305 case SHADER_OPCODE_SHUFFLE:
306 /* This instruction returns an arbitrary channel from the source and
307 * gets split into smaller instructions in the generator. It's possible
308 * that one of the instructions will read from a channel corresponding
309 * to an earlier instruction.
310 */
311 case SHADER_OPCODE_SEL_EXEC:
312 /* This is implemented as
313 *
314 * mov(16) g4<1>D 0D { align1 WE_all 1H };
315 * mov(16) g4<1>D g5<8,8,1>D { align1 1H }
316 *
317 * Because the source is only read in the second instruction, the first
318 * may stomp all over it.
319 */
320 return true;
321 case SHADER_OPCODE_QUAD_SWIZZLE:
322 switch (src[1].ud) {
323 case BRW_SWIZZLE_XXXX:
324 case BRW_SWIZZLE_YYYY:
325 case BRW_SWIZZLE_ZZZZ:
326 case BRW_SWIZZLE_WWWW:
327 case BRW_SWIZZLE_XXZZ:
328 case BRW_SWIZZLE_YYWW:
329 case BRW_SWIZZLE_XYXY:
330 case BRW_SWIZZLE_ZWZW:
331 /* These can be implemented as a single Align1 region on all
332 * platforms, so there's never a hazard between source and
333 * destination. C.f. fs_generator::generate_quad_swizzle().
334 */
335 return false;
336 default:
337 return !is_uniform(src[0]);
338 }
339 case BRW_OPCODE_DPAS:
340 /* This is overly conservative. The actual hazard is more complicated to
341 * describe. When the repeat count is N, the single instruction behaves
342 * like N instructions with a repeat count of one, but the destination
343 * and source registers are incremented (in somewhat complex ways) for
344 * each instruction.
345 *
346     * This means the source and destination registers are actually ranges of
347     * registers. The hazard exists if an earlier iteration would write a
348     * register that should be read by a later iteration.
349 *
350 * There may be some advantage to properly modeling this, but for now,
351 * be overly conservative.
352 */
353 return rcount > 1;
354 default:
355 /* The SIMD16 compressed instruction
356 *
357 * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
358 *
359 * is actually decoded in hardware as:
360 *
361 * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
362 * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
363 *
364 * Which is safe. However, if we have uniform accesses
365 * happening, we get into trouble:
366 *
367 * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
368 * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
369 *
370 * Now our destination for the first instruction overwrote the
371 * second instruction's src0, and we get garbage for those 8
372 * pixels. There's a similar issue for the pre-gfx6
373 * pixel_x/pixel_y, which are registers of 16-bit values and thus
374 * would get stomped by the first decode as well.
375 */
376 if (exec_size == 16) {
377 for (int i = 0; i < sources; i++) {
378 if (src[i].file == VGRF && (src[i].stride == 0 ||
379 src[i].type == BRW_TYPE_UW ||
380 src[i].type == BRW_TYPE_W ||
381 src[i].type == BRW_TYPE_UB ||
382 src[i].type == BRW_TYPE_B)) {
383 return true;
384 }
385 }
386 }
387 return false;
388 }
389 }
390
391 bool
392 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
393 {
394 if (is_send_from_grf())
395 return false;
396
397 /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
398 *
399 * "When multiplying a DW and any lower precision integer, source modifier
400 * is not supported."
401 */
402 if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
403 opcode == BRW_OPCODE_MAD)) {
404 const brw_reg_type exec_type = get_exec_type(this);
405 const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
406 MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
407 MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));
408
409 if (brw_type_is_int(exec_type) &&
410 brw_type_size_bytes(exec_type) >= 4 &&
411 brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
412 return false;
413 }
414
415 switch (opcode) {
416 case BRW_OPCODE_ADDC:
417 case BRW_OPCODE_BFE:
418 case BRW_OPCODE_BFI1:
419 case BRW_OPCODE_BFI2:
420 case BRW_OPCODE_BFREV:
421 case BRW_OPCODE_CBIT:
422 case BRW_OPCODE_FBH:
423 case BRW_OPCODE_FBL:
424 case BRW_OPCODE_ROL:
425 case BRW_OPCODE_ROR:
426 case BRW_OPCODE_SUBB:
427 case BRW_OPCODE_DP4A:
428 case BRW_OPCODE_DPAS:
429 case SHADER_OPCODE_BROADCAST:
430 case SHADER_OPCODE_CLUSTER_BROADCAST:
431 case SHADER_OPCODE_MOV_INDIRECT:
432 case SHADER_OPCODE_SHUFFLE:
433 case SHADER_OPCODE_INT_QUOTIENT:
434 case SHADER_OPCODE_INT_REMAINDER:
435 return false;
436 default:
437 return true;
438 }
439 }
440
441 bool
442 fs_inst::can_do_cmod() const
443 {
444 switch (opcode) {
445 case BRW_OPCODE_ADD:
446 case BRW_OPCODE_ADD3:
447 case BRW_OPCODE_ADDC:
448 case BRW_OPCODE_AND:
449 case BRW_OPCODE_ASR:
450 case BRW_OPCODE_AVG:
451 case BRW_OPCODE_CMP:
452 case BRW_OPCODE_CMPN:
453 case BRW_OPCODE_DP2:
454 case BRW_OPCODE_DP3:
455 case BRW_OPCODE_DP4:
456 case BRW_OPCODE_DPH:
457 case BRW_OPCODE_FRC:
458 case BRW_OPCODE_LINE:
459 case BRW_OPCODE_LRP:
460 case BRW_OPCODE_LZD:
461 case BRW_OPCODE_MAC:
462 case BRW_OPCODE_MACH:
463 case BRW_OPCODE_MAD:
464 case BRW_OPCODE_MOV:
465 case BRW_OPCODE_MUL:
466 case BRW_OPCODE_NOT:
467 case BRW_OPCODE_OR:
468 case BRW_OPCODE_PLN:
469 case BRW_OPCODE_RNDD:
470 case BRW_OPCODE_RNDE:
471 case BRW_OPCODE_RNDU:
472 case BRW_OPCODE_RNDZ:
473 case BRW_OPCODE_SHL:
474 case BRW_OPCODE_SHR:
475 case BRW_OPCODE_SUBB:
476 case BRW_OPCODE_XOR:
477 break;
478 default:
479 return false;
480 }
481
482 /* The accumulator result appears to get used for the conditional modifier
483 * generation. When negating a UD value, there is a 33rd bit generated for
484 * the sign in the accumulator value, so now you can't check, for example,
485 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
486 */
487 for (unsigned i = 0; i < sources; i++) {
488 if (brw_type_is_uint(src[i].type) && src[i].negate)
489 return false;
490 }
491
492 return true;
493 }
494
495 bool
496 fs_inst::can_change_types() const
497 {
498 return dst.type == src[0].type &&
499 !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
500 (opcode == BRW_OPCODE_MOV ||
501 (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
502 (opcode == BRW_OPCODE_SEL &&
503 dst.type == src[1].type &&
504 predicate != BRW_PREDICATE_NONE &&
505 !src[1].abs && !src[1].negate && src[1].file != ATTR));
506 }
507
508 bool
509 brw_reg::equals(const brw_reg &r) const
510 {
511 return brw_regs_equal(this, &r);
512 }
513
514 bool
515 brw_reg::negative_equals(const brw_reg &r) const
516 {
517 return brw_regs_negative_equal(this, &r);
518 }
519
520 bool
521 brw_reg::is_contiguous() const
522 {
523 switch (file) {
524 case ARF:
525 case FIXED_GRF:
526 return hstride == BRW_HORIZONTAL_STRIDE_1 &&
527 vstride == width + hstride;
528 case VGRF:
529 case ATTR:
530 return stride == 1;
531 case UNIFORM:
532 case IMM:
533 case BAD_FILE:
534 return true;
535 }
536
537 unreachable("Invalid register file");
538 }
539
540 unsigned
541 brw_reg::component_size(unsigned width) const
542 {
543 if (file == ARF || file == FIXED_GRF) {
544 const unsigned w = MIN2(width, 1u << this->width);
545 const unsigned h = width >> this->width;
546 const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
547 const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
548 assert(w > 0);
549       /* Note this rounds up to the next horizontal stride to be consistent with
550 * the VGRF case below.
551 */
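      /* Rough worked example: a SIMD16 read of an <8;8,1>:F fixed GRF region
       * decodes to w = 8, h = 2, vs = 8, hs = 1, giving
       * ((2 - 1) * 8 + 8 * 1) * 4 = 64 bytes, i.e. two 32-byte GRFs.
       */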
552 return ((MAX2(1, h) - 1) * vs + MAX2(w * hs, 1)) * brw_type_size_bytes(type);
553 } else {
554 return MAX2(width * stride, 1) * brw_type_size_bytes(type);
555 }
556 }
557
558 void
559 fs_visitor::vfail(const char *format, va_list va)
560 {
561 char *msg;
562
563 if (failed)
564 return;
565
566 failed = true;
567
568 msg = ralloc_vasprintf(mem_ctx, format, va);
569 msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
570 dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
571
572 this->fail_msg = msg;
573
574 if (unlikely(debug_enabled)) {
575 fprintf(stderr, "%s", msg);
576 }
577 }
578
579 void
580 fs_visitor::fail(const char *format, ...)
581 {
582 va_list va;
583
584 va_start(va, format);
585 vfail(format, va);
586 va_end(va);
587 }
588
589 /**
590 * Mark this program as impossible to compile with dispatch width greater
591 * than n.
592 *
593 * During the SIMD8 compile (which happens first), we can detect and flag
594 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
595 * SIMD16+ compile altogether.
596 *
597 * During a compile of dispatch width greater than n (if one happens anyway),
598 * this just calls fail().
599 */
600 void
601 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
602 {
603 if (dispatch_width > n) {
604 fail("%s", msg);
605 } else {
606 max_dispatch_width = MIN2(max_dispatch_width, n);
607 brw_shader_perf_log(compiler, log_data,
608 "Shader dispatch width limited to SIMD%d: %s\n",
609 n, msg);
610 }
611 }
612
613 /**
614 * Returns true if the instruction has a flag that means it won't
615 * update an entire destination register.
616 *
617 * For example, dead code elimination and live variable analysis want to know
618 * when a write to a variable screens off any preceding values that were in
619 * it.
620 */
621 bool
622 fs_inst::is_partial_write() const
623 {
624 if (this->predicate && !this->predicate_trivial &&
625 this->opcode != BRW_OPCODE_SEL)
626 return true;
627
628 if (!this->dst.is_contiguous())
629 return true;
630
631 if (this->dst.offset % REG_SIZE != 0)
632 return true;
633
634 return this->size_written % REG_SIZE != 0;
635 }
636
637 unsigned
638 fs_inst::components_read(unsigned i) const
639 {
640 /* Return zero if the source is not present. */
641 if (src[i].file == BAD_FILE)
642 return 0;
643
644 switch (opcode) {
645 case BRW_OPCODE_PLN:
646 return i == 0 ? 1 : 2;
647
648 case FS_OPCODE_PIXEL_X:
649 case FS_OPCODE_PIXEL_Y:
650 assert(i < 2);
651 if (i == 0)
652 return 2;
653 else
654 return 1;
655
656 case FS_OPCODE_FB_WRITE_LOGICAL:
657 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
658 /* First/second FB write color. */
659 if (i < 2)
660 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
661 else
662 return 1;
663
664 case SHADER_OPCODE_TEX_LOGICAL:
665 case SHADER_OPCODE_TXD_LOGICAL:
666 case SHADER_OPCODE_TXF_LOGICAL:
667 case SHADER_OPCODE_TXL_LOGICAL:
668 case SHADER_OPCODE_TXS_LOGICAL:
669 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
670 case FS_OPCODE_TXB_LOGICAL:
671 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
672 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
673 case SHADER_OPCODE_TXF_MCS_LOGICAL:
674 case SHADER_OPCODE_LOD_LOGICAL:
675 case SHADER_OPCODE_TG4_LOGICAL:
676 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
677 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
678 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
679 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
680 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
681 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
682 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
683 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
684 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
685 src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
686 /* Texture coordinates. */
687 if (i == TEX_LOGICAL_SRC_COORDINATE)
688 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
689 /* Texture derivatives. */
690 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
691 opcode == SHADER_OPCODE_TXD_LOGICAL)
692 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
693 /* Texture offset. */
694 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
695 return 2;
696 /* MCS */
697 else if (i == TEX_LOGICAL_SRC_MCS) {
698 if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
699 return 2;
700 else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
701 return 4;
702 else
703 return 1;
704 } else
705 return 1;
706
707 case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
708       if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
709 return 0;
710 /* fallthrough */
711 case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
712 if (i == MEMORY_LOGICAL_DATA1)
713 return 0;
714 /* fallthrough */
715 case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
716 if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
717 return src[MEMORY_LOGICAL_COMPONENTS].ud;
718 else if (i == MEMORY_LOGICAL_ADDRESS)
719 return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
720 else
721 return 1;
722
723 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
724 return (i == 0 ? 2 : 1);
725
726 case SHADER_OPCODE_URB_WRITE_LOGICAL:
727 assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
728
729 if (i == URB_LOGICAL_SRC_DATA)
730 return src[URB_LOGICAL_SRC_COMPONENTS].ud;
731 else
732 return 1;
733
734 case BRW_OPCODE_DPAS:
735 unreachable("Do not use components_read() for DPAS.");
736
737 default:
738 return 1;
739 }
740 }
741
742 unsigned
743 fs_inst::size_read(int arg) const
744 {
745 switch (opcode) {
746 case SHADER_OPCODE_SEND:
747 if (arg == 2) {
748 return mlen * REG_SIZE;
749 } else if (arg == 3) {
750 return ex_mlen * REG_SIZE;
751 }
752 break;
753
754 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
755 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
756 if (arg == 0)
757 return mlen * REG_SIZE;
758 break;
759
760 case BRW_OPCODE_PLN:
761 if (arg == 0)
762 return 16;
763 break;
764
765 case SHADER_OPCODE_LOAD_PAYLOAD:
766 if (arg < this->header_size)
767 return retype(src[arg], BRW_TYPE_UD).component_size(8);
768 break;
769
770 case SHADER_OPCODE_BARRIER:
771 return REG_SIZE;
772
773 case SHADER_OPCODE_MOV_INDIRECT:
774 if (arg == 0) {
775 assert(src[2].file == IMM);
776 return src[2].ud;
777 }
778 break;
779
780 case BRW_OPCODE_DPAS: {
781 /* This is a little bit sketchy. There's no way to get at devinfo from
782 * here, so the regular reg_unit() cannot be used. However, on
783 * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
784 * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
785 * coincidence, so this isn't so bad.
786 */
787 const unsigned reg_unit = this->exec_size / 8;
788
789 switch (arg) {
790 case 0:
791 if (src[0].type == BRW_TYPE_HF) {
792 return rcount * reg_unit * REG_SIZE / 2;
793 } else {
794 return rcount * reg_unit * REG_SIZE;
795 }
796 case 1:
797 return sdepth * reg_unit * REG_SIZE;
798 case 2:
799 /* This is simpler than the formula described in the Bspec, but it
800 * covers all of the cases that we support. Each inner sdepth
801 * iteration of the DPAS consumes a single dword for int8, uint8, or
802          * float16 types. These are the only source types currently
803          * supportable through Vulkan. This is independent of reg_unit.
804 */
805 return rcount * sdepth * 4;
806 default:
807 unreachable("Invalid source number.");
808 }
809 break;
810 }
811
812 default:
813 break;
814 }
815
816 switch (src[arg].file) {
817 case UNIFORM:
818 case IMM:
819 return components_read(arg) * brw_type_size_bytes(src[arg].type);
820 case BAD_FILE:
821 case ARF:
822 case FIXED_GRF:
823 case VGRF:
824 case ATTR:
825 return components_read(arg) * src[arg].component_size(exec_size);
826 }
827 return 0;
828 }
829
830 namespace {
831 unsigned
832 predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
833 {
834 if (devinfo->ver >= 20) {
835 return 1;
836 } else {
837 switch (predicate) {
838 case BRW_PREDICATE_NONE: return 1;
839 case BRW_PREDICATE_NORMAL: return 1;
840 case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
841 case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
842 case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
843 case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
844 case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
845 case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
846 case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
847 case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
848 case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
849 case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
850 default: unreachable("Unsupported predicate");
851 }
852 }
853 }
854 }
855
856 unsigned
857 fs_inst::flags_read(const intel_device_info *devinfo) const
858 {
859 if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
860 predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
861 /* The vertical predication modes combine corresponding bits from
862 * f0.0 and f1.0 on Gfx7+.
863 */
864 const unsigned shift = 4;
865 return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
866 } else if (predicate) {
867 return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
868 } else {
869 unsigned mask = 0;
870 for (int i = 0; i < sources; i++) {
871 mask |= brw_fs_flag_mask(src[i], size_read(i));
872 }
873 return mask;
874 }
875 }
876
877 unsigned
878 fs_inst::flags_written(const intel_device_info *devinfo) const
879 {
880 if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
881 opcode != BRW_OPCODE_CSEL &&
882 opcode != BRW_OPCODE_IF &&
883 opcode != BRW_OPCODE_WHILE)) {
884 return brw_fs_flag_mask(this, 1);
885 } else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
886 return brw_fs_flag_mask(this, 32);
887 } else {
888 return brw_fs_flag_mask(dst, size_written);
889 }
890 }
891
892 bool
893 fs_inst::has_sampler_residency() const
894 {
895 switch (opcode) {
896 case SHADER_OPCODE_TEX_LOGICAL:
897 case FS_OPCODE_TXB_LOGICAL:
898 case SHADER_OPCODE_TXL_LOGICAL:
899 case SHADER_OPCODE_TXD_LOGICAL:
900 case SHADER_OPCODE_TXF_LOGICAL:
901 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
902 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
903 case SHADER_OPCODE_TXS_LOGICAL:
904 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
905 case SHADER_OPCODE_TG4_LOGICAL:
906 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
907 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
908 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
909 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
910 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
911 assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
912 return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
913 default:
914 return false;
915 }
916 }
917
918 /* \sa inst_is_raw_move in brw_eu_validate. */
919 bool
920 fs_inst::is_raw_move() const
921 {
922 if (opcode != BRW_OPCODE_MOV)
923 return false;
924
925 if (src[0].file == IMM) {
926 if (brw_type_is_vector_imm(src[0].type))
927 return false;
928 } else if (src[0].negate || src[0].abs) {
929 return false;
930 }
931
932 if (saturate)
933 return false;
934
935 return src[0].type == dst.type ||
936 (brw_type_is_int(src[0].type) &&
937 brw_type_is_int(dst.type) &&
938 brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
939 }
940
941 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
942  * This brings in those uniform definitions.
943 */
944 void
945 fs_visitor::import_uniforms(fs_visitor *v)
946 {
947 this->push_constant_loc = v->push_constant_loc;
948 this->uniforms = v->uniforms;
949 }
950
951 enum brw_barycentric_mode
952 brw_barycentric_mode(const struct brw_wm_prog_key *key,
953 nir_intrinsic_instr *intr)
954 {
955 const glsl_interp_mode mode =
956 (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
957
958 /* Barycentric modes don't make sense for flat inputs. */
959 assert(mode != INTERP_MODE_FLAT);
960
961 unsigned bary;
962 switch (intr->intrinsic) {
963 case nir_intrinsic_load_barycentric_pixel:
964 case nir_intrinsic_load_barycentric_at_offset:
965 /* When per sample interpolation is dynamic, assume sample
966 * interpolation. We'll dynamically remap things so that the FS thread
967 * payload is not affected.
968 */
969 bary = key->persample_interp == BRW_SOMETIMES ?
970 BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE :
971 BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
972 break;
973 case nir_intrinsic_load_barycentric_centroid:
974 bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
975 break;
976 case nir_intrinsic_load_barycentric_sample:
977 case nir_intrinsic_load_barycentric_at_sample:
978 bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
979 break;
980 default:
981 unreachable("invalid intrinsic");
982 }
983
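   /* The nonperspective barycentric modes follow their perspective
    * counterparts by 3 in enum brw_barycentric_mode, so adding 3 converts a
    * perspective mode into the matching nonperspective one.
    */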
984 if (mode == INTERP_MODE_NOPERSPECTIVE)
985 bary += 3;
986
987 return (enum brw_barycentric_mode) bary;
988 }
989
990 /**
991 * Walk backwards from the end of the program looking for a URB write that
992 * isn't in control flow, and mark it with EOT.
993 *
994 * Return true if successful or false if a separate EOT write is needed.
995 */
996 bool
997 fs_visitor::mark_last_urb_write_with_eot()
998 {
999 foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1000 if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
1001 prev->eot = true;
1002
1003 /* Delete now dead instructions. */
1004 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1005 if (dead == prev)
1006 break;
1007 dead->remove();
1008 }
1009 return true;
1010 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1011 break;
1012 }
1013 }
1014
1015 return false;
1016 }
1017
1018 static unsigned
1019 round_components_to_whole_registers(const intel_device_info *devinfo,
1020 unsigned c)
1021 {
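   /* Illustrative example: 20 dword components round up to 3 register units
    * when reg_unit(devinfo) == 1, but to 4 (i.e. two whole 64-byte registers)
    * when reg_unit(devinfo) == 2, since whole native registers are allocated.
    */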
1022 return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
1023 }
1024
1025 void
1026 fs_visitor::assign_curb_setup()
1027 {
1028 unsigned uniform_push_length =
1029 round_components_to_whole_registers(devinfo, prog_data->nr_params);
1030
1031 unsigned ubo_push_length = 0;
1032 unsigned ubo_push_start[4];
1033 for (int i = 0; i < 4; i++) {
1034 ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1035 ubo_push_length += prog_data->ubo_ranges[i].length;
1036
1037 assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
1038 assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
1039 }
1040
1041 prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1042 if (stage == MESA_SHADER_FRAGMENT &&
1043 ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
1044 prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
1045
1046 uint64_t used = 0;
1047 bool is_compute = gl_shader_stage_is_compute(stage);
1048
1049 if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
1050 /* With COMPUTE_WALKER, we can push up to one register worth of data via
1051 * the inline data parameter in the COMPUTE_WALKER command itself.
1052 *
1053 * TODO: Support inline data and push at the same time.
1054 */
1055 assert(devinfo->verx10 >= 125);
1056 assert(uniform_push_length <= reg_unit(devinfo));
1057 } else if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
1058 assert(devinfo->has_lsc);
1059 fs_builder ubld = fs_builder(this, 1).exec_all().at(
1060 cfg->first_block(), cfg->first_block()->start());
1061
1062 /* The base offset for our push data is passed in as R0.0[31:6]. We have
1063 * to mask off the bottom 6 bits.
1064 */
1065 brw_reg base_addr =
1066 ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
1067 brw_imm_ud(INTEL_MASK(31, 6)));
1068
1069 /* On Gfx12-HP we load constants at the start of the program using A32
1070 * stateless messages.
1071 */
1072 for (unsigned i = 0; i < uniform_push_length;) {
1073          /* Limit ourselves to the LSC HW limit of 8 GRFs (256 bytes, D32V64). */
1074 unsigned num_regs = MIN2(uniform_push_length - i, 8);
1075 assert(num_regs > 0);
1076 num_regs = 1 << util_logbase2(num_regs);
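         /* Note that util_logbase2() rounds down, so a non-power-of-two
          * remainder (e.g. 5 registers) is fetched as power-of-two sized
          * loads (4, then 1) over successive loop iterations, presumably to
          * keep each message at a size the transposed LSC load supports.
          */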
1077
1078 /* This pass occurs after all of the optimization passes, so don't
1079 * emit an 'ADD addr, base_addr, 0' instruction.
1080 */
1081 brw_reg addr = i == 0 ? base_addr :
1082 ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
1083
1084 brw_reg srcs[4] = {
1085 brw_imm_ud(0), /* desc */
1086 brw_imm_ud(0), /* ex_desc */
1087 addr, /* payload */
1088 brw_reg(), /* payload2 */
1089 };
1090
1091 brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
1092 BRW_TYPE_UD);
1093 fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
1094
1095 send->sfid = GFX12_SFID_UGM;
1096 send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1097 LSC_ADDR_SURFTYPE_FLAT,
1098 LSC_ADDR_SIZE_A32,
1099 LSC_DATA_SIZE_D32,
1100 num_regs * 8 /* num_channels */,
1101 true /* transpose */,
1102 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1103 send->header_size = 0;
1104 send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
1105 send->size_written =
1106 lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
1107 send->send_is_volatile = true;
1108
1109 i += num_regs;
1110 }
1111
1112 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1113 }
1114
1115 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1116 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1117 for (unsigned int i = 0; i < inst->sources; i++) {
1118 if (inst->src[i].file == UNIFORM) {
1119 int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1120 int constant_nr;
1121 if (inst->src[i].nr >= UBO_START) {
1122 /* constant_nr is in 32-bit units, the rest are in bytes */
1123 constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1124 inst->src[i].offset / 4;
1125 } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1126 constant_nr = push_constant_loc[uniform_nr];
1127 } else {
1128 /* Section 5.11 of the OpenGL 4.1 spec says:
1129 * "Out-of-bounds reads return undefined values, which include
1130 * values from other variables of the active program or zero."
1131 * Just return the first push constant.
1132 */
1133 constant_nr = 0;
1134 }
1135
1136 assert(constant_nr / 8 < 64);
1137 used |= BITFIELD64_BIT(constant_nr / 8);
1138
1139 struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
1140 constant_nr / 8,
1141 constant_nr % 8);
1142 brw_reg.abs = inst->src[i].abs;
1143 brw_reg.negate = inst->src[i].negate;
1144
1145 assert(inst->src[i].stride == 0);
1146 inst->src[i] = byte_offset(
1147 retype(brw_reg, inst->src[i].type),
1148 inst->src[i].offset % 4);
1149 }
1150 }
1151 }
1152
1153 uint64_t want_zero = used & prog_data->zero_push_reg;
1154 if (want_zero) {
1155 fs_builder ubld = fs_builder(this, 8).exec_all().at(
1156 cfg->first_block(), cfg->first_block()->start());
1157
1158 /* push_reg_mask_param is in 32-bit units */
1159 unsigned mask_param = prog_data->push_reg_mask_param;
1160 struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
1161 mask_param % 8);
1162
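      /* Sketch of the trick below: each bit of the 16-bit mask word is
       * shifted into the sign bit of a 16-bit lane and then arithmetically
       * shifted right by 15, replicating it into a full dword of all-ones or
       * all-zeros that can be ANDed with an entire push register.
       */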
1163 brw_reg b32;
1164 for (unsigned i = 0; i < 64; i++) {
1165 if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1166 brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
1167 ubld.SHL(horiz_offset(shifted, 8),
1168 byte_offset(retype(mask, BRW_TYPE_W), i / 8),
1169 brw_imm_v(0x01234567));
1170 ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1171
1172 fs_builder ubld16 = ubld.group(16, 0);
1173 b32 = ubld16.vgrf(BRW_TYPE_D);
1174 ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1175 }
1176
1177 if (want_zero & BITFIELD64_BIT(i)) {
1178 assert(i < prog_data->curb_read_length);
1179 struct brw_reg push_reg =
1180 retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
1181
1182 ubld.AND(push_reg, push_reg, component(b32, i % 16));
1183 }
1184 }
1185
1186 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1187 }
1188
1189 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1190 this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1191 }
1192
1193 /*
1194  * Build up an array of indices referencing the active entries of the
1195  * urb_setup array.  Used to accelerate walking those active entries on
1196  * each upload.
1198 */
1199 void
1200 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1201 {
1202 /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1203 * skip per-primitive attributes here.
1204 */
1205
1206 /* Make sure uint8_t is sufficient */
1207 STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1208 uint8_t index = 0;
1209 for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1210 if (wm_prog_data->urb_setup[attr] >= 0) {
1211 wm_prog_data->urb_setup_attribs[index++] = attr;
1212 }
1213 }
1214 wm_prog_data->urb_setup_attribs_count = index;
1215 }
1216
1217 void
1218 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1219 {
1220 for (int i = 0; i < inst->sources; i++) {
1221 if (inst->src[i].file == ATTR) {
1222 assert(inst->src[i].nr == 0);
1223 int grf = payload().num_regs +
1224 prog_data->curb_read_length +
1225 inst->src[i].offset / REG_SIZE;
1226
1227          /* As explained at brw_reg_from_fs_reg, from the Haswell PRM:
1228 *
1229 * VertStride must be used to cross GRF register boundaries. This
1230 * rule implies that elements within a 'Width' cannot cross GRF
1231 * boundaries.
1232 *
1233 * So, for registers that are large enough, we have to split the exec
1234 * size in two and trust the compression state to sort it out.
1235 */
1236 unsigned total_size = inst->exec_size *
1237 inst->src[i].stride *
1238 brw_type_size_bytes(inst->src[i].type);
1239
1240 assert(total_size <= 2 * REG_SIZE);
1241 const unsigned exec_size =
1242 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1243
1244 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1245 struct brw_reg reg =
1246 stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1247 inst->src[i].offset % REG_SIZE),
1248 exec_size * inst->src[i].stride,
1249 width, inst->src[i].stride);
1250 reg.abs = inst->src[i].abs;
1251 reg.negate = inst->src[i].negate;
1252
1253 inst->src[i] = reg;
1254 }
1255 }
1256 }
1257
1258 int
1259 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1260 const brw_stage_prog_data *prog_data)
1261 {
1262 if (prog_data->nr_params == 0)
1263 return -1;
1264
1265 if (devinfo->verx10 >= 125)
1266 return -1;
1267
1268 /* The local thread id is always the last parameter in the list */
1269 uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1270 if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1271 return prog_data->nr_params - 1;
1272
1273 return -1;
1274 }
1275
1276 /**
1277 * Assign UNIFORM file registers to either push constants or pull constants.
1278 *
1279 * We allow a fragment shader to have more than the specified minimum
1280 * maximum number of fragment shader uniform components (64). If
1281 * there are too many of these, they'd fill up all of register space.
1282 * So, this will push some of them out to the pull constant buffer and
1283 * update the program to load them.
1284 */
1285 void
1286 fs_visitor::assign_constant_locations()
1287 {
1288 /* Only the first compile gets to decide on locations. */
1289 if (push_constant_loc)
1290 return;
1291
1292 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1293 for (unsigned u = 0; u < uniforms; u++)
1294 push_constant_loc[u] = u;
1295
1296 /* Now that we know how many regular uniforms we'll push, reduce the
1297 * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1298 *
1299 * If changing this value, note the limitation about total_regs in
1300 * brw_curbe.c/crocus_state.c
1301 */
1302 const unsigned max_push_length = 64;
1303 unsigned push_length =
1304 round_components_to_whole_registers(devinfo, prog_data->nr_params);
1305 for (int i = 0; i < 4; i++) {
1306 struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
1307
1308 if (push_length + range->length > max_push_length)
1309 range->length = max_push_length - push_length;
1310
1311 push_length += range->length;
1312
1313 assert(push_length % (1 * reg_unit(devinfo)) == 0);
1314
1315 }
1316 assert(push_length <= max_push_length);
1317 }
1318
1319 bool
1320 fs_visitor::get_pull_locs(const brw_reg &src,
1321 unsigned *out_surf_index,
1322 unsigned *out_pull_index)
1323 {
1324 assert(src.file == UNIFORM);
1325
1326 if (src.nr < UBO_START)
1327 return false;
1328
1329 const struct brw_ubo_range *range =
1330 &prog_data->ubo_ranges[src.nr - UBO_START];
1331
1332 /* If this access is in our (reduced) range, use the push data. */
1333 if (src.offset / 32 < range->length)
1334 return false;
1335
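   /* range->start is in 32-byte units (cf. the length check above), so scale
    * it to bytes, add the access's byte offset, and convert the result to a
    * dword pull index.
    */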
1336 *out_surf_index = range->block;
1337 *out_pull_index = (32 * range->start + src.offset) / 4;
1338
1339 prog_data->has_ubo_pull = true;
1340
1341 return true;
1342 }
1343
1344 /**
1345 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
1346 * by discard. Due to the layout of the sample mask in the fragment shader
1347 * thread payload, \p bld is required to have a dispatch_width() not greater
1348 * than 16 for fragment shaders.
1349 */
1350 brw_reg
1351 brw_sample_mask_reg(const fs_builder &bld)
1352 {
1353 const fs_visitor &s = *bld.shader;
1354
1355 if (s.stage != MESA_SHADER_FRAGMENT) {
1356 return brw_imm_ud(0xffffffff);
1357 } else if (s.devinfo->ver >= 20 ||
1358 brw_wm_prog_data(s.prog_data)->uses_kill) {
1359 return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
1360 } else {
1361 assert(bld.dispatch_width() <= 16);
1362 assert(s.devinfo->ver < 20);
1363 return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
1364 BRW_TYPE_UW);
1365 }
1366 }
1367
1368 uint32_t
1369 brw_fb_write_msg_control(const fs_inst *inst,
1370 const struct brw_wm_prog_data *prog_data)
1371 {
1372 uint32_t mctl;
1373
1374 if (prog_data->dual_src_blend) {
1375 assert(inst->exec_size < 32);
1376
1377 if (inst->group % 16 == 0)
1378 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
1379 else if (inst->group % 16 == 8)
1380 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
1381 else
1382 unreachable("Invalid dual-source FB write instruction group");
1383 } else {
1384 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
1385
1386 if (inst->exec_size == 16)
1387 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1388 else if (inst->exec_size == 8)
1389 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1390 else if (inst->exec_size == 32)
1391 mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
1392 else
1393 unreachable("Invalid FB write execution size");
1394 }
1395
1396 return mctl;
1397 }
1398
1399 /**
1400 * Predicate the specified instruction on the sample mask.
1401 */
1402 void
1403 brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
1404 {
1405 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1406 bld.group() == inst->group &&
1407 bld.dispatch_width() == inst->exec_size);
1408
1409 const fs_visitor &s = *bld.shader;
1410 const brw_reg sample_mask = brw_sample_mask_reg(bld);
1411 const unsigned subreg = sample_mask_flag_subreg(s);
1412
1413 if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
1414 assert(sample_mask.file == ARF &&
1415 sample_mask.nr == brw_flag_subreg(subreg).nr &&
1416 sample_mask.subnr == brw_flag_subreg(
1417 subreg + inst->group / 16).subnr);
1418 } else {
1419 bld.group(1, 0).exec_all()
1420 .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
1421 }
1422
1423 if (inst->predicate) {
1424 assert(inst->predicate == BRW_PREDICATE_NORMAL);
1425 assert(!inst->predicate_inverse);
1426 assert(inst->flag_subreg == 0);
1427 assert(s.devinfo->ver < 20);
1428 /* Combine the sample mask with the existing predicate by using a
1429 * vertical predication mode.
1430 */
1431 inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
1432 } else {
1433 inst->flag_subreg = subreg;
1434 inst->predicate = BRW_PREDICATE_NORMAL;
1435 inst->predicate_inverse = false;
1436 }
1437 }
1438
1439 brw::register_pressure::register_pressure(const fs_visitor *v)
1440 {
1441 const fs_live_variables &live = v->live_analysis.require();
1442 const unsigned num_instructions = v->cfg->num_blocks ?
1443 v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
1444
1445 regs_live_at_ip = new unsigned[num_instructions]();
1446
1447 for (unsigned reg = 0; reg < v->alloc.count; reg++) {
1448 for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
1449 regs_live_at_ip[ip] += v->alloc.sizes[reg];
1450 }
1451
1452 const unsigned payload_count = v->first_non_payload_grf;
1453
1454 int *payload_last_use_ip = new int[payload_count];
1455 v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
1456
1457 for (unsigned reg = 0; reg < payload_count; reg++) {
1458 for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
1459 ++regs_live_at_ip[ip];
1460 }
1461
1462 delete[] payload_last_use_ip;
1463 }
1464
1465 brw::register_pressure::~register_pressure()
1466 {
1467 delete[] regs_live_at_ip;
1468 }
1469
1470 void
1471 fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
1472 {
1473 live_analysis.invalidate(c);
1474 regpressure_analysis.invalidate(c);
1475 idom_analysis.invalidate(c);
1476 def_analysis.invalidate(c);
1477 }
1478
1479 void
1480 fs_visitor::debug_optimizer(const nir_shader *nir,
1481 const char *pass_name,
1482 int iteration, int pass_num) const
1483 {
1484 if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
1485 return;
1486
1487 char *filename;
1488 int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
1489 debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
1490 _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
1491 iteration, pass_num, pass_name);
1492 if (ret == -1)
1493 return;
1494 brw_print_instructions(*this, filename);
1495 free(filename);
1496 }
1497
1498 static uint32_t
1499 brw_compute_max_register_pressure(fs_visitor &s)
1500 {
1501 const register_pressure &rp = s.regpressure_analysis.require();
1502 uint32_t ip = 0, max_pressure = 0;
1503 foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1504 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
1505 ip++;
1506 }
1507 return max_pressure;
1508 }
1509
1510 static fs_inst **
1511 save_instruction_order(const struct cfg_t *cfg)
1512 {
1513 /* Before we schedule anything, stash off the instruction order as an array
1514 * of fs_inst *. This way, we can reset it between scheduling passes to
1515 * prevent dependencies between the different scheduling modes.
1516 */
1517 int num_insts = cfg->last_block()->end_ip + 1;
1518 fs_inst **inst_arr = new fs_inst * [num_insts];
1519
1520 int ip = 0;
1521 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1522 assert(ip >= block->start_ip && ip <= block->end_ip);
1523 inst_arr[ip++] = inst;
1524 }
1525 assert(ip == num_insts);
1526
1527 return inst_arr;
1528 }
1529
1530 static void
1531 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
1532 {
1533 ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
1534
1535 int ip = 0;
1536 foreach_block (block, cfg) {
1537 block->instructions.make_empty();
1538
1539 assert(ip == block->start_ip);
1540 for (; ip <= block->end_ip; ip++)
1541 block->instructions.push_tail(inst_arr[ip]);
1542 }
1543 assert(ip == num_insts);
1544 }
1545
1546 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
1547 static inline unsigned
1548 brw_get_scratch_size(int size)
1549 {
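   /* For example, a 1500-byte requirement rounds up to 2048 bytes, while
    * anything at or below 1 KB still gets the 1 KB minimum.
    */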
1550 return MAX2(1024, util_next_power_of_two(size));
1551 }
1552
1553 void
1554 brw_allocate_registers(fs_visitor &s, bool allow_spilling)
1555 {
1556 const struct intel_device_info *devinfo = s.devinfo;
1557 const nir_shader *nir = s.nir;
1558 bool allocated;
1559
1560 static const enum instruction_scheduler_mode pre_modes[] = {
1561 SCHEDULE_PRE,
1562 SCHEDULE_PRE_NON_LIFO,
1563 SCHEDULE_NONE,
1564 SCHEDULE_PRE_LIFO,
1565 };
1566
1567 static const char *scheduler_mode_name[] = {
1568 [SCHEDULE_PRE] = "top-down",
1569 [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
1570 [SCHEDULE_PRE_LIFO] = "lifo",
1571 [SCHEDULE_POST] = "post",
1572 [SCHEDULE_NONE] = "none",
1573 };
1574
1575 uint32_t best_register_pressure = UINT32_MAX;
1576 enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
1577
1578 brw_fs_opt_compact_virtual_grfs(s);
1579
1580 if (s.needs_register_pressure)
1581 s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
1582
1583 s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
1584
1585 bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
1586
1587 /* Before we schedule anything, stash off the instruction order as an array
1588 * of fs_inst *. This way, we can reset it between scheduling passes to
1589 * prevent dependencies between the different scheduling modes.
1590 */
1591 fs_inst **orig_order = save_instruction_order(s.cfg);
1592 fs_inst **best_pressure_order = NULL;
1593
1594 void *scheduler_ctx = ralloc_context(NULL);
1595 instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
1596
1597 /* Try each scheduling heuristic to see if it can successfully register
1598 * allocate without spilling. They should be ordered by decreasing
1599 * performance but increasing likelihood of allocating.
1600 */
1601 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
1602 enum instruction_scheduler_mode sched_mode = pre_modes[i];
1603
1604 brw_schedule_instructions_pre_ra(s, sched, sched_mode);
1605 s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
1606
1607 s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
1608
1609 if (0) {
1610 brw_assign_regs_trivial(s);
1611 allocated = true;
1612 break;
1613 }
1614
1615 /* We should only spill registers on the last scheduling. */
1616 assert(!s.spilled_any_registers);
1617
1618 allocated = brw_assign_regs(s, false, spill_all);
1619 if (allocated)
1620 break;
1621
1622 /* Save the maximum register pressure */
1623 uint32_t this_pressure = brw_compute_max_register_pressure(s);
1624
1625 if (0) {
1626 fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
1627 scheduler_mode_name[sched_mode], this_pressure);
1628 }
1629
1630 if (this_pressure < best_register_pressure) {
1631 best_register_pressure = this_pressure;
1632 best_sched = sched_mode;
1633 delete[] best_pressure_order;
1634 best_pressure_order = save_instruction_order(s.cfg);
1635 }
1636
1637 /* Reset back to the original order before trying the next mode */
1638 restore_instruction_order(s.cfg, orig_order);
1639 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1640 }
1641
1642 ralloc_free(scheduler_ctx);
1643
1644 if (!allocated) {
1645 if (0) {
1646 fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
1647 scheduler_mode_name[best_sched]);
1648 }
1649 restore_instruction_order(s.cfg, best_pressure_order);
1650 s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
1651
1652 allocated = brw_assign_regs(s, allow_spilling, spill_all);
1653 }
1654
1655 delete[] orig_order;
1656 delete[] best_pressure_order;
1657
1658 if (!allocated) {
1659 s.fail("Failure to register allocate. Reduce number of "
1660 "live scalar values to avoid this.");
1661 } else if (s.spilled_any_registers) {
1662 brw_shader_perf_log(s.compiler, s.log_data,
1663 "%s shader triggered register spilling. "
1664 "Try reducing the number of live scalar "
1665 "values to improve performance.\n",
1666 _mesa_shader_stage_to_string(s.stage));
1667 }
1668
1669 if (s.failed)
1670 return;
1671
1672 s.debug_optimizer(nir, "post_ra_alloc", 96, 0);
1673
1674 brw_fs_opt_bank_conflicts(s);
1675
1676 s.debug_optimizer(nir, "bank_conflict", 96, 1);
1677
1678 brw_schedule_instructions_post_ra(s);
1679
1680 s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
1681
1682 /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
1683 * of part of assign_regs since both bank conflicts optimization and post
1684 * RA scheduling take advantage of distinguishing references to registers
1685 * that were allocated from references that were already fixed.
1686 *
1687 * TODO: Change the passes above, then move this lowering to be part of
1688 * assign_regs.
1689 */
1690 brw_fs_lower_vgrfs_to_fixed_grfs(s);
1691
1692 s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
1693
1694 if (s.last_scratch > 0) {
1695 /* We currently only support up to 2MB of scratch space. If we
1696 * need to support more eventually, the documentation suggests
1697 * that we could allocate a larger buffer, and partition it out
1698 * ourselves. We'd just have to undo the hardware's address
1699 * calculation by subtracting (FFTID * Per Thread Scratch Space)
1700 * and then add FFTID * (Larger Per Thread Scratch Space).
1701 *
1702 * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
1703 * Thread Group Tracking > Local Memory/Scratch Space.
1704 */
1705 if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
1706 /* Take the max of any previously compiled variant of the shader. In the
1707 * case of bindless shaders with return parts, this will also take the
1708 * max of all parts.
1709 */
1710 s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
1711 s.prog_data->total_scratch);
1712 } else {
1713 s.fail("Scratch space required is larger than supported");
1714 }
1715 }
1716
1717 if (s.failed)
1718 return;
1719
1720 brw_fs_lower_scoreboard(s);
1721 }
1722
1723 /**
1724 * Move load_interpolated_input with simple (payload-based) barycentric modes
1725 * to the top of the program so we don't emit multiple PLNs for the same input.
1726 *
1727 * This works around CSE not being able to handle non-dominating cases
1728 * such as:
1729 *
1730 * if (...) {
1731 * interpolate input
1732 * } else {
1733 * interpolate the same exact input
1734 * }
1735 *
1736 * This should be replaced by global value numbering someday.
1737 */
1738 bool
1739 brw_nir_move_interpolation_to_top(nir_shader *nir)
1740 {
1741 bool progress = false;
1742
1743 nir_foreach_function_impl(impl, nir) {
1744 nir_block *top = nir_start_block(impl);
1745 nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
1746 bool impl_progress = false;
1747
1748 for (nir_block *block = nir_block_cf_tree_next(top);
1749 block != NULL;
1750 block = nir_block_cf_tree_next(block)) {
1751
1752 nir_foreach_instr_safe(instr, block) {
1753 if (instr->type != nir_instr_type_intrinsic)
1754 continue;
1755
1756 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1757 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
1758 continue;
1759 nir_intrinsic_instr *bary_intrinsic =
1760 nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
1761 nir_intrinsic_op op = bary_intrinsic->intrinsic;
1762
1763 /* Leave interpolateAtSample/Offset() where they are. */
1764 if (op == nir_intrinsic_load_barycentric_at_sample ||
1765 op == nir_intrinsic_load_barycentric_at_offset)
1766 continue;
1767
1768 nir_instr *move[3] = {
1769 &bary_intrinsic->instr,
1770 intrin->src[1].ssa->parent_instr,
1771 instr
1772 };
1773
1774 for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
1775 if (move[i]->block != top) {
1776 nir_instr_move(cursor, move[i]);
1777 impl_progress = true;
1778 }
1779 }
1780 }
1781 }
1782
1783 progress = progress || impl_progress;
1784
1785 nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
1786 : nir_metadata_all);
1787 }
1788
1789 return progress;
1790 }
1791
1792 unsigned
1793 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
1794 unsigned threads)
1795 {
1796 assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
1797 assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
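   /* For example, 32 bytes of per-thread data plus 64 bytes of cross-thread
    * data for 8 threads comes to 32 * 8 + 64 = 320 bytes of push constants.
    */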
1798 return cs_prog_data->push.per_thread.size * threads +
1799 cs_prog_data->push.cross_thread.size;
1800 }
1801
1802 static bool
1803 filter_simd(const nir_instr *instr, const void * /* options */)
1804 {
1805 if (instr->type != nir_instr_type_intrinsic)
1806 return false;
1807
1808 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
1809 case nir_intrinsic_load_simd_width_intel:
1810 case nir_intrinsic_load_subgroup_id:
1811 return true;
1812
1813 default:
1814 return false;
1815 }
1816 }
1817
1818 static nir_def *
1819 lower_simd(nir_builder *b, nir_instr *instr, void *options)
1820 {
1821 uintptr_t simd_width = (uintptr_t)options;
1822
1823 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
1824 case nir_intrinsic_load_simd_width_intel:
1825 return nir_imm_int(b, simd_width);
1826
1827 case nir_intrinsic_load_subgroup_id:
1828 /* If the whole workgroup fits in one thread, we can lower subgroup_id
1829 * to a constant zero.
1830 */
1831 if (!b->shader->info.workgroup_size_variable) {
1832 unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
1833 b->shader->info.workgroup_size[1] *
1834 b->shader->info.workgroup_size[2];
1835 if (local_workgroup_size <= simd_width)
1836 return nir_imm_int(b, 0);
1837 }
1838 return NULL;
1839
1840 default:
1841 return NULL;
1842 }
1843 }
1844
1845 bool
1846 brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
1847 {
1848 return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
1849 (void *)(uintptr_t)dispatch_width);
1850 }
1851
1852 struct intel_cs_dispatch_info
1853 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
1854 const struct brw_cs_prog_data *prog_data,
1855 const unsigned *override_local_size)
1856 {
1857 struct intel_cs_dispatch_info info = {};
1858
1859 const unsigned *sizes =
1860 override_local_size ? override_local_size :
1861 prog_data->local_size;
1862
1863 const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
1864 assert(simd >= 0 && simd < 3);
1865
1866 info.group_size = sizes[0] * sizes[1] * sizes[2];
1867 info.simd_size = 8u << simd;
1868 info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
1869
1870 const uint32_t remainder = info.group_size & (info.simd_size - 1);
1871 if (remainder > 0)
1872 info.right_mask = ~0u >> (32 - remainder);
1873 else
1874 info.right_mask = ~0u >> (32 - info.simd_size);
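   /* Example: a 50-invocation workgroup at SIMD16 launches 4 threads; the
    * last one covers only 50 % 16 = 2 invocations, so right_mask is 0x3.
    * When the group size is a multiple of the SIMD width, the mask covers
    * all channels.
    */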
1875
1876 return info;
1877 }
1878
1879 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
1880 {
1881 return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
1882 }
1883
1884 namespace brw {
1885 brw_reg
1886 fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
1887 brw_reg_type type, unsigned n)
1888 {
1889 if (!regs[0])
1890 return brw_reg();
1891
1892 if (bld.dispatch_width() > 16) {
1893 const brw_reg tmp = bld.vgrf(type, n);
1894 const brw::fs_builder hbld = bld.exec_all().group(16, 0);
1895 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1896 brw_reg *const components = new brw_reg[m * n];
1897
1898 for (unsigned c = 0; c < n; c++) {
1899 for (unsigned g = 0; g < m; g++)
1900 components[c * m + g] =
1901 offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
1902 }
1903
1904 hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
1905
1906 delete[] components;
1907 return tmp;
1908
1909 } else {
1910 return brw_reg(retype(brw_vec8_grf(regs[0], 0), type));
1911 }
1912 }
1913
1914 brw_reg
1915 fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
1916 {
1917 if (!regs[0])
1918 return brw_reg();
1919 else if (bld.shader->devinfo->ver >= 20)
1920 return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);
1921
1922 const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
1923 const brw::fs_builder hbld = bld.exec_all().group(8, 0);
1924 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1925 brw_reg *const components = new brw_reg[2 * m];
1926
1927 for (unsigned c = 0; c < 2; c++) {
1928 for (unsigned g = 0; g < m; g++)
1929 components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
1930 hbld, c + 2 * (g % 2));
1931 }
1932
1933 hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
1934
1935 delete[] components;
1936 return tmp;
1937 }
1938
1939 void
1940 check_dynamic_msaa_flag(const fs_builder &bld,
1941 const struct brw_wm_prog_data *wm_prog_data,
1942 enum intel_msaa_flags flag)
1943 {
1944 fs_inst *inst = bld.AND(bld.null_reg_ud(),
1945 dynamic_msaa_flags(wm_prog_data),
1946 brw_imm_ud(flag));
1947 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1948 }
1949 }
1950
1951