1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file elk_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "elk_eu.h"
32 #include "elk_fs.h"
33 #include "elk_fs_builder.h"
34 #include "elk_fs_live_variables.h"
35 #include "elk_nir.h"
36 #include "elk_vec4_gs_visitor.h"
37 #include "elk_cfg.h"
38 #include "elk_dead_control_flow.h"
39 #include "elk_private.h"
40 #include "../intel_nir.h"
41 #include "shader_enums.h"
42 #include "dev/intel_debug.h"
43 #include "dev/intel_wa.h"
44 #include "compiler/glsl_types.h"
45 #include "compiler/nir/nir_builder.h"
46 #include "util/u_math.h"
47
48 #include <memory>
49
50 using namespace elk;
51
52 static unsigned get_lowered_simd_width(const elk_fs_visitor *shader,
53 const elk_fs_inst *inst);
54
55 void
56 elk_fs_inst::init(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
57 const elk_fs_reg *src, unsigned sources)
58 {
59 memset((void*)this, 0, sizeof(*this));
60
61 this->src = new elk_fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69 this->base_mrf = -1;
70
71 assert(dst.file != IMM && dst.file != UNIFORM);
72
73 assert(this->exec_size != 0);
74
75 this->conditional_mod = ELK_CONDITIONAL_NONE;
76
77 /* This will be the case for almost all instructions. */
78 switch (dst.file) {
79 case VGRF:
80 case ARF:
81 case FIXED_GRF:
82 case MRF:
83 case ATTR:
84 this->size_written = dst.component_size(exec_size);
85 break;
86 case BAD_FILE:
87 this->size_written = 0;
88 break;
89 case IMM:
90 case UNIFORM:
91 unreachable("Invalid destination register file");
92 }
93
94 this->writes_accumulator = false;
95 }
96
97 elk_fs_inst::elk_fs_inst()
98 {
99 init(ELK_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101
102 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size)
103 {
104 init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106
107 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst)
108 {
109 init(opcode, exec_size, dst, NULL, 0);
110 }
111
112 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
113 const elk_fs_reg &src0)
114 {
115 const elk_fs_reg src[1] = { src0 };
116 init(opcode, exec_size, dst, src, 1);
117 }
118
119 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
120 const elk_fs_reg &src0, const elk_fs_reg &src1)
121 {
122 const elk_fs_reg src[2] = { src0, src1 };
123 init(opcode, exec_size, dst, src, 2);
124 }
125
126 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
127 const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2)
128 {
129 const elk_fs_reg src[3] = { src0, src1, src2 };
130 init(opcode, exec_size, dst, src, 3);
131 }
132
133 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
134 const elk_fs_reg src[], unsigned sources)
135 {
136 init(opcode, exec_width, dst, src, sources);
137 }
138
139 elk_fs_inst::elk_fs_inst(const elk_fs_inst &that)
140 {
141 memcpy((void*)this, &that, sizeof(that));
142
143 this->src = new elk_fs_reg[MAX2(that.sources, 3)];
144
145 for (unsigned i = 0; i < that.sources; i++)
146 this->src[i] = that.src[i];
147 }
148
149 elk_fs_inst::~elk_fs_inst()
150 {
151 delete[] this->src;
152 }
153
154 void
155 elk_fs_inst::resize_sources(uint8_t num_sources)
156 {
157 if (this->sources != num_sources) {
158 elk_fs_reg *src = new elk_fs_reg[MAX2(num_sources, 3)];
159
160 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161 src[i] = this->src[i];
162
163 delete[] this->src;
164 this->src = src;
165 this->sources = num_sources;
166 }
167 }
168
169 void
170 elk_fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171 const elk_fs_reg &dst,
172 const elk_fs_reg &surface,
173 const elk_fs_reg &surface_handle,
174 const elk_fs_reg &varying_offset,
175 uint32_t const_offset,
176 uint8_t alignment,
177 unsigned components)
178 {
179 assert(components <= 4);
180
181 /* We have our constant surface use a pitch of 4 bytes, so our index can
182 * be any component of a vector, and then we load 4 contiguous
183 * components starting from that. TODO: Support loading fewer than 4.
184 */
185 elk_fs_reg total_offset = vgrf(glsl_uint_type());
186 bld.ADD(total_offset, varying_offset, elk_imm_ud(const_offset));
187
188 /* The pull load message will load a vec4 (16 bytes). If we are loading
189 * a double this means we are only loading 2 elements worth of data.
190 * We also want to use a 32-bit data type for the dst of the load operation
191 * so other parts of the driver don't get confused about the size of the
192 * result.
193 */
194 elk_fs_reg vec4_result = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
195
196 elk_fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
197 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
198 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
199 srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset;
200 srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = elk_imm_ud(alignment);
201
202 elk_fs_inst *inst = bld.emit(ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
203 vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
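   /* The pull load message writes a full vec4 worth of data per channel
    * regardless of how many components were requested; the shuffle below
    * extracts just the components the caller asked for.
    */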
204 inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
205
206 elk_shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
207 }
208
209 /**
210 * A helper for MOV generation for fixing up broken hardware SEND dependency
211 * handling.
212 */
213 void
214 elk_fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
215 {
216 /* The caller always wants the uncompressed form, to emit the minimal extra
217 * dependencies and to avoid having to deal with aligning its regs to 2.
218 */
219 const fs_builder ubld = bld.annotate("send dependency resolve")
220 .quarter(0);
221
222 ubld.MOV(ubld.null_reg_f(), elk_fs_reg(VGRF, grf, ELK_REGISTER_TYPE_F));
223 }
224
225 bool
226 elk_fs_inst::is_send_from_grf() const
227 {
228 switch (opcode) {
229 case ELK_SHADER_OPCODE_SEND:
230 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
231 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
232 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
233 case ELK_SHADER_OPCODE_INTERLOCK:
234 case ELK_SHADER_OPCODE_MEMORY_FENCE:
235 case ELK_SHADER_OPCODE_BARRIER:
236 return true;
237 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238 return src[1].file == VGRF;
239 case ELK_FS_OPCODE_FB_WRITE:
240 return src[0].file == VGRF;
241 default:
242 return false;
243 }
244 }
245
246 bool
247 elk_fs_inst::is_control_source(unsigned arg) const
248 {
249 switch (opcode) {
250 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
251 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
252 return arg == 0;
253
254 case ELK_SHADER_OPCODE_BROADCAST:
255 case ELK_SHADER_OPCODE_SHUFFLE:
256 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
257 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
258 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
259 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
260 return arg == 1;
261
262 case ELK_SHADER_OPCODE_MOV_INDIRECT:
263 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
264 case ELK_SHADER_OPCODE_TEX:
265 case ELK_FS_OPCODE_TXB:
266 case ELK_SHADER_OPCODE_TXD:
267 case ELK_SHADER_OPCODE_TXF:
268 case ELK_SHADER_OPCODE_TXF_LZ:
269 case ELK_SHADER_OPCODE_TXF_CMS:
270 case ELK_SHADER_OPCODE_TXF_CMS_W:
271 case ELK_SHADER_OPCODE_TXF_UMS:
272 case ELK_SHADER_OPCODE_TXF_MCS:
273 case ELK_SHADER_OPCODE_TXL:
274 case ELK_SHADER_OPCODE_TXL_LZ:
275 case ELK_SHADER_OPCODE_TXS:
276 case ELK_SHADER_OPCODE_LOD:
277 case ELK_SHADER_OPCODE_TG4:
278 case ELK_SHADER_OPCODE_TG4_OFFSET:
279 case ELK_SHADER_OPCODE_SAMPLEINFO:
280 return arg == 1 || arg == 2;
281
282 case ELK_SHADER_OPCODE_SEND:
283 return arg == 0;
284
285 default:
286 return false;
287 }
288 }
289
290 bool
291 elk_fs_inst::is_payload(unsigned arg) const
292 {
293 switch (opcode) {
294 case ELK_FS_OPCODE_FB_WRITE:
295 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
296 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
297 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
298 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
299 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
300 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
301 case ELK_SHADER_OPCODE_INTERLOCK:
302 case ELK_SHADER_OPCODE_MEMORY_FENCE:
303 case ELK_SHADER_OPCODE_BARRIER:
304 case ELK_SHADER_OPCODE_TEX:
305 case ELK_FS_OPCODE_TXB:
306 case ELK_SHADER_OPCODE_TXD:
307 case ELK_SHADER_OPCODE_TXF:
308 case ELK_SHADER_OPCODE_TXF_LZ:
309 case ELK_SHADER_OPCODE_TXF_CMS:
310 case ELK_SHADER_OPCODE_TXF_CMS_W:
311 case ELK_SHADER_OPCODE_TXF_UMS:
312 case ELK_SHADER_OPCODE_TXF_MCS:
313 case ELK_SHADER_OPCODE_TXL:
314 case ELK_SHADER_OPCODE_TXL_LZ:
315 case ELK_SHADER_OPCODE_TXS:
316 case ELK_SHADER_OPCODE_LOD:
317 case ELK_SHADER_OPCODE_TG4:
318 case ELK_SHADER_OPCODE_TG4_OFFSET:
319 case ELK_SHADER_OPCODE_SAMPLEINFO:
320 return arg == 0;
321
322 case ELK_SHADER_OPCODE_SEND:
323 return arg == 1;
324
325 default:
326 return false;
327 }
328 }
329
330 /**
331 * Returns true if this instruction's sources and destinations cannot
332 * safely be the same register.
333 *
334 * In most cases, a register can be written over safely by the same
335 * instruction that is its last use. For a single instruction, the
336 * sources are dereferenced before writing of the destination starts
337 * (naturally).
338 *
339 * However, there are a few cases where this can be problematic:
340 *
341 * - Virtual opcodes that translate to multiple instructions in the
342 * code generator: if src == dst and one instruction writes the
343 * destination before a later instruction reads the source, then
344 * src will have been clobbered.
345 *
346 * - SIMD16 compressed instructions with certain regioning (see below).
347 *
348 * The register allocator uses this information to set up conflicts between
349 * GRF sources and the destination.
350 */
351 bool
352 elk_fs_inst::has_source_and_destination_hazard() const
353 {
354 switch (opcode) {
355 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
356 /* Multiple partial writes to the destination */
357 return true;
358 case ELK_SHADER_OPCODE_SHUFFLE:
359 /* This instruction returns an arbitrary channel from the source and
360 * gets split into smaller instructions in the generator. It's possible
361 * that one of the instructions will read from a channel corresponding
362 * to an earlier instruction.
363 */
364 case ELK_SHADER_OPCODE_SEL_EXEC:
365 /* This is implemented as
366 *
367 * mov(16) g4<1>D 0D { align1 WE_all 1H };
368 * mov(16) g4<1>D g5<8,8,1>D { align1 1H }
369 *
370 * Because the source is only read in the second instruction, the first
371 * may stomp all over it.
372 */
373 return true;
374 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
375 switch (src[1].ud) {
376 case ELK_SWIZZLE_XXXX:
377 case ELK_SWIZZLE_YYYY:
378 case ELK_SWIZZLE_ZZZZ:
379 case ELK_SWIZZLE_WWWW:
380 case ELK_SWIZZLE_XXZZ:
381 case ELK_SWIZZLE_YYWW:
382 case ELK_SWIZZLE_XYXY:
383 case ELK_SWIZZLE_ZWZW:
384 /* These can be implemented as a single Align1 region on all
385 * platforms, so there's never a hazard between source and
386 * destination. C.f. elk_fs_generator::generate_quad_swizzle().
387 */
388 return false;
389 default:
390 return !is_uniform(src[0]);
391 }
392 default:
393 /* The SIMD16 compressed instruction
394 *
395 * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
396 *
397 * is actually decoded in hardware as:
398 *
399 * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
400 * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
401 *
402 * Which is safe. However, if we have uniform accesses
403 * happening, we get into trouble:
404 *
405 * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
406 * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
407 *
408 * Now our destination for the first instruction overwrote the
409 * second instruction's src0, and we get garbage for those 8
410 * pixels. There's a similar issue for the pre-gfx6
411 * pixel_x/pixel_y, which are registers of 16-bit values and thus
412 * would get stomped by the first decode as well.
413 */
414 if (exec_size == 16) {
415 for (int i = 0; i < sources; i++) {
416 if (src[i].file == VGRF && (src[i].stride == 0 ||
417 src[i].type == ELK_REGISTER_TYPE_UW ||
418 src[i].type == ELK_REGISTER_TYPE_W ||
419 src[i].type == ELK_REGISTER_TYPE_UB ||
420 src[i].type == ELK_REGISTER_TYPE_B)) {
421 return true;
422 }
423 }
424 }
425 return false;
426 }
427 }
428
429 bool
430 elk_fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
431 {
432 if (devinfo->ver == 6 && is_math())
433 return false;
434
435 if (is_send_from_grf())
436 return false;
437
438 return elk_backend_instruction::can_do_source_mods();
439 }
440
441 bool
442 elk_fs_inst::can_do_cmod()
443 {
444 if (!elk_backend_instruction::can_do_cmod())
445 return false;
446
447 /* The accumulator result appears to get used for the conditional modifier
448 * generation. When negating a UD value, there is a 33rd bit generated for
449 * the sign in the accumulator value, so now you can't check, for example,
450 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
451 */
452 for (unsigned i = 0; i < sources; i++) {
453 if (elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
454 return false;
455 }
456
457 return true;
458 }
459
460 bool
461 elk_fs_inst::can_change_types() const
462 {
463 return dst.type == src[0].type &&
464 !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
465 (opcode == ELK_OPCODE_MOV ||
466 (opcode == ELK_OPCODE_SEL &&
467 dst.type == src[1].type &&
468 predicate != ELK_PREDICATE_NONE &&
469 !src[1].abs && !src[1].negate && src[1].file != ATTR));
470 }
471
472 void
473 elk_fs_reg::init()
474 {
475 memset((void*)this, 0, sizeof(*this));
476 type = ELK_REGISTER_TYPE_UD;
477 stride = 1;
478 }
479
480 /** Generic unset register constructor. */
481 elk_fs_reg::elk_fs_reg()
482 {
483 init();
484 this->file = BAD_FILE;
485 }
486
487 elk_fs_reg::elk_fs_reg(struct ::elk_reg reg) :
488 elk_backend_reg(reg)
489 {
490 this->offset = 0;
491 this->stride = 1;
492 if (this->file == IMM &&
493 (this->type != ELK_REGISTER_TYPE_V &&
494 this->type != ELK_REGISTER_TYPE_UV &&
495 this->type != ELK_REGISTER_TYPE_VF)) {
496 this->stride = 0;
497 }
498 }
499
500 bool
501 elk_fs_reg::equals(const elk_fs_reg &r) const
502 {
503 return (this->elk_backend_reg::equals(r) &&
504 stride == r.stride);
505 }
506
507 bool
508 elk_fs_reg::negative_equals(const elk_fs_reg &r) const
509 {
510 return (this->elk_backend_reg::negative_equals(r) &&
511 stride == r.stride);
512 }
513
514 bool
515 elk_fs_reg::is_contiguous() const
516 {
517 switch (file) {
518 case ARF:
519 case FIXED_GRF:
520 return hstride == ELK_HORIZONTAL_STRIDE_1 &&
521 vstride == width + hstride;
522 case MRF:
523 case VGRF:
524 case ATTR:
525 return stride == 1;
526 case UNIFORM:
527 case IMM:
528 case BAD_FILE:
529 return true;
530 }
531
532 unreachable("Invalid register file");
533 }
534
535 unsigned
536 elk_fs_reg::component_size(unsigned width) const
537 {
538 if (file == ARF || file == FIXED_GRF) {
539 const unsigned w = MIN2(width, 1u << this->width);
540 const unsigned h = width >> this->width;
541 const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
542 const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
543 assert(w > 0);
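      /* For example, a SIMD16 (width == 16) read of an <8;8,1>:F region
       * gives w = 8, h = 2, vs = 8, hs = 1, i.e.
       * ((2 - 1) * 8 + (8 - 1) * 1 + 1) * 4 = 64 bytes (two GRFs).
       */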
544 return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
545 } else {
546 return MAX2(width * stride, 1) * type_sz(type);
547 }
548 }
549
550 void
551 elk_fs_visitor::vfail(const char *format, va_list va)
552 {
553 char *msg;
554
555 if (failed)
556 return;
557
558 failed = true;
559
560 msg = ralloc_vasprintf(mem_ctx, format, va);
561 msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
562 dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
563
564 this->fail_msg = msg;
565
566 if (unlikely(debug_enabled)) {
567 fprintf(stderr, "%s", msg);
568 }
569 }
570
571 void
572 elk_fs_visitor::fail(const char *format, ...)
573 {
574 va_list va;
575
576 va_start(va, format);
577 vfail(format, va);
578 va_end(va);
579 }
580
581 /**
582 * Mark this program as impossible to compile with dispatch width greater
583 * than n.
584 *
585 * During the SIMD8 compile (which happens first), we can detect and flag
586 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
587 * SIMD16+ compile altogether.
588 *
589 * During a compile of dispatch width greater than n (if one happens anyway),
590 * this just calls fail().
591 */
592 void
593 elk_fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
594 {
595 if (dispatch_width > n) {
596 fail("%s", msg);
597 } else {
598 max_dispatch_width = MIN2(max_dispatch_width, n);
599 elk_shader_perf_log(compiler, log_data,
600 "Shader dispatch width limited to SIMD%d: %s\n",
601 n, msg);
602 }
603 }
604
605 /**
606 * Returns true if the instruction has a flag that means it won't
607 * update an entire destination register.
608 *
609 * For example, dead code elimination and live variable analysis want to know
610 * when a write to a variable screens off any preceding values that were in
611 * it.
612 */
613 bool
614 elk_fs_inst::is_partial_write() const
615 {
616 if (this->predicate && !this->predicate_trivial &&
617 this->opcode != ELK_OPCODE_SEL)
618 return true;
619
620 if (this->dst.offset % REG_SIZE != 0)
621 return true;
622
623 /* SEND instructions always write whole registers */
624 if (this->opcode == ELK_SHADER_OPCODE_SEND)
625 return false;
626
627 /* Special case UNDEF since a lot of places in the backend do things like this:
628 *
629 * fs_builder ubld = bld.exec_all().group(1, 0);
630 * elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
631 * ubld.UNDEF(tmp); <- partial write, even though the whole register is affected
632 */
633 if (this->opcode == ELK_SHADER_OPCODE_UNDEF) {
634 assert(this->dst.is_contiguous());
635 return this->size_written < 32;
636 }
637
638 return this->exec_size * type_sz(this->dst.type) < 32 ||
639 !this->dst.is_contiguous();
640 }
641
642 unsigned
643 elk_fs_inst::components_read(unsigned i) const
644 {
645 /* Return zero if the source is not present. */
646 if (src[i].file == BAD_FILE)
647 return 0;
648
649 switch (opcode) {
650 case ELK_FS_OPCODE_LINTERP:
651 if (i == 0)
652 return 2;
653 else
654 return 1;
655
656 case ELK_FS_OPCODE_PIXEL_X:
657 case ELK_FS_OPCODE_PIXEL_Y:
658 assert(i < 2);
659 if (i == 0)
660 return 2;
661 else
662 return 1;
663
664 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
665 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
666 /* First/second FB write color. */
667 if (i < 2)
668 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
669 else
670 return 1;
671
672 case ELK_SHADER_OPCODE_TEX_LOGICAL:
673 case ELK_SHADER_OPCODE_TXD_LOGICAL:
674 case ELK_SHADER_OPCODE_TXF_LOGICAL:
675 case ELK_SHADER_OPCODE_TXL_LOGICAL:
676 case ELK_SHADER_OPCODE_TXS_LOGICAL:
677 case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
678 case ELK_FS_OPCODE_TXB_LOGICAL:
679 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
680 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
681 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
682 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
683 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
684 case ELK_SHADER_OPCODE_LOD_LOGICAL:
685 case ELK_SHADER_OPCODE_TG4_LOGICAL:
686 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
687 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
688 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
689 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
690 src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
691 /* Texture coordinates. */
692 if (i == TEX_LOGICAL_SRC_COORDINATE)
693 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
694 /* Texture derivatives. */
695 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
696 opcode == ELK_SHADER_OPCODE_TXD_LOGICAL)
697 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
698 /* Texture offset. */
699 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
700 return 2;
701 /* MCS */
702 else if (i == TEX_LOGICAL_SRC_MCS) {
703 if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL)
704 return 2;
705 else if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
706 return 4;
707 else
708 return 1;
709 } else
710 return 1;
711
712 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
713 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
714 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
715 /* Surface coordinates. */
716 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
717 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
718 /* Surface operation source (ignored for reads). */
719 else if (i == SURFACE_LOGICAL_SRC_DATA)
720 return 0;
721 else
722 return 1;
723
724 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
725 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
726 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
727 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
728 /* Surface coordinates. */
729 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
730 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
731 /* Surface operation source. */
732 else if (i == SURFACE_LOGICAL_SRC_DATA)
733 return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
734 else
735 return 1;
736
737 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
738 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
739 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
740 assert(src[A64_LOGICAL_ARG].file == IMM);
741 return 1;
742
743 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
744 assert(src[A64_LOGICAL_ARG].file == IMM);
745 if (i == A64_LOGICAL_SRC) { /* data to write */
746 const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
747 assert(comps > 0);
748 return comps;
749 } else {
750 return 1;
751 }
752
753 case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
754 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
755 return 1;
756
757 case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
758 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
759 if (i == SURFACE_LOGICAL_SRC_DATA) {
760 const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
761 assert(comps > 0);
762 return comps;
763 } else {
764 return 1;
765 }
766
767 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
768 assert(src[A64_LOGICAL_ARG].file == IMM);
769 return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
770
771 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
772 assert(src[A64_LOGICAL_ARG].file == IMM);
773 return i == A64_LOGICAL_SRC ?
774 lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
775
776 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
777 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
778 /* Scattered logical opcodes use the following params:
779 * src[0] Surface coordinates
780 * src[1] Surface operation source (ignored for reads)
781 * src[2] Surface
782 * src[3] IMM with the number of dimensions (always 1).
783 * src[4] IMM with the bit size of the scattered read/write: 8, 16 or 32.
784 */
785 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
786 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
787 return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
788
789 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
790 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
791 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
792 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
793 return 1;
794
795 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
796 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
797 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
798 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
799 const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
800 /* Surface coordinates. */
801 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
802 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
803 /* Surface operation source. */
804 else if (i == SURFACE_LOGICAL_SRC_DATA)
805 return lsc_op_num_data_values(op);
806 else
807 return 1;
808 }
809 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
810 return (i == 0 ? 2 : 1);
811
812 case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
813 assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
814
815 if (i == URB_LOGICAL_SRC_DATA)
816 return src[URB_LOGICAL_SRC_COMPONENTS].ud;
817 else
818 return 1;
819
820 default:
821 return 1;
822 }
823 }
824
825 unsigned
826 elk_fs_inst::size_read(int arg) const
827 {
828 switch (opcode) {
829 case ELK_SHADER_OPCODE_SEND:
830 if (arg == 1) {
831 return mlen * REG_SIZE;
832 }
833 break;
834
835 case ELK_FS_OPCODE_FB_WRITE:
836 case ELK_FS_OPCODE_REP_FB_WRITE:
837 if (arg == 0) {
838 if (base_mrf >= 0)
839 return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
840 else
841 return mlen * REG_SIZE;
842 }
843 break;
844
845 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847 if (arg == 0)
848 return mlen * REG_SIZE;
849 break;
850
851 case ELK_FS_OPCODE_SET_SAMPLE_ID:
852 if (arg == 1)
853 return 1;
854 break;
855
856 case ELK_FS_OPCODE_LINTERP:
857 if (arg == 1)
858 return 16;
859 break;
860
861 case ELK_SHADER_OPCODE_LOAD_PAYLOAD:
862 if (arg < this->header_size)
863 return retype(src[arg], ELK_REGISTER_TYPE_UD).component_size(8);
864 break;
865
866 case ELK_CS_OPCODE_CS_TERMINATE:
867 case ELK_SHADER_OPCODE_BARRIER:
868 return REG_SIZE;
869
870 case ELK_SHADER_OPCODE_MOV_INDIRECT:
871 if (arg == 0) {
872 assert(src[2].file == IMM);
873 return src[2].ud;
874 }
875 break;
876
877 case ELK_SHADER_OPCODE_TEX:
878 case ELK_FS_OPCODE_TXB:
879 case ELK_SHADER_OPCODE_TXD:
880 case ELK_SHADER_OPCODE_TXF:
881 case ELK_SHADER_OPCODE_TXF_LZ:
882 case ELK_SHADER_OPCODE_TXF_CMS:
883 case ELK_SHADER_OPCODE_TXF_CMS_W:
884 case ELK_SHADER_OPCODE_TXF_UMS:
885 case ELK_SHADER_OPCODE_TXF_MCS:
886 case ELK_SHADER_OPCODE_TXL:
887 case ELK_SHADER_OPCODE_TXL_LZ:
888 case ELK_SHADER_OPCODE_TXS:
889 case ELK_SHADER_OPCODE_LOD:
890 case ELK_SHADER_OPCODE_TG4:
891 case ELK_SHADER_OPCODE_TG4_OFFSET:
892 case ELK_SHADER_OPCODE_SAMPLEINFO:
893 if (arg == 0 && src[0].file == VGRF)
894 return mlen * REG_SIZE;
895 break;
896
897 default:
898 break;
899 }
900
901 switch (src[arg].file) {
902 case UNIFORM:
903 case IMM:
904 return components_read(arg) * type_sz(src[arg].type);
905 case BAD_FILE:
906 case ARF:
907 case FIXED_GRF:
908 case VGRF:
909 case ATTR:
910 return components_read(arg) * src[arg].component_size(exec_size);
911 case MRF:
912 unreachable("MRF registers are not allowed as sources");
913 }
914 return 0;
915 }
916
917 namespace {
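   /* Number of flag channel bits grouped together by the given predication
    * mode, e.g. the ANY4H/ALL4H modes test groups of four channel bits at a
    * time.
    */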
918 unsigned
919 predicate_width(const intel_device_info *devinfo, elk_predicate predicate)
920 {
921 switch (predicate) {
922 case ELK_PREDICATE_NONE: return 1;
923 case ELK_PREDICATE_NORMAL: return 1;
924 case ELK_PREDICATE_ALIGN1_ANY2H: return 2;
925 case ELK_PREDICATE_ALIGN1_ALL2H: return 2;
926 case ELK_PREDICATE_ALIGN1_ANY4H: return 4;
927 case ELK_PREDICATE_ALIGN1_ALL4H: return 4;
928 case ELK_PREDICATE_ALIGN1_ANY8H: return 8;
929 case ELK_PREDICATE_ALIGN1_ALL8H: return 8;
930 case ELK_PREDICATE_ALIGN1_ANY16H: return 16;
931 case ELK_PREDICATE_ALIGN1_ALL16H: return 16;
932 case ELK_PREDICATE_ALIGN1_ANY32H: return 32;
933 case ELK_PREDICATE_ALIGN1_ALL32H: return 32;
934 default: unreachable("Unsupported predicate");
935 }
936 }
937
938 /* Return the subset of flag registers that an instruction could
939 * potentially read or write based on the execution controls and flag
940 * subregister number of the instruction.
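    * Each bit of the returned mask covers eight flag bits (i.e. one byte of
    * a flag register), so e.g. a SIMD16 instruction at channel group 0 using
    * f0.0 maps to mask 0x3.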
941 */
942 unsigned
943 flag_mask(const elk_fs_inst *inst, unsigned width)
944 {
945 assert(util_is_power_of_two_nonzero(width));
946 const unsigned start = (inst->flag_subreg * 16 + inst->group) &
947 ~(width - 1);
948 const unsigned end = start + ALIGN(inst->exec_size, width);
949 return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
950 }
951
952 unsigned
953 bit_mask(unsigned n)
954 {
955 return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
956 }
957
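   /* Subset of flag register bytes (one mask bit per byte) overlapped by an
    * ARF flag source or destination of the given size in bytes; zero for
    * any other register file.
    */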
958 unsigned
959 flag_mask(const elk_fs_reg &r, unsigned sz)
960 {
961 if (r.file == ARF) {
962 const unsigned start = (r.nr - ELK_ARF_FLAG) * 4 + r.subnr;
963 const unsigned end = start + sz;
964 return bit_mask(end) & ~bit_mask(start);
965 } else {
966 return 0;
967 }
968 }
969 }
970
971 unsigned
972 elk_fs_inst::flags_read(const intel_device_info *devinfo) const
973 {
974 if (predicate == ELK_PREDICATE_ALIGN1_ANYV ||
975 predicate == ELK_PREDICATE_ALIGN1_ALLV) {
976 /* The vertical predication modes combine corresponding bits from
977 * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
978 */
979 const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
980 return flag_mask(this, 1) << shift | flag_mask(this, 1);
981 } else if (predicate) {
982 return flag_mask(this, predicate_width(devinfo, predicate));
983 } else {
984 unsigned mask = 0;
985 for (int i = 0; i < sources; i++) {
986 mask |= flag_mask(src[i], size_read(i));
987 }
988 return mask;
989 }
990 }
991
992 unsigned
993 elk_fs_inst::flags_written(const intel_device_info *devinfo) const
994 {
995 /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
996 * using a separate cmpn and sel instruction. This lowering occurs in
997 * elk_fs_visitor::lower_minmax which is called very, very late.
998 */
999 if ((conditional_mod && ((opcode != ELK_OPCODE_SEL || devinfo->ver <= 5) &&
1000 opcode != ELK_OPCODE_CSEL &&
1001 opcode != ELK_OPCODE_IF &&
1002 opcode != ELK_OPCODE_WHILE)) ||
1003 opcode == ELK_FS_OPCODE_FB_WRITE) {
1004 return flag_mask(this, 1);
1005 } else if (opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1006 opcode == ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1007 opcode == ELK_FS_OPCODE_LOAD_LIVE_CHANNELS) {
1008 return flag_mask(this, 32);
1009 } else {
1010 return flag_mask(dst, size_written);
1011 }
1012 }
1013
1014 /**
1015 * Returns how many MRFs an FS opcode will write over.
1016 *
1017 * Note that this is not the 0 or 1 implied writes in an actual gen
1018 * instruction -- the FS opcodes often generate MOVs in addition.
1019 */
1020 unsigned
1021 elk_fs_inst::implied_mrf_writes() const
1022 {
1023 if (mlen == 0)
1024 return 0;
1025
1026 if (base_mrf == -1)
1027 return 0;
1028
1029 switch (opcode) {
1030 case ELK_SHADER_OPCODE_RCP:
1031 case ELK_SHADER_OPCODE_RSQ:
1032 case ELK_SHADER_OPCODE_SQRT:
1033 case ELK_SHADER_OPCODE_EXP2:
1034 case ELK_SHADER_OPCODE_LOG2:
1035 case ELK_SHADER_OPCODE_SIN:
1036 case ELK_SHADER_OPCODE_COS:
1037 return 1 * exec_size / 8;
1038 case ELK_SHADER_OPCODE_POW:
1039 case ELK_SHADER_OPCODE_INT_QUOTIENT:
1040 case ELK_SHADER_OPCODE_INT_REMAINDER:
1041 return 2 * exec_size / 8;
1042 case ELK_SHADER_OPCODE_TEX:
1043 case ELK_FS_OPCODE_TXB:
1044 case ELK_SHADER_OPCODE_TXD:
1045 case ELK_SHADER_OPCODE_TXF:
1046 case ELK_SHADER_OPCODE_TXF_CMS:
1047 case ELK_SHADER_OPCODE_TXF_MCS:
1048 case ELK_SHADER_OPCODE_TG4:
1049 case ELK_SHADER_OPCODE_TG4_OFFSET:
1050 case ELK_SHADER_OPCODE_TXL:
1051 case ELK_SHADER_OPCODE_TXS:
1052 case ELK_SHADER_OPCODE_LOD:
1053 case ELK_SHADER_OPCODE_SAMPLEINFO:
1054 return 1;
1055 case ELK_FS_OPCODE_FB_WRITE:
1056 case ELK_FS_OPCODE_REP_FB_WRITE:
1057 return src[0].file == BAD_FILE ? 0 : 2;
1058 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1059 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1060 return 1;
1061 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1062 return mlen;
1063 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1064 return mlen;
1065 default:
1066 unreachable("not reached");
1067 }
1068 }
1069
1070 bool
1071 elk_fs_inst::has_sampler_residency() const
1072 {
1073 switch (opcode) {
1074 case ELK_SHADER_OPCODE_TEX_LOGICAL:
1075 case ELK_FS_OPCODE_TXB_LOGICAL:
1076 case ELK_SHADER_OPCODE_TXL_LOGICAL:
1077 case ELK_SHADER_OPCODE_TXD_LOGICAL:
1078 case ELK_SHADER_OPCODE_TXF_LOGICAL:
1079 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1080 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1081 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1082 case ELK_SHADER_OPCODE_TXS_LOGICAL:
1083 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1084 case ELK_SHADER_OPCODE_TG4_LOGICAL:
1085 assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1086 return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1087 default:
1088 return false;
1089 }
1090 }
1091
1092 elk_fs_reg
1093 elk_fs_visitor::vgrf(const glsl_type *const type)
1094 {
1095 int reg_width = dispatch_width / 8;
1096 return elk_fs_reg(VGRF,
1097 alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1098 elk_type_for_base_type(type));
1099 }
1100
1101 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr)
1102 {
1103 init();
1104 this->file = file;
1105 this->nr = nr;
1106 this->type = ELK_REGISTER_TYPE_F;
1107 this->stride = (file == UNIFORM ? 0 : 1);
1108 }
1109
1110 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type)
1111 {
1112 init();
1113 this->file = file;
1114 this->nr = nr;
1115 this->type = type;
1116 this->stride = (file == UNIFORM ? 0 : 1);
1117 }
1118
1119 /* For SIMD16, we need to follow on from the uniform setup of the SIMD8
1120 * dispatch. This brings in those uniform definitions.
1121 */
1122 void
1123 elk_fs_visitor::import_uniforms(elk_fs_visitor *v)
1124 {
1125 this->push_constant_loc = v->push_constant_loc;
1126 this->uniforms = v->uniforms;
1127 }
1128
1129 enum elk_barycentric_mode
1130 elk_barycentric_mode(nir_intrinsic_instr *intr)
1131 {
1132 const glsl_interp_mode mode =
1133 (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1134
1135 /* Barycentric modes don't make sense for flat inputs. */
1136 assert(mode != INTERP_MODE_FLAT);
1137
1138 unsigned bary;
1139 switch (intr->intrinsic) {
1140 case nir_intrinsic_load_barycentric_pixel:
1141 case nir_intrinsic_load_barycentric_at_offset:
1142 bary = ELK_BARYCENTRIC_PERSPECTIVE_PIXEL;
1143 break;
1144 case nir_intrinsic_load_barycentric_centroid:
1145 bary = ELK_BARYCENTRIC_PERSPECTIVE_CENTROID;
1146 break;
1147 case nir_intrinsic_load_barycentric_sample:
1148 case nir_intrinsic_load_barycentric_at_sample:
1149 bary = ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1150 break;
1151 default:
1152 unreachable("invalid intrinsic");
1153 }
1154
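   /* The nonperspective barycentric modes are laid out immediately after the
    * corresponding perspective ones in enum elk_barycentric_mode, so the
    * conversion is just an offset of 3.
    */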
1155 if (mode == INTERP_MODE_NOPERSPECTIVE)
1156 bary += 3;
1157
1158 return (enum elk_barycentric_mode) bary;
1159 }
1160
1161 /**
1162 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1163 */
1164 static enum elk_barycentric_mode
1165 centroid_to_pixel(enum elk_barycentric_mode bary)
1166 {
1167 assert(bary == ELK_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1168 bary == ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
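   /* This relies on each PIXEL mode being defined immediately before the
    * corresponding CENTROID mode in enum elk_barycentric_mode.
    */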
1169 return (enum elk_barycentric_mode) ((unsigned) bary - 1);
1170 }
1171
1172 /**
1173 * Walk backwards from the end of the program looking for a URB write that
1174 * isn't in control flow, and mark it with EOT.
1175 *
1176 * Return true if successful or false if a separate EOT write is needed.
1177 */
1178 bool
1179 elk_fs_visitor::mark_last_urb_write_with_eot()
1180 {
1181 foreach_in_list_reverse(elk_fs_inst, prev, &this->instructions) {
1182 if (prev->opcode == ELK_SHADER_OPCODE_URB_WRITE_LOGICAL) {
1183 prev->eot = true;
1184
1185 /* Delete now dead instructions. */
1186 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1187 if (dead == prev)
1188 break;
1189 dead->remove();
1190 }
1191 return true;
1192 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1193 break;
1194 }
1195 }
1196
1197 return false;
1198 }
1199
1200 void
1201 elk_fs_visitor::emit_gs_thread_end()
1202 {
1203 assert(stage == MESA_SHADER_GEOMETRY);
1204
1205 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
1206
1207 if (gs_compile->control_data_header_size_bits > 0) {
1208 emit_gs_control_data_bits(this->final_gs_vertex_count);
1209 }
1210
1211 const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1212 elk_fs_inst *inst;
1213
1214 if (gs_prog_data->static_vertex_count != -1) {
1215 /* Try to tag the last URB write with EOT instead of emitting a whole
1216 * separate write just to finish the thread.
1217 */
1218 if (mark_last_urb_write_with_eot())
1219 return;
1220
1221 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1222 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1223 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(0);
1224 inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1225 srcs, ARRAY_SIZE(srcs));
1226 } else {
1227 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1228 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1229 srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1230 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
1231 inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1232 srcs, ARRAY_SIZE(srcs));
1233 }
1234 inst->eot = true;
1235 inst->offset = 0;
1236 }
1237
1238 void
1239 elk_fs_visitor::assign_curb_setup()
1240 {
1241 unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1242
1243 unsigned ubo_push_length = 0;
1244 unsigned ubo_push_start[4];
1245 for (int i = 0; i < 4; i++) {
1246 ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1247 ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1248 }
1249
1250 prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1251
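   /* Bitmask of the push constant GRFs actually referenced by the shader;
    * used below to decide which of the zero_push_reg registers need to be
    * cleared.
    */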
1252 uint64_t used = 0;
1253
1254 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1255 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1256 for (unsigned int i = 0; i < inst->sources; i++) {
1257 if (inst->src[i].file == UNIFORM) {
1258 int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1259 int constant_nr;
1260 if (inst->src[i].nr >= UBO_START) {
1261 /* constant_nr is in 32-bit units, the rest are in bytes */
1262 constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1263 inst->src[i].offset / 4;
1264 } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1265 constant_nr = push_constant_loc[uniform_nr];
1266 } else {
1267 /* Section 5.11 of the OpenGL 4.1 spec says:
1268 * "Out-of-bounds reads return undefined values, which include
1269 * values from other variables of the active program or zero."
1270 * Just return the first push constant.
1271 */
1272 constant_nr = 0;
1273 }
1274
1275 assert(constant_nr / 8 < 64);
1276 used |= BITFIELD64_BIT(constant_nr / 8);
1277
1278 struct elk_reg elk_reg = elk_vec1_grf(payload().num_regs +
1279 constant_nr / 8,
1280 constant_nr % 8);
1281 elk_reg.abs = inst->src[i].abs;
1282 elk_reg.negate = inst->src[i].negate;
1283
1284 assert(inst->src[i].stride == 0);
1285 inst->src[i] = byte_offset(
1286 retype(elk_reg, inst->src[i].type),
1287 inst->src[i].offset % 4);
1288 }
1289 }
1290 }
1291
1292 uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1293 if (want_zero) {
1294 fs_builder ubld = fs_builder(this, 8).exec_all().at(
1295 cfg->first_block(), cfg->first_block()->start());
1296
1297 /* push_reg_mask_param is in 32-bit units */
1298 unsigned mask_param = stage_prog_data->push_reg_mask_param;
1299 struct elk_reg mask = elk_vec1_grf(payload().num_regs + mask_param / 8,
1300 mask_param % 8);
1301
1302 elk_fs_reg b32;
1303 for (unsigned i = 0; i < 64; i++) {
1304 if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
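         /* After these two shifts the sign bit of 16-bit channel n holds bit
          * (i + n) of the push register mask; the ASR by 15 below then
          * smears each bit into a full 0/~0 word per channel.
          */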
1305 elk_fs_reg shifted = ubld.vgrf(ELK_REGISTER_TYPE_W, 2);
1306 ubld.SHL(horiz_offset(shifted, 8),
1307 byte_offset(retype(mask, ELK_REGISTER_TYPE_W), i / 8),
1308 elk_imm_v(0x01234567));
1309 ubld.SHL(shifted, horiz_offset(shifted, 8), elk_imm_w(8));
1310
1311 fs_builder ubld16 = ubld.group(16, 0);
1312 b32 = ubld16.vgrf(ELK_REGISTER_TYPE_D);
1313 ubld16.group(16, 0).ASR(b32, shifted, elk_imm_w(15));
1314 }
1315
1316 if (want_zero & BITFIELD64_BIT(i)) {
1317 assert(i < prog_data->curb_read_length);
1318 struct elk_reg push_reg =
1319 retype(elk_vec8_grf(payload().num_regs + i, 0),
1320 ELK_REGISTER_TYPE_D);
1321
1322 ubld.AND(push_reg, push_reg, component(b32, i % 16));
1323 }
1324 }
1325
1326 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1327 }
1328
1329 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1330 this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1331 }
1332
1333 /*
1334 * Build up an array of indices into the urb_setup array that
1335 * references its active entries.
1336 * Used to accelerate walking the active entries of urb_setup
1337 * on each upload.
1338 */
1339 void
1340 elk_compute_urb_setup_index(struct elk_wm_prog_data *wm_prog_data)
1341 {
1342 /* Make sure uint8_t is sufficient */
1343 STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1344 uint8_t index = 0;
1345 for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1346 if (wm_prog_data->urb_setup[attr] >= 0) {
1347 wm_prog_data->urb_setup_attribs[index++] = attr;
1348 }
1349 }
1350 wm_prog_data->urb_setup_attribs_count = index;
1351 }
1352
1353 static void
1354 calculate_urb_setup(const struct intel_device_info *devinfo,
1355 const struct elk_wm_prog_key *key,
1356 struct elk_wm_prog_data *prog_data,
1357 const nir_shader *nir)
1358 {
1359 memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1360 memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1361
1362 int urb_next = 0; /* in vec4s */
1363
1364 const uint64_t inputs_read =
1365 nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1366
1367 /* Figure out where each of the incoming setup attributes lands. */
1368 if (devinfo->ver >= 6) {
1369 assert(!nir->info.per_primitive_inputs);
1370
1371 uint64_t vue_header_bits =
1372 VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1373
1374 uint64_t unique_fs_attrs = inputs_read & ELK_FS_VARYING_INPUT_MASK;
1375
1376 /* VUE header fields all live in the same URB slot, so we pass them
1377 * as a single FS input attribute. We want to only count them once.
1378 */
1379 if (inputs_read & vue_header_bits) {
1380 unique_fs_attrs &= ~vue_header_bits;
1381 unique_fs_attrs |= VARYING_BIT_PSIZ;
1382 }
1383
1384 if (util_bitcount64(unique_fs_attrs) <= 16) {
1385 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1386 * first 16 varying inputs, so we can put them wherever we want.
1387 * Just put them in order.
1388 *
1389 * This is useful because it means that (a) inputs not used by the
1390 * fragment shader won't take up valuable register space, and (b) we
1391 * won't have to recompile the fragment shader if it gets paired with
1392 * a different vertex (or geometry) shader.
1393 *
1394 * VUE header fields share the same FS input attribute.
1395 */
1396 if (inputs_read & vue_header_bits) {
1397 if (inputs_read & VARYING_BIT_PSIZ)
1398 prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1399 if (inputs_read & VARYING_BIT_LAYER)
1400 prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1401 if (inputs_read & VARYING_BIT_VIEWPORT)
1402 prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1403
1404 urb_next++;
1405 }
1406
1407 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1408 if (inputs_read & ELK_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1409 BITFIELD64_BIT(i)) {
1410 prog_data->urb_setup[i] = urb_next++;
1411 }
1412 }
1413 } else {
1414 /* We have enough input varyings that the SF/SBE pipeline stage can't
1415 * arbitrarily rearrange them to suit our whim; we have to put them
1416 * in an order that matches the output of the previous pipeline stage
1417 * (geometry or vertex shader).
1418 */
1419
1420 /* Re-compute the VUE map here in the case that the one coming from
1421 * geometry has more than one position slot (used for Primitive
1422 * Replication).
1423 */
1424 struct intel_vue_map prev_stage_vue_map;
1425 elk_compute_vue_map(devinfo, &prev_stage_vue_map,
1426 key->input_slots_valid,
1427 nir->info.separate_shader, 1);
1428
1429 int first_slot =
1430 elk_compute_first_urb_slot_required(inputs_read,
1431 &prev_stage_vue_map);
1432
1433 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435 slot++) {
1436 int varying = prev_stage_vue_map.slot_to_varying[slot];
1437 if (varying != ELK_VARYING_SLOT_PAD &&
1438 (inputs_read & ELK_FS_VARYING_INPUT_MASK &
1439 BITFIELD64_BIT(varying))) {
1440 prog_data->urb_setup[varying] = slot - first_slot;
1441 }
1442 }
1443 urb_next = prev_stage_vue_map.num_slots - first_slot;
1444 }
1445 } else {
1446 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1447 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1448 /* Point size is packed into the header, not as a general attribute */
1449 if (i == VARYING_SLOT_PSIZ)
1450 continue;
1451
1452 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1453 /* The back color slot is skipped when the front color is
1454 * also written to. In addition, some slots can be
1455 * written in the vertex shader and not read in the
1456 * fragment shader. So the register number must always be
1457 * incremented, mapped or not.
1458 */
1459 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1460 prog_data->urb_setup[i] = urb_next;
1461 urb_next++;
1462 }
1463 }
1464
1465 /*
1466 * It's an FS-only attribute, and we did the interpolation for this
1467 * attribute in the SF thread. So count it here, too.
1468 *
1469 * See compile_sf_prog() for more info.
1470 */
1471 if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1472 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1473 }
1474
1475 prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1476 prog_data->inputs = inputs_read;
1477
1478 elk_compute_urb_setup_index(prog_data);
1479 }
1480
1481 void
1482 elk_fs_visitor::assign_urb_setup()
1483 {
1484 assert(stage == MESA_SHADER_FRAGMENT);
1485 struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
1486
1487 int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1488
1489 /* Offset all the urb_setup[] index by the actual position of the
1490 * setup regs, now that the location of the constants has been chosen.
1491 */
1492 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1493 for (int i = 0; i < inst->sources; i++) {
1494 if (inst->src[i].file == ATTR) {
1495 /* ATTR elk_fs_reg::nr in the FS is in units of logical scalar
1496 * inputs each of which consumes 16B on Gfx4-Gfx12. In
1497 * single polygon mode this leads to the following layout
1498 * of the vertex setup plane parameters in the ATTR
1499 * register file:
1500 *
1501 * elk_fs_reg::nr Input Comp0 Comp1 Comp2 Comp3
1502 * 0 Attr0.x a1-a0 a2-a0 N/A a0
1503 * 1 Attr0.y a1-a0 a2-a0 N/A a0
1504 * 2 Attr0.z a1-a0 a2-a0 N/A a0
1505 * 3 Attr0.w a1-a0 a2-a0 N/A a0
1506 * 4 Attr1.x a1-a0 a2-a0 N/A a0
1507 * ...
1508 */
1509 const unsigned param_width = 1;
1510
1511 /* Size of a single scalar component of a plane parameter
1512 * in bytes.
1513 */
1514 const unsigned chan_sz = 4;
1515 struct elk_reg reg;
1516
1517 /* Calculate the base register on the thread payload of
1518 * either the block of vertex setup data or the block of
1519 * per-primitive constant data depending on whether we're
1520 * accessing a primitive or vertex input. Also calculate
1521 * the index of the input within that block.
1522 */
1523 const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1524 const unsigned base = urb_start +
1525 (per_prim ? 0 :
1526 ALIGN(prog_data->num_per_primitive_inputs / 2,
1527 reg_unit(devinfo)));
1528 const unsigned idx = per_prim ? inst->src[i].nr :
1529 inst->src[i].nr - prog_data->num_per_primitive_inputs;
1530
1531 /* Translate the offset within the param_width-wide
1532 * representation described above into an offset and a
1533 * grf, which contains the plane parameters for the first
1534 * polygon processed by the thread.
1535 *
1536 * Earlier platforms and per-primitive block pack 2 logical
1537 * input components per 32B register.
1538 */
1539 const unsigned grf = base + idx / 2;
1540 assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1541 const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1542 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1543 inst->src[i].offset % chan_sz;
1544 reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1545 delta);
1546
1547 const unsigned width = inst->src[i].stride == 0 ?
1548 1 : MIN2(inst->exec_size, 8);
1549 reg = stride(reg, width * inst->src[i].stride,
1550 width, inst->src[i].stride);
1551
1552 reg.abs = inst->src[i].abs;
1553 reg.negate = inst->src[i].negate;
1554 inst->src[i] = reg;
1555 }
1556 }
1557 }
1558
1559 /* Each attribute is 4 setup channels, each of which is half a reg,
1560 * but they may be replicated multiple times for multipolygon
1561 * dispatch.
1562 */
1563 this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1564
1565 /* Unlike regular attributes, per-primitive attributes have all 4 channels
1566 * in the same slot, so each GRF can store two slots.
1567 */
1568 assert(prog_data->num_per_primitive_inputs % 2 == 0);
1569 this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
1570 }
1571
1572 void
1573 elk_fs_visitor::convert_attr_sources_to_hw_regs(elk_fs_inst *inst)
1574 {
1575 for (int i = 0; i < inst->sources; i++) {
1576 if (inst->src[i].file == ATTR) {
1577 assert(inst->src[i].nr == 0);
1578 int grf = payload().num_regs +
1579 prog_data->curb_read_length +
1580 inst->src[i].offset / REG_SIZE;
1581
1582 /* As explained at elk_reg_from_fs_reg, from the Haswell PRM:
1583 *
1584 * VertStride must be used to cross GRF register boundaries. This
1585 * rule implies that elements within a 'Width' cannot cross GRF
1586 * boundaries.
1587 *
1588 * So, for registers that are large enough, we have to split the exec
1589 * size in two and trust the compression state to sort it out.
1590 */
1591 unsigned total_size = inst->exec_size *
1592 inst->src[i].stride *
1593 type_sz(inst->src[i].type);
1594
1595 assert(total_size <= 2 * REG_SIZE);
1596 const unsigned exec_size =
1597 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1598
1599 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1600 struct elk_reg reg =
1601 stride(byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1602 inst->src[i].offset % REG_SIZE),
1603 exec_size * inst->src[i].stride,
1604 width, inst->src[i].stride);
1605 reg.abs = inst->src[i].abs;
1606 reg.negate = inst->src[i].negate;
1607
1608 inst->src[i] = reg;
1609 }
1610 }
1611 }
1612
1613 void
1614 elk_fs_visitor::assign_vs_urb_setup()
1615 {
1616 struct elk_vs_prog_data *vs_prog_data = elk_vs_prog_data(prog_data);
1617
1618 assert(stage == MESA_SHADER_VERTEX);
1619
1620 /* Each attribute is 4 regs. */
1621 this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1622
1623 assert(vs_prog_data->base.urb_read_length <= 15);
1624
1625 /* Rewrite all ATTR file references to the hw grf that they land in. */
1626 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1627 convert_attr_sources_to_hw_regs(inst);
1628 }
1629 }
1630
1631 void
1632 elk_fs_visitor::assign_tcs_urb_setup()
1633 {
1634 assert(stage == MESA_SHADER_TESS_CTRL);
1635
1636 /* Rewrite all ATTR file references to HW_REGs. */
1637 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1638 convert_attr_sources_to_hw_regs(inst);
1639 }
1640 }
1641
1642 void
1643 elk_fs_visitor::assign_tes_urb_setup()
1644 {
1645 assert(stage == MESA_SHADER_TESS_EVAL);
1646
1647 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1648
1649 first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1650
1651 /* Rewrite all ATTR file references to HW_REGs. */
1652 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1653 convert_attr_sources_to_hw_regs(inst);
1654 }
1655 }
1656
1657 void
1658 elk_fs_visitor::assign_gs_urb_setup()
1659 {
1660 assert(stage == MESA_SHADER_GEOMETRY);
1661
1662 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1663
1664 first_non_payload_grf +=
1665 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1666
1667 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1668 /* Rewrite all ATTR file references to GRFs. */
1669 convert_attr_sources_to_hw_regs(inst);
1670 }
1671 }
1672
1673
1674 /**
1675 * Split large virtual GRFs into separate components if we can.
1676 *
1677 * This pass aggressively splits VGRFs into chunks as small as possible,
1678 * down to single registers if it can. If no VGRFs can be split, we return
1679 * false so this pass can safely be used inside an optimization loop. We
1680 * want to split, because virtual GRFs are what we register allocate and
1681 * spill (due to contiguousness requirements for some instructions), and
1682 * they're what we naturally generate in the codegen process, but most
1683 * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
1684 * split, we'll end up with reduced live intervals and better dead code
1685 * elimination and coalescing.
1686 */
1687 bool
1688 elk_fs_visitor::split_virtual_grfs()
1689 {
1690 /* Compact the register file so we eliminate dead vgrfs. This
1691 * only defines split points for live registers, so overly large
1692 * dead registers would hit assertions later.
1693 */
1694 compact_virtual_grfs();
1695
1696 unsigned num_vars = this->alloc.count;
1697
1698 /* Count the total number of registers */
1699 unsigned reg_count = 0;
1700 unsigned vgrf_to_reg[num_vars];
1701 for (unsigned i = 0; i < num_vars; i++) {
1702 vgrf_to_reg[i] = reg_count;
1703 reg_count += alloc.sizes[i];
1704 }
1705
1706 /* An array of "split points". For each register slot, this indicates
1707 * if this slot can be separated from the previous slot. Every time an
1708 * instruction uses multiple elements of a register (as a source or
1709 * destination), we mark the used slots as inseparable. Then we go
1710 * through and split the registers into the smallest pieces we can.
1711 */
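/* A rough illustration (register numbers and sizes made up): suppose a
 * 4-slot VGRF is written by a SIMD16 MOV covering slots 0-1 as one region
 * and read by two SIMD8 MOVs touching slots 2 and 3 individually. Slot 1's
 * split point gets cleared because it is used together with slot 0, while
 * the split points for slots 2 and 3 stay set, so the VGRF ends up split
 * into pieces of sizes 2, 1 and 1.
 */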
1712 bool *split_points = new bool[reg_count];
1713 memset(split_points, 0, reg_count * sizeof(*split_points));
1714
1715 /* Mark all used registers as fully splittable */
1716 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1717 if (inst->dst.file == VGRF) {
1718 unsigned reg = vgrf_to_reg[inst->dst.nr];
1719 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1720 split_points[reg + j] = true;
1721 }
1722
1723 for (unsigned i = 0; i < inst->sources; i++) {
1724 if (inst->src[i].file == VGRF) {
1725 unsigned reg = vgrf_to_reg[inst->src[i].nr];
1726 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1727 split_points[reg + j] = true;
1728 }
1729 }
1730 }
1731
1732 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1733 /* We fix up undef instructions later */
1734 if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1735 assert(inst->dst.file == VGRF);
1736 continue;
1737 }
1738
1739 if (inst->dst.file == VGRF) {
1740 unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1741 for (unsigned j = 1; j < regs_written(inst); j++)
1742 split_points[reg + j] = false;
1743 }
1744 for (unsigned i = 0; i < inst->sources; i++) {
1745 if (inst->src[i].file == VGRF) {
1746 unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1747 for (unsigned j = 1; j < regs_read(inst, i); j++)
1748 split_points[reg + j] = false;
1749 }
1750 }
1751 }
1752
1753 /* Bitset of which registers have been split */
1754 bool *vgrf_has_split = new bool[num_vars];
1755 memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
1756
1757 unsigned *new_virtual_grf = new unsigned[reg_count];
1758 unsigned *new_reg_offset = new unsigned[reg_count];
1759
1760 unsigned reg = 0;
1761 bool has_splits = false;
1762 for (unsigned i = 0; i < num_vars; i++) {
1763 /* The first split point should always be false, as a quick sanity check. */
1764 assert(split_points[reg] == false);
1765
1766 /* j = 0 case */
1767 new_reg_offset[reg] = 0;
1768 reg++;
1769 unsigned offset = 1;
1770
1771 /* j > 0 case */
1772 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1773 /* If this is a split point, reset the offset to 0 and allocate a
1774 * new virtual GRF covering the preceding "offset" registers.
1775 */
1776 if (split_points[reg]) {
1777 has_splits = true;
1778 vgrf_has_split[i] = true;
1779 assert(offset <= MAX_VGRF_SIZE(devinfo));
1780 unsigned grf = alloc.allocate(offset);
1781 for (unsigned k = reg - offset; k < reg; k++)
1782 new_virtual_grf[k] = grf;
1783 offset = 0;
1784 }
1785 new_reg_offset[reg] = offset;
1786 offset++;
1787 reg++;
1788 }
1789
1790 /* The last one gets the original register number */
1791 assert(offset <= MAX_VGRF_SIZE(devinfo));
1792 alloc.sizes[i] = offset;
1793 for (unsigned k = reg - offset; k < reg; k++)
1794 new_virtual_grf[k] = i;
1795 }
1796 assert(reg == reg_count);
1797
1798 bool progress;
1799 if (!has_splits) {
1800 progress = false;
1801 goto cleanup;
1802 }
1803
1804 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
1805 if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1806 assert(inst->dst.file == VGRF);
1807 if (vgrf_has_split[inst->dst.nr]) {
1808 const fs_builder ibld(this, block, inst);
1809 assert(inst->size_written % REG_SIZE == 0);
1810 unsigned reg_offset = inst->dst.offset / REG_SIZE;
1811 unsigned size_written = 0;
1812 while (size_written < inst->size_written) {
1813 reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
1814 elk_fs_inst *undef =
1815 ibld.UNDEF(
1816 byte_offset(elk_fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
1817 new_reg_offset[reg] * REG_SIZE));
1818 undef->size_written =
1819 MIN2(inst->size_written - size_written, undef->size_written);
1820 assert(undef->size_written % REG_SIZE == 0);
1821 size_written += undef->size_written;
1822 }
1823 inst->remove(block);
1824 } else {
1825 reg = vgrf_to_reg[inst->dst.nr];
1826 assert(new_reg_offset[reg] == 0);
1827 assert(new_virtual_grf[reg] == inst->dst.nr);
1828 }
1829 continue;
1830 }
1831
1832 if (inst->dst.file == VGRF) {
1833 reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1834 if (vgrf_has_split[inst->dst.nr]) {
1835 inst->dst.nr = new_virtual_grf[reg];
1836 inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
1837 inst->dst.offset % REG_SIZE;
1838 assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1839 } else {
1840 assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
1841 assert(new_virtual_grf[reg] == inst->dst.nr);
1842 }
1843 }
1844 for (unsigned i = 0; i < inst->sources; i++) {
1845 if (inst->src[i].file != VGRF)
1846 continue;
1847
1848 reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1849 if (vgrf_has_split[inst->src[i].nr]) {
1850 inst->src[i].nr = new_virtual_grf[reg];
1851 inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
1852 inst->src[i].offset % REG_SIZE;
1853 assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1854 } else {
1855 assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
1856 assert(new_virtual_grf[reg] == inst->src[i].nr);
1857 }
1858 }
1859 }
1860 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1861
1862 progress = true;
1863
1864 cleanup:
1865 delete[] split_points;
1866 delete[] vgrf_has_split;
1867 delete[] new_virtual_grf;
1868 delete[] new_reg_offset;
1869
1870 return progress;
1871 }
1872
1873 /**
1874 * Remove unused virtual GRFs and compact the vgrf_* arrays.
1875 *
1876 * During code generation, we create tons of temporary variables, many of
1877 * which get immediately killed and are never used again. Yet, in later
1878 * optimization and analysis passes, such as compute_live_intervals, we need
1879 * to loop over all the virtual GRFs. Compacting them can save a lot of
1880 * overhead.
1881 */
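/* A small sketch of the remap (numbers invented): if only vgrf0 and vgrf2
 * out of three VGRFs are referenced, the table starts as { 0, -1, 0 } after
 * the marking loop and becomes { 0, -1, 1 } after compaction, so vgrf2 is
 * renumbered to vgrf1 and alloc.count drops from 3 to 2.
 */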
1882 bool
1883 elk_fs_visitor::compact_virtual_grfs()
1884 {
1885 bool progress = false;
1886 int *remap_table = new int[this->alloc.count];
1887 memset(remap_table, -1, this->alloc.count * sizeof(int));
1888
1889 /* Mark which virtual GRFs are used. */
1890 foreach_block_and_inst(block, const elk_fs_inst, inst, cfg) {
1891 if (inst->dst.file == VGRF)
1892 remap_table[inst->dst.nr] = 0;
1893
1894 for (int i = 0; i < inst->sources; i++) {
1895 if (inst->src[i].file == VGRF)
1896 remap_table[inst->src[i].nr] = 0;
1897 }
1898 }
1899
1900 /* Compact the GRF arrays. */
1901 int new_index = 0;
1902 for (unsigned i = 0; i < this->alloc.count; i++) {
1903 if (remap_table[i] == -1) {
1904 /* We just found an unused register. This means that we are
1905 * actually going to compact something.
1906 */
1907 progress = true;
1908 } else {
1909 remap_table[i] = new_index;
1910 alloc.sizes[new_index] = alloc.sizes[i];
1911 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1912 ++new_index;
1913 }
1914 }
1915
1916 this->alloc.count = new_index;
1917
1918 /* Patch all the instructions to use the newly renumbered registers */
1919 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1920 if (inst->dst.file == VGRF)
1921 inst->dst.nr = remap_table[inst->dst.nr];
1922
1923 for (int i = 0; i < inst->sources; i++) {
1924 if (inst->src[i].file == VGRF)
1925 inst->src[i].nr = remap_table[inst->src[i].nr];
1926 }
1927 }
1928
1929 /* Patch all the references to delta_xy, since they're used in register
1930 * allocation. If they're unused, switch them to BAD_FILE so we don't
1931 * think some random VGRF is delta_xy.
1932 */
1933 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1934 if (delta_xy[i].file == VGRF) {
1935 if (remap_table[delta_xy[i].nr] != -1) {
1936 delta_xy[i].nr = remap_table[delta_xy[i].nr];
1937 } else {
1938 delta_xy[i].file = BAD_FILE;
1939 }
1940 }
1941 }
1942
1943 delete[] remap_table;
1944
1945 return progress;
1946 }
1947
1948 int
1949 elk_get_subgroup_id_param_index(const intel_device_info *devinfo,
1950 const elk_stage_prog_data *prog_data)
1951 {
1952 if (prog_data->nr_params == 0)
1953 return -1;
1954
1955 /* The local thread id is always the last parameter in the list */
1956 uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1957 if (last_param == ELK_PARAM_BUILTIN_SUBGROUP_ID)
1958 return prog_data->nr_params - 1;
1959
1960 return -1;
1961 }
1962
1963 /**
1964 * Assign UNIFORM file registers to either push constants or pull constants.
1965 *
1966 * We allow a fragment shader to have more than the specified minimum
1967 * maximum number of fragment shader uniform components (64). If
1968 * there are too many of these, they'd fill up all of the register space.
1969 * So, this will push some of them out to the pull constant buffer and
1970 * update the program to load them.
1971 */
1972 void
1973 elk_fs_visitor::assign_constant_locations()
1974 {
1975 /* Only the first compile gets to decide on locations. */
1976 if (push_constant_loc)
1977 return;
1978
1979 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1980 for (unsigned u = 0; u < uniforms; u++)
1981 push_constant_loc[u] = u;
1982
1983 /* Now that we know how many regular uniforms we'll push, reduce the
1984 * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1985 */
1986 /* For gen4/5:
1987 * Only allow 16 registers (128 uniform components) as push constants.
1988 *
1989 * If changing this value, note the limitation about total_regs in
1990 * elk_curbe.c/crocus_state.c
1991 */
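/* A rough worked example with made-up sizes: on a platform where
 * max_push_length is 64, if the regular uniforms already take 8 registers
 * and the four UBO ranges ask for 32, 24, 16 and 8 registers, the first two
 * ranges fit (8 + 32 + 24 = 64) and the remaining two are clamped to a
 * length of 0.
 */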
1992 const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
1993 unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1994 for (int i = 0; i < 4; i++) {
1995 struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
1996
1997 if (push_length + range->length > max_push_length)
1998 range->length = max_push_length - push_length;
1999
2000 push_length += range->length;
2001 }
2002 assert(push_length <= max_push_length);
2003 }
2004
2005 bool
2006 elk_fs_visitor::get_pull_locs(const elk_fs_reg &src,
2007 unsigned *out_surf_index,
2008 unsigned *out_pull_index)
2009 {
2010 assert(src.file == UNIFORM);
2011
2012 if (src.nr < UBO_START)
2013 return false;
2014
2015 const struct elk_ubo_range *range =
2016 &prog_data->ubo_ranges[src.nr - UBO_START];
2017
2018 /* If this access is in our (reduced) range, use the push data. */
2019 if (src.offset / 32 < range->length)
2020 return false;
2021
2022 *out_surf_index = range->block;
2023 *out_pull_index = (32 * range->start + src.offset) / 4;
2024
2025 prog_data->has_ubo_pull = true;
2026
2027 return true;
2028 }
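/* A rough worked example for the helper above (all numbers invented): with
 * a UBO range of { block = 5, start = 4, length = 6 } and src.offset = 256,
 * the access lies past the 6 pushed 32-byte chunks (256 / 32 = 8 >= 6), so
 * it is pulled with *out_surf_index = 5 and
 * *out_pull_index = (32 * 4 + 256) / 4 = 96.
 */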
2029
2030 /**
2031 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2032 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2033 */
2034 bool
2035 elk_fs_visitor::lower_constant_loads()
2036 {
2037 unsigned index, pull_index;
2038 bool progress = false;
2039
2040 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
2041 /* Set up the annotation tracking for newly generated instructions. */
2042 const fs_builder ibld(this, block, inst);
2043
2044 for (int i = 0; i < inst->sources; i++) {
2045 if (inst->src[i].file != UNIFORM)
2046 continue;
2047
2048 /* We'll handle this case later */
2049 if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT && i == 0)
2050 continue;
2051
2052 if (!get_pull_locs(inst->src[i], &index, &pull_index))
2053 continue;
2054
2055 assert(inst->src[i].stride == 0);
2056
2057 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2058 const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2059 const elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2060 const unsigned base = pull_index * 4;
2061
2062 elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
2063 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = elk_imm_ud(index);
2064 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = elk_imm_ud(base & ~(block_sz - 1));
2065 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = elk_imm_ud(block_sz);
2066
2067
2068 ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
2069 srcs, PULL_UNIFORM_CONSTANT_SRCS);
2070
2071 /* Rewrite the instruction to use the temporary VGRF. */
2072 inst->src[i].file = VGRF;
2073 inst->src[i].nr = dst.nr;
2074 inst->src[i].offset = (base & (block_sz - 1)) +
2075 inst->src[i].offset % 4;
2076
2077 progress = true;
2078 }
2079
2080 if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT &&
2081 inst->src[0].file == UNIFORM) {
2082
2083 if (!get_pull_locs(inst->src[0], &index, &pull_index))
2084 continue;
2085
2086 VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2087 elk_imm_ud(index),
2088 elk_fs_reg() /* surface_handle */,
2089 inst->src[1],
2090 pull_index * 4, 4, 1);
2091 inst->remove(block);
2092
2093 progress = true;
2094 }
2095 }
2096 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2097
2098 return progress;
2099 }
2100
2101 static uint64_t
2102 src_as_uint(const elk_fs_reg &src)
2103 {
2104 assert(src.file == IMM);
2105
2106 switch (src.type) {
2107 case ELK_REGISTER_TYPE_W:
2108 return (uint64_t)(int16_t)(src.ud & 0xffff);
2109
2110 case ELK_REGISTER_TYPE_UW:
2111 return (uint64_t)(uint16_t)(src.ud & 0xffff);
2112
2113 case ELK_REGISTER_TYPE_D:
2114 return (uint64_t)src.d;
2115
2116 case ELK_REGISTER_TYPE_UD:
2117 return (uint64_t)src.ud;
2118
2119 case ELK_REGISTER_TYPE_Q:
2120 return src.d64;
2121
2122 case ELK_REGISTER_TYPE_UQ:
2123 return src.u64;
2124
2125 default:
2126 unreachable("Invalid integer type.");
2127 }
2128 }
2129
2130 static elk_fs_reg
2131 elk_imm_for_type(uint64_t value, enum elk_reg_type type)
2132 {
2133 switch (type) {
2134 case ELK_REGISTER_TYPE_W:
2135 return elk_imm_w(value);
2136
2137 case ELK_REGISTER_TYPE_UW:
2138 return elk_imm_uw(value);
2139
2140 case ELK_REGISTER_TYPE_D:
2141 return elk_imm_d(value);
2142
2143 case ELK_REGISTER_TYPE_UD:
2144 return elk_imm_ud(value);
2145
2146 case ELK_REGISTER_TYPE_Q:
2147 return elk_imm_d(value);
2148
2149 case ELK_REGISTER_TYPE_UQ:
2150 return elk_imm_uq(value);
2151
2152 default:
2153 unreachable("Invalid integer type.");
2154 }
2155 }
2156
2157 bool
2158 elk_fs_visitor::opt_algebraic()
2159 {
2160 bool progress = false;
2161
2162 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2163 switch (inst->opcode) {
2164 case ELK_OPCODE_MOV:
2165 if (!devinfo->has_64bit_float &&
2166 inst->dst.type == ELK_REGISTER_TYPE_DF) {
2167 assert(inst->dst.type == inst->src[0].type);
2168 assert(!inst->saturate);
2169 assert(!inst->src[0].abs);
2170 assert(!inst->src[0].negate);
2171 const elk::fs_builder ibld(this, block, inst);
2172
2173 if (!inst->is_partial_write())
2174 ibld.emit_undef_for_dst(inst);
2175
2176 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 1),
2177 subscript(inst->src[0], ELK_REGISTER_TYPE_F, 1));
2178 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 0),
2179 subscript(inst->src[0], ELK_REGISTER_TYPE_F, 0));
2180
2181 inst->remove(block);
2182 progress = true;
2183 }
2184
2185 if (!devinfo->has_64bit_int &&
2186 (inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2187 inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2188 assert(inst->dst.type == inst->src[0].type);
2189 assert(!inst->saturate);
2190 assert(!inst->src[0].abs);
2191 assert(!inst->src[0].negate);
2192 const elk::fs_builder ibld(this, block, inst);
2193
2194 if (!inst->is_partial_write())
2195 ibld.emit_undef_for_dst(inst);
2196
2197 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2198 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1));
2199 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2200 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0));
2201
2202 inst->remove(block);
2203 progress = true;
2204 }
2205
2206 if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2207 inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2208 inst->dst.is_null() &&
2209 (inst->src[0].abs || inst->src[0].negate)) {
2210 inst->src[0].abs = false;
2211 inst->src[0].negate = false;
2212 progress = true;
2213 break;
2214 }
2215
2216 if (inst->src[0].file != IMM)
2217 break;
2218
2219 if (inst->saturate) {
2220 /* Full mixed-type saturates don't happen. However, we can end up
2221 * with things like:
2222 *
2223 * mov.sat(8) g21<1>DF -1F
2224 *
2225 * Other mixed-size-but-same-base-type cases may also be possible.
2226 */
2227 if (inst->dst.type != inst->src[0].type &&
2228 inst->dst.type != ELK_REGISTER_TYPE_DF &&
2229 inst->src[0].type != ELK_REGISTER_TYPE_F)
2230 assert(!"unimplemented: saturate mixed types");
2231
2232 if (elk_saturate_immediate(inst->src[0].type,
2233 &inst->src[0].as_elk_reg())) {
2234 inst->saturate = false;
2235 progress = true;
2236 }
2237 }
2238 break;
2239
2240 case ELK_OPCODE_MUL:
2241 if (inst->src[1].file != IMM)
2242 continue;
2243
2244 if (elk_reg_type_is_floating_point(inst->src[1].type))
2245 break;
2246
2247 /* From the BDW PRM, Vol 2a, "mul - Multiply":
2248 *
2249 * "When multiplying integer datatypes, if src0 is DW and src1
2250 * is W, irrespective of the destination datatype, the
2251 * accumulator maintains full 48-bit precision."
2252 * ...
2253 * "When multiplying integer data types, if one of the sources
2254 * is a DW, the resulting full precision data is stored in
2255 * the accumulator."
2256 *
2257 * There are also similar notes in earlier PRMs.
2258 *
2259 * The MOV instruction can copy the bits of the source, but it
2260 * does not clear the higher bits of the accumulator. So, because
2261 * we might use the full accumulator in the MUL/MACH macro, we
2262 * shouldn't replace such MULs with MOVs.
2263 */
2264 if ((elk_reg_type_to_size(inst->src[0].type) == 4 ||
2265 elk_reg_type_to_size(inst->src[1].type) == 4) &&
2266 (inst->dst.is_accumulator() ||
2267 inst->writes_accumulator_implicitly(devinfo)))
2268 break;
2269
2270 /* a * 1.0 = a */
2271 if (inst->src[1].is_one()) {
2272 inst->opcode = ELK_OPCODE_MOV;
2273 inst->sources = 1;
2274 inst->src[1] = reg_undef;
2275 progress = true;
2276 break;
2277 }
2278
2279 /* a * -1.0 = -a */
2280 if (inst->src[1].is_negative_one()) {
2281 inst->opcode = ELK_OPCODE_MOV;
2282 inst->sources = 1;
2283 inst->src[0].negate = !inst->src[0].negate;
2284 inst->src[1] = reg_undef;
2285 progress = true;
2286 break;
2287 }
2288
2289 break;
2290 case ELK_OPCODE_ADD:
2291 if (inst->src[1].file != IMM)
2292 continue;
2293
2294 if (elk_reg_type_is_integer(inst->src[1].type) &&
2295 inst->src[1].is_zero()) {
2296 inst->opcode = ELK_OPCODE_MOV;
2297 inst->sources = 1;
2298 inst->src[1] = reg_undef;
2299 progress = true;
2300 break;
2301 }
2302
2303 if (inst->src[0].file == IMM) {
2304 assert(inst->src[0].type == ELK_REGISTER_TYPE_F);
2305 inst->opcode = ELK_OPCODE_MOV;
2306 inst->sources = 1;
2307 inst->src[0].f += inst->src[1].f;
2308 inst->src[1] = reg_undef;
2309 progress = true;
2310 break;
2311 }
2312 break;
2313
2314 case ELK_OPCODE_AND:
2315 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2316 const uint64_t src0 = src_as_uint(inst->src[0]);
2317 const uint64_t src1 = src_as_uint(inst->src[1]);
2318
2319 inst->opcode = ELK_OPCODE_MOV;
2320 inst->sources = 1;
2321 inst->src[0] = elk_imm_for_type(src0 & src1, inst->dst.type);
2322 inst->src[1] = reg_undef;
2323 progress = true;
2324 break;
2325 }
2326
2327 break;
2328
2329 case ELK_OPCODE_OR:
2330 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2331 const uint64_t src0 = src_as_uint(inst->src[0]);
2332 const uint64_t src1 = src_as_uint(inst->src[1]);
2333
2334 inst->opcode = ELK_OPCODE_MOV;
2335 inst->sources = 1;
2336 inst->src[0] = elk_imm_for_type(src0 | src1, inst->dst.type);
2337 inst->src[1] = reg_undef;
2338 progress = true;
2339 break;
2340 }
2341
2342 if (inst->src[0].equals(inst->src[1]) ||
2343 inst->src[1].is_zero()) {
2344 /* On Gfx8+, the OR instruction can have a source modifier that
2345 * performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
2346 * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2347 */
2348 if (inst->src[0].negate) {
2349 inst->opcode = ELK_OPCODE_NOT;
2350 inst->sources = 1;
2351 inst->src[0].negate = false;
2352 } else {
2353 inst->opcode = ELK_OPCODE_MOV;
2354 inst->sources = 1;
2355 }
2356 inst->src[1] = reg_undef;
2357 progress = true;
2358 break;
2359 }
2360 break;
2361 case ELK_OPCODE_CMP:
2362 if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2363 inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2364 inst->src[1].is_zero() &&
2365 (inst->src[0].abs || inst->src[0].negate)) {
2366 inst->src[0].abs = false;
2367 inst->src[0].negate = false;
2368 progress = true;
2369 break;
2370 }
2371 break;
2372 case ELK_OPCODE_SEL:
2373 if (!devinfo->has_64bit_float &&
2374 !devinfo->has_64bit_int &&
2375 (inst->dst.type == ELK_REGISTER_TYPE_DF ||
2376 inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2377 inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2378 assert(inst->dst.type == inst->src[0].type);
2379 assert(!inst->saturate);
2380 assert(!inst->src[0].abs && !inst->src[0].negate);
2381 assert(!inst->src[1].abs && !inst->src[1].negate);
2382 const elk::fs_builder ibld(this, block, inst);
2383
2384 if (!inst->is_partial_write())
2385 ibld.emit_undef_for_dst(inst);
2386
2387 set_predicate(inst->predicate,
2388 ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2389 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
2390 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0)));
2391 set_predicate(inst->predicate,
2392 ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2393 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
2394 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1)));
2395
2396 inst->remove(block);
2397 progress = true;
2398 }
2399 if (inst->src[0].equals(inst->src[1])) {
2400 inst->opcode = ELK_OPCODE_MOV;
2401 inst->sources = 1;
2402 inst->src[1] = reg_undef;
2403 inst->predicate = ELK_PREDICATE_NONE;
2404 inst->predicate_inverse = false;
2405 progress = true;
2406 } else if (inst->saturate && inst->src[1].file == IMM) {
2407 switch (inst->conditional_mod) {
2408 case ELK_CONDITIONAL_LE:
2409 case ELK_CONDITIONAL_L:
2410 switch (inst->src[1].type) {
2411 case ELK_REGISTER_TYPE_F:
2412 if (inst->src[1].f >= 1.0f) {
2413 inst->opcode = ELK_OPCODE_MOV;
2414 inst->sources = 1;
2415 inst->src[1] = reg_undef;
2416 inst->conditional_mod = ELK_CONDITIONAL_NONE;
2417 progress = true;
2418 }
2419 break;
2420 default:
2421 break;
2422 }
2423 break;
2424 case ELK_CONDITIONAL_GE:
2425 case ELK_CONDITIONAL_G:
2426 switch (inst->src[1].type) {
2427 case ELK_REGISTER_TYPE_F:
2428 if (inst->src[1].f <= 0.0f) {
2429 inst->opcode = ELK_OPCODE_MOV;
2430 inst->sources = 1;
2431 inst->src[1] = reg_undef;
2432 inst->conditional_mod = ELK_CONDITIONAL_NONE;
2433 progress = true;
2434 }
2435 break;
2436 default:
2437 break;
2438 }
2439 default:
2440 break;
2441 }
2442 }
2443 break;
2444 case ELK_OPCODE_MAD:
2445 if (inst->src[0].type != ELK_REGISTER_TYPE_F ||
2446 inst->src[1].type != ELK_REGISTER_TYPE_F ||
2447 inst->src[2].type != ELK_REGISTER_TYPE_F)
2448 break;
2449 if (inst->src[1].is_one()) {
2450 inst->opcode = ELK_OPCODE_ADD;
2451 inst->sources = 2;
2452 inst->src[1] = inst->src[2];
2453 inst->src[2] = reg_undef;
2454 progress = true;
2455 } else if (inst->src[2].is_one()) {
2456 inst->opcode = ELK_OPCODE_ADD;
2457 inst->sources = 2;
2458 inst->src[2] = reg_undef;
2459 progress = true;
2460 }
2461 break;
2462 case ELK_OPCODE_SHL:
2463 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2464 /* It's not currently possible to generate this, and this constant
2465 * folding does not handle it.
2466 */
2467 assert(!inst->saturate);
2468
2469 elk_fs_reg result;
2470
2471 switch (type_sz(inst->src[0].type)) {
2472 case 2:
2473 result = elk_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
2474 break;
2475 case 4:
2476 result = elk_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
2477 break;
2478 case 8:
2479 result = elk_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
2480 break;
2481 default:
2482 /* Just in case a future platform re-enables B or UB types. */
2483 unreachable("Invalid source size.");
2484 }
2485
2486 inst->opcode = ELK_OPCODE_MOV;
2487 inst->src[0] = retype(result, inst->dst.type);
2488 inst->src[1] = reg_undef;
2489 inst->sources = 1;
2490
2491 progress = true;
2492 }
2493 break;
2494
2495 case ELK_SHADER_OPCODE_BROADCAST:
2496 if (is_uniform(inst->src[0])) {
2497 inst->opcode = ELK_OPCODE_MOV;
2498 inst->sources = 1;
2499 inst->force_writemask_all = true;
2500 progress = true;
2501 } else if (inst->src[1].file == IMM) {
2502 inst->opcode = ELK_OPCODE_MOV;
2503 /* It's possible that the selected component will be too large and
2504 * overflow the register. This can happen if someone does a
2505 * readInvocation() from GLSL or SPIR-V and provides an OOB
2506 * invocationIndex. If this happens and we somehow manage
2507 * to constant fold it in and get here, then component() may cause
2508 * us to start reading outside of the VGRF which will lead to an
2509 * assert later. Instead, just let it wrap around if it goes over
2510 * exec_size.
2511 */
2512 const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
2513 inst->src[0] = component(inst->src[0], comp);
2514 inst->sources = 1;
2515 inst->force_writemask_all = true;
2516 progress = true;
2517 }
2518 break;
2519
2520 case ELK_SHADER_OPCODE_SHUFFLE:
2521 if (is_uniform(inst->src[0])) {
2522 inst->opcode = ELK_OPCODE_MOV;
2523 inst->sources = 1;
2524 progress = true;
2525 } else if (inst->src[1].file == IMM) {
2526 inst->opcode = ELK_OPCODE_MOV;
2527 inst->src[0] = component(inst->src[0],
2528 inst->src[1].ud);
2529 inst->sources = 1;
2530 progress = true;
2531 }
2532 break;
2533
2534 default:
2535 break;
2536 }
2537
2538 /* Ensure that the correct source has the immediate value. 2-source
2539 * instructions must have the immediate in src[1]. On Gfx12 and later,
2540 * some 3-source instructions can have the immediate in src[0] or
2541 * src[2]. It's complicated, so don't mess with 3-source instructions
2542 * here.
2543 */
2544 if (progress && inst->sources == 2 && inst->is_commutative()) {
2545 if (inst->src[0].file == IMM) {
2546 elk_fs_reg tmp = inst->src[1];
2547 inst->src[1] = inst->src[0];
2548 inst->src[0] = tmp;
2549 }
2550 }
2551 }
2552
2553 if (progress)
2554 invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
2555 DEPENDENCY_INSTRUCTION_DETAIL);
2556
2557 return progress;
2558 }
2559
2560 static unsigned
2561 load_payload_sources_read_for_size(elk_fs_inst *lp, unsigned size_read)
2562 {
2563 assert(lp->opcode == ELK_SHADER_OPCODE_LOAD_PAYLOAD);
2564 assert(size_read >= lp->header_size * REG_SIZE);
2565
2566 unsigned i;
2567 unsigned size = lp->header_size * REG_SIZE;
2568 for (i = lp->header_size; size < size_read && i < lp->sources; i++)
2569 size += lp->exec_size * type_sz(lp->src[i].type);
2570
2571 /* The size read must exactly cover the header plus a prefix of the sources. */
2572 assert(size == size_read);
2573 return i;
2574 }
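/* A quick sketch of the helper above (sizes invented): for a SIMD8
 * LOAD_PAYLOAD with header_size = 1 and three UD payload sources, the
 * header contributes REG_SIZE bytes and each payload source 8 * 4 = 32
 * bytes, so a size_read of 96 bytes covers the header plus the first two
 * payload sources and the function returns 3.
 */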
2575
2576 /**
2577 * Optimize sample messages that have constant zero values for the trailing
2578 * parameters. We can just reduce the message length for these
2579 * instructions instead of reserving a register for it. Trailing parameters
2580 * that aren't sent default to zero anyway. This will cause the dead code
2581 * eliminator to remove the MOV instruction that would otherwise be emitted to
2582 * set up the zero value.
2583 */
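/* As a rough sketch (payload layout invented): a SIMD8 sampler SEND whose
 * LOAD_PAYLOAD ends with an immediate-zero LOD parameter can have its mlen
 * reduced by one GRF; the now-unread MOV that would have set up the zero is
 * then cleaned up by dead code elimination.
 */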
2584 bool
2585 elk_fs_visitor::opt_zero_samples()
2586 {
2587 /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
2588 assert(devinfo->ver >= 7);
2589
2590 bool progress = false;
2591
2592 foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2593 if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2594 send->sfid != ELK_SFID_SAMPLER)
2595 continue;
2596
2597 /* Wa_14012688258:
2598 *
2599 * Don't trim zeros at the end of payload for sample operations
2600 * in cube and cube arrays.
2601 */
2602 if (send->keep_payload_trailing_zeros)
2603 continue;
2604
2605 elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2606
2607 if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2608 continue;
2609
2610 /* How much of the payload is actually read by this SEND. */
2611 const unsigned params =
2612 load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2613
2614 /* We don't want to remove the message header or the first parameter.
2615 * Removing the first parameter is not allowed, see the Haswell PRM
2616 * volume 7, page 149:
2617 *
2618 * "Parameter 0 is required except for the sampleinfo message, which
2619 * has no parameter 0"
2620 */
2621 const unsigned first_param_idx = lp->header_size;
2622 unsigned zero_size = 0;
2623 for (unsigned i = params - 1; i > first_param_idx; i--) {
2624 if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
2625 break;
2626 zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
2627 }
2628
2629 const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE);
2630 if (zero_len > 0) {
2631 send->mlen -= zero_len;
2632 progress = true;
2633 }
2634 }
2635
2636 if (progress)
2637 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2638
2639 return progress;
2640 }
2641
2642 /**
2643 * Remove redundant or useless halts.
2644 *
2645 * For example, we can eliminate halts in the following sequence:
2646 *
2647 * halt (redundant with the next halt)
2648 * halt (useless; jumps to the next instruction)
2649 * halt-target
2650 */
2651 bool
2652 elk_fs_visitor::opt_redundant_halt()
2653 {
2654 bool progress = false;
2655
2656 unsigned halt_count = 0;
2657 elk_fs_inst *halt_target = NULL;
2658 elk_bblock_t *halt_target_block = NULL;
2659 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2660 if (inst->opcode == ELK_OPCODE_HALT)
2661 halt_count++;
2662
2663 if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
2664 halt_target = inst;
2665 halt_target_block = block;
2666 break;
2667 }
2668 }
2669
2670 if (!halt_target) {
2671 assert(halt_count == 0);
2672 return false;
2673 }
2674
2675 /* Delete any HALTs immediately before the halt target. */
2676 for (elk_fs_inst *prev = (elk_fs_inst *) halt_target->prev;
2677 !prev->is_head_sentinel() && prev->opcode == ELK_OPCODE_HALT;
2678 prev = (elk_fs_inst *) halt_target->prev) {
2679 prev->remove(halt_target_block);
2680 halt_count--;
2681 progress = true;
2682 }
2683
2684 if (halt_count == 0) {
2685 halt_target->remove(halt_target_block);
2686 progress = true;
2687 }
2688
2689 if (progress)
2690 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2691
2692 return progress;
2693 }
2694
2695 /**
2696 * Compute a bitmask with GRF granularity with a bit set for each GRF starting
2697 * from \p r.offset which overlaps the region starting at \p s.offset and
2698 * spanning \p ds bytes.
2699 */
2700 static inline unsigned
2701 mask_relative_to(const elk_fs_reg &r, const elk_fs_reg &s, unsigned ds)
2702 {
2703 const int rel_offset = reg_offset(s) - reg_offset(r);
2704 const int shift = rel_offset / REG_SIZE;
2705 const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
2706 assert(reg_space(r) == reg_space(s) &&
2707 shift >= 0 && shift < int(8 * sizeof(unsigned)));
2708 return ((1 << n) - 1) << shift;
2709 }
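/* A rough worked example (offsets invented, REG_SIZE = 32): if r starts at
 * byte 0 of its VGRF and s starts at byte 40 with ds = 56, then
 * rel_offset = 40, shift = 1 and n = DIV_ROUND_UP(8 + 56, 32) = 2, so the
 * returned mask is 0b110, i.e. the second and third GRFs of r's region.
 */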
2710
2711 bool
2712 elk_fs_visitor::compute_to_mrf()
2713 {
2714 bool progress = false;
2715 int next_ip = 0;
2716
2717 /* No MRFs on Gen >= 7. */
2718 if (devinfo->ver >= 7)
2719 return false;
2720
2721 const fs_live_variables &live = live_analysis.require();
2722
2723 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2724 int ip = next_ip;
2725 next_ip++;
2726
2727 if (inst->opcode != ELK_OPCODE_MOV ||
2728 inst->is_partial_write() ||
2729 inst->dst.file != MRF || inst->src[0].file != VGRF ||
2730 inst->dst.type != inst->src[0].type ||
2731 inst->src[0].abs || inst->src[0].negate ||
2732 !inst->src[0].is_contiguous() ||
2733 inst->src[0].offset % REG_SIZE != 0)
2734 continue;
2735
2736 /* Can't compute-to-MRF this GRF if someone else was going to
2737 * read it later.
2738 */
2739 if (live.vgrf_end[inst->src[0].nr] > ip)
2740 continue;
2741
2742 /* Found a move of a GRF to an MRF. Let's see if we can rewrite the
2743 * instructions that computed the value of every GRF in the source region. The
2744 * regs_left bitset keeps track of the registers we haven't yet found a
2745 * generating instruction for.
2746 */
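/* Roughly, with invented register numbers, the transformation we are
 * looking for is:
 *
 *    add(8) vgrf5, vgrf1, vgrf2
 *    mov(8) m4, vgrf5
 * =>
 *    add(8) m4, vgrf1, vgrf2
 *
 * provided vgrf5 is not read again afterwards.
 */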
2747 unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
2748
2749 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2750 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2751 inst->src[0], inst->size_read(0))) {
2752 /* Found the last instruction to write the register we want to
2753 * turn into a compute-to-MRF.
2754 */
2755
2756 /* If this one instruction didn't populate all the
2757 * channels, bail. We might be able to rewrite everything
2758 * that writes that reg, but it would require smarter
2759 * tracking.
2760 */
2761 if (scan_inst->is_partial_write())
2762 break;
2763
2764 /* Handling things not fully contained in the source of the copy
2765 * would need us to understand coalescing out more than one MOV at
2766 * a time.
2767 */
2768 if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
2769 inst->src[0], inst->size_read(0)))
2770 break;
2771
2772 /* SEND instructions can't have MRF as a destination. */
2773 if (scan_inst->mlen)
2774 break;
2775
2776 if (devinfo->ver == 6) {
2777 /* gfx6 math instructions must have the destination be
2778 * GRF, so no compute-to-MRF for them.
2779 */
2780 if (scan_inst->is_math()) {
2781 break;
2782 }
2783 }
2784
2785 /* Clear the bits for any registers this instruction overwrites. */
2786 regs_left &= ~mask_relative_to(
2787 inst->src[0], scan_inst->dst, scan_inst->size_written);
2788 if (!regs_left)
2789 break;
2790 }
2791
2792 /* We don't handle control flow here. Most computation of
2793 * values that end up in MRFs are shortly before the MRF
2794 * write anyway.
2795 */
2796 if (block->start() == scan_inst)
2797 break;
2798
2799 /* You can't read from an MRF, so if someone else reads our
2800 * MRF's source GRF that we wanted to rewrite, that stops us.
2801 */
2802 bool interfered = false;
2803 for (int i = 0; i < scan_inst->sources; i++) {
2804 if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
2805 inst->src[0], inst->size_read(0))) {
2806 interfered = true;
2807 }
2808 }
2809 if (interfered)
2810 break;
2811
2812 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2813 inst->dst, inst->size_written)) {
2814 /* If somebody else writes our MRF here, we can't
2815 * compute-to-MRF before that.
2816 */
2817 break;
2818 }
2819
2820 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
2821 regions_overlap(elk_fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
2822 inst->dst, inst->size_written)) {
2823 /* Found a SEND instruction, which means that there are
2824 * live values in MRFs from base_mrf to base_mrf +
2825 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2826 * above it.
2827 */
2828 break;
2829 }
2830 }
2831
2832 if (regs_left)
2833 continue;
2834
2835 /* Found all generating instructions of our MRF's source value, so it
2836 * should be safe to rewrite them to point to the MRF directly.
2837 */
2838 regs_left = (1 << regs_read(inst, 0)) - 1;
2839
2840 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2841 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2842 inst->src[0], inst->size_read(0))) {
2843 /* Clear the bits for any registers this instruction overwrites. */
2844 regs_left &= ~mask_relative_to(
2845 inst->src[0], scan_inst->dst, scan_inst->size_written);
2846
2847 const unsigned rel_offset = reg_offset(scan_inst->dst) -
2848 reg_offset(inst->src[0]);
2849
2850 if (inst->dst.nr & ELK_MRF_COMPR4) {
2851 /* Apply the same address transformation done by the hardware
2852 * for COMPR4 MRF writes.
2853 */
2854 assert(rel_offset < 2 * REG_SIZE);
2855 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
2856
2857 /* Clear the COMPR4 bit if the generating instruction is not
2858 * compressed.
2859 */
2860 if (scan_inst->size_written < 2 * REG_SIZE)
2861 scan_inst->dst.nr &= ~ELK_MRF_COMPR4;
2862
2863 } else {
2864 /* Calculate the MRF number the result of this instruction is
2865 * ultimately written to.
2866 */
2867 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
2868 }
2869
2870 scan_inst->dst.file = MRF;
2871 scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
2872 scan_inst->saturate |= inst->saturate;
2873 if (!regs_left)
2874 break;
2875 }
2876 }
2877
2878 assert(!regs_left);
2879 inst->remove(block);
2880 progress = true;
2881 }
2882
2883 if (progress)
2884 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2885
2886 return progress;
2887 }
2888
2889 /**
2890 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2891 * flow. We could probably do better here with some form of divergence
2892 * analysis.
2893 */
2894 bool
2895 elk_fs_visitor::eliminate_find_live_channel()
2896 {
2897 bool progress = false;
2898 unsigned depth = 0;
2899
2900 if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
2901 /* The optimization below assumes that channel zero is live on thread
2902 * dispatch, which may not be the case if the fixed function dispatches
2903 * threads sparsely.
2904 */
2905 return false;
2906 }
2907
2908 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2909 switch (inst->opcode) {
2910 case ELK_OPCODE_IF:
2911 case ELK_OPCODE_DO:
2912 depth++;
2913 break;
2914
2915 case ELK_OPCODE_ENDIF:
2916 case ELK_OPCODE_WHILE:
2917 depth--;
2918 break;
2919
2920 case ELK_OPCODE_HALT:
2921 /* This can potentially make control flow non-uniform until the end
2922 * of the program.
2923 */
2924 goto out;
2925
2926 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
2927 if (depth == 0) {
2928 inst->opcode = ELK_OPCODE_MOV;
2929 inst->src[0] = elk_imm_ud(0u);
2930 inst->sources = 1;
2931 inst->force_writemask_all = true;
2932 progress = true;
2933 }
2934 break;
2935
2936 default:
2937 break;
2938 }
2939 }
2940
2941 out:
2942 if (progress)
2943 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2944
2945 return progress;
2946 }
2947
2948 /**
2949 * Once we've generated code, try to convert normal ELK_FS_OPCODE_FB_WRITE
2950 * instructions to ELK_FS_OPCODE_REP_FB_WRITE.
2951 */
2952 void
2953 elk_fs_visitor::emit_repclear_shader()
2954 {
2955 elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
2956 elk_fs_inst *write = NULL;
2957
2958 assert(uniforms == 0);
2959 assume(key->nr_color_regions > 0);
2960
2961 elk_fs_reg color_output, header;
2962 if (devinfo->ver >= 7) {
2963 color_output = retype(elk_vec4_grf(127, 0), ELK_REGISTER_TYPE_UD);
2964 header = retype(elk_vec8_grf(125, 0), ELK_REGISTER_TYPE_UD);
2965 } else {
2966 color_output = retype(elk_vec4_reg(MRF, 2, 0), ELK_REGISTER_TYPE_UD);
2967 header = retype(elk_vec8_reg(MRF, 0, 0), ELK_REGISTER_TYPE_UD);
2968 }
2969
2970 /* We pass the clear color as a flat input. Copy it to the output. */
2971 elk_fs_reg color_input =
2972 elk_reg(ELK_GENERAL_REGISTER_FILE, 2, 3, 0, 0, ELK_REGISTER_TYPE_UD,
2973 ELK_VERTICAL_STRIDE_8, ELK_WIDTH_2, ELK_HORIZONTAL_STRIDE_4,
2974 ELK_SWIZZLE_XYZW, WRITEMASK_XYZW);
2975
2976 const fs_builder bld = fs_builder(this).at_end();
2977 bld.exec_all().group(4, 0).MOV(color_output, color_input);
2978
2979 if (key->nr_color_regions > 1) {
2980 /* Copy g0..g1 as the message header */
2981 bld.exec_all().group(16, 0)
2982 .MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2983 }
2984
2985 for (int i = 0; i < key->nr_color_regions; ++i) {
2986 if (i > 0)
2987 bld.exec_all().group(1, 0).MOV(component(header, 2), elk_imm_ud(i));
2988
2989 if (devinfo->ver >= 7) {
2990 write = bld.emit(ELK_SHADER_OPCODE_SEND);
2991 write->resize_sources(2);
2992 write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
2993 write->src[0] = elk_imm_ud(0);
2994 write->src[1] = i == 0 ? color_output : header;
2995 write->check_tdr = true;
2996 write->send_has_side_effects = true;
2997 write->desc = elk_fb_write_desc(devinfo, i,
2998 ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
2999 i == key->nr_color_regions - 1, false);
3000 } else {
3001 write = bld.emit(ELK_FS_OPCODE_REP_FB_WRITE);
3002 write->target = i;
3003 write->base_mrf = i == 0 ? color_output.nr : header.nr;
3004 }
3005
3006 /* We can use a headerless message for the first render target */
3007 write->header_size = i == 0 ? 0 : 2;
3008 write->mlen = 1 + write->header_size;
3009 }
3010 write->eot = true;
3011 write->last_rt = true;
3012
3013 calculate_cfg();
3014
3015 this->first_non_payload_grf = payload().num_regs;
3016 }
3017
3018 /**
3019 * Walks through basic blocks, looking for repeated MRF writes and
3020 * removing the later ones.
3021 */
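/* A small sketch (register numbers invented):
 *
 *    mov(8) m3, vgrf7
 *    ...               <- nothing writes m3 or vgrf7 in between
 *    mov(8) m3, vgrf7  <- removed as a duplicate
 */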
3022 bool
3023 elk_fs_visitor::remove_duplicate_mrf_writes()
3024 {
3025 elk_fs_inst *last_mrf_move[ELK_MAX_MRF(devinfo->ver)];
3026 bool progress = false;
3027
3028 /* We would need to update the MRF tracking for compressed instructions. */
3029 if (dispatch_width >= 16)
3030 return false;
3031
3032 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3033
3034 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3035 if (inst->is_control_flow()) {
3036 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3037 }
3038
3039 if (inst->opcode == ELK_OPCODE_MOV &&
3040 inst->dst.file == MRF) {
3041 elk_fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3042 if (prev_inst && prev_inst->opcode == ELK_OPCODE_MOV &&
3043 inst->dst.equals(prev_inst->dst) &&
3044 inst->src[0].equals(prev_inst->src[0]) &&
3045 inst->saturate == prev_inst->saturate &&
3046 inst->predicate == prev_inst->predicate &&
3047 inst->conditional_mod == prev_inst->conditional_mod &&
3048 inst->exec_size == prev_inst->exec_size) {
3049 inst->remove(block);
3050 progress = true;
3051 continue;
3052 }
3053 }
3054
3055 /* Clear out the last-write records for MRFs that were overwritten. */
3056 if (inst->dst.file == MRF) {
3057 last_mrf_move[inst->dst.nr] = NULL;
3058 }
3059
3060 if (inst->mlen > 0 && inst->base_mrf != -1) {
3061 /* Found a SEND instruction, which will include two or fewer
3062 * implied MRF writes. We could do better here.
3063 */
3064 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3065 last_mrf_move[inst->base_mrf + i] = NULL;
3066 }
3067 }
3068
3069 /* Clear out any MRF move records whose sources got overwritten. */
3070 for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3071 if (last_mrf_move[i] &&
3072 regions_overlap(inst->dst, inst->size_written,
3073 last_mrf_move[i]->src[0],
3074 last_mrf_move[i]->size_read(0))) {
3075 last_mrf_move[i] = NULL;
3076 }
3077 }
3078
3079 if (inst->opcode == ELK_OPCODE_MOV &&
3080 inst->dst.file == MRF &&
3081 inst->src[0].file != ARF &&
3082 !inst->is_partial_write()) {
3083 last_mrf_move[inst->dst.nr] = inst;
3084 }
3085 }
3086
3087 if (progress)
3088 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3089
3090 return progress;
3091 }
3092
3093 /**
3094 * The rounding mode is conceptually specified per conversion instruction,
3095 * but on the hardware it is a piece of state. So once it has been set, we
3096 * don't need to set it again for subsequent conversions using the same mode.
3097 *
3098 * This is useful for vector/matrix conversions, as setting the mode once
3099 * is enough for the whole vector or matrix.
3100 */
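/* For instance (pseudo-IR, opcodes abbreviated), a per-block sequence like
 *
 *    rnd_mode RTNE
 *    cvt ...
 *    rnd_mode RTNE   <- removed, the mode is already RTNE
 *    cvt ...
 *
 * keeps only the first mode change.
 */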
3101 bool
3102 elk_fs_visitor::remove_extra_rounding_modes()
3103 {
3104 bool progress = false;
3105 unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3106
3107 elk_rnd_mode base_mode = ELK_RND_MODE_UNSPECIFIED;
3108 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3109 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3110 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3111 execution_mode)
3112 base_mode = ELK_RND_MODE_RTNE;
3113 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3114 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3115 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3116 execution_mode)
3117 base_mode = ELK_RND_MODE_RTZ;
3118
3119 foreach_block (block, cfg) {
3120 elk_rnd_mode prev_mode = base_mode;
3121
3122 foreach_inst_in_block_safe (elk_fs_inst, inst, block) {
3123 if (inst->opcode == ELK_SHADER_OPCODE_RND_MODE) {
3124 assert(inst->src[0].file == ELK_IMMEDIATE_VALUE);
3125 const elk_rnd_mode mode = (elk_rnd_mode) inst->src[0].d;
3126 if (mode == prev_mode) {
3127 inst->remove(block);
3128 progress = true;
3129 } else {
3130 prev_mode = mode;
3131 }
3132 }
3133 }
3134 }
3135
3136 if (progress)
3137 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3138
3139 return progress;
3140 }
3141
3142 static void
3143 clear_deps_for_inst_src(elk_fs_inst *inst, bool *deps, int first_grf, int grf_len)
3144 {
3145 /* Clear the flag for registers that actually got read (as expected). */
3146 for (int i = 0; i < inst->sources; i++) {
3147 int grf;
3148 if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3149 grf = inst->src[i].nr;
3150 } else {
3151 continue;
3152 }
3153
3154 if (grf >= first_grf &&
3155 grf < first_grf + grf_len) {
3156 deps[grf - first_grf] = false;
3157 if (inst->exec_size == 16)
3158 deps[grf - first_grf + 1] = false;
3159 }
3160 }
3161 }
3162
3163 /**
3164 * Implements this workaround for the original 965:
3165 *
3166 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3167 * check for post destination dependencies on this instruction, software
3168 * must ensure that there is no destination hazard for the case of ‘write
3169 * followed by a posted write’ shown in the following example.
3170 *
3171 * 1. mov r3 0
3172 * 2. send r3.xy <rest of send instruction>
3173 * 3. mov r2 r3
3174 *
3175 * Due to no post-destination dependency check on the ‘send’, the above
3176 * code sequence could have two instructions (1 and 2) in flight at the
3177 * same time that both consider ‘r3’ as the target of their final writes.
3178 */
3179 void
3180 elk_fs_visitor::insert_gfx4_pre_send_dependency_workarounds(elk_bblock_t *block,
3181 elk_fs_inst *inst)
3182 {
3183 int write_len = regs_written(inst);
3184 int first_write_grf = inst->dst.nr;
3185 bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3186 assert(write_len < (int)sizeof(needs_dep) - 1);
3187
3188 memset(needs_dep, false, sizeof(needs_dep));
3189 memset(needs_dep, true, write_len);
3190
3191 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3192
3193 /* Walk backwards looking for writes to registers we're writing which
3194 * aren't read since being written. If we hit the start of the program,
3195 * we assume that there are no outstanding dependencies on entry to the
3196 * program.
3197 */
3198 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3199 /* If we hit control flow, assume that there *are* outstanding
3200 * dependencies, and force their cleanup before our instruction.
3201 */
3202 if (block->start() == scan_inst && block->num != 0) {
3203 for (int i = 0; i < write_len; i++) {
3204 if (needs_dep[i])
3205 DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3206 first_write_grf + i);
3207 }
3208 return;
3209 }
3210
3211 /* We insert our reads as late as possible on the assumption that any
3212 * instruction other than a MOV that might have left us an outstanding
3213 * dependency has more latency than a MOV does.
3214 */
3215 if (scan_inst->dst.file == VGRF) {
3216 for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3217 int reg = scan_inst->dst.nr + i;
3218
3219 if (reg >= first_write_grf &&
3220 reg < first_write_grf + write_len &&
3221 needs_dep[reg - first_write_grf]) {
3222 DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3223 needs_dep[reg - first_write_grf] = false;
3224 if (scan_inst->exec_size == 16)
3225 needs_dep[reg - first_write_grf + 1] = false;
3226 }
3227 }
3228 }
3229
3230 /* Clear the flag for registers that actually got read (as expected). */
3231 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3232
3233 /* Continue the loop only if we haven't resolved all the dependencies */
3234 int i;
3235 for (i = 0; i < write_len; i++) {
3236 if (needs_dep[i])
3237 break;
3238 }
3239 if (i == write_len)
3240 return;
3241 }
3242 }
3243
3244 /**
3245 * Implements this workaround for the original 965:
3246 *
3247 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3248 * used as a destination register until after it has been sourced by an
3249 * instruction with a different destination register.
3250 */
3251 void
3252 elk_fs_visitor::insert_gfx4_post_send_dependency_workarounds(elk_bblock_t *block, elk_fs_inst *inst)
3253 {
3254 int write_len = regs_written(inst);
3255 unsigned first_write_grf = inst->dst.nr;
3256 bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3257 assert(write_len < (int)sizeof(needs_dep) - 1);
3258
3259 memset(needs_dep, false, sizeof(needs_dep));
3260 memset(needs_dep, true, write_len);
3261 /* Walk forwards looking for writes to registers we're writing which aren't
3262 * read before being written.
3263 */
3264 foreach_inst_in_block_starting_from(elk_fs_inst, scan_inst, inst) {
3265 /* If we hit control flow, force resolve all remaining dependencies. */
3266 if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3267 for (int i = 0; i < write_len; i++) {
3268 if (needs_dep[i])
3269 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3270 first_write_grf + i);
3271 }
3272 return;
3273 }
3274
3275 /* Clear the flag for registers that actually got read (as expected). */
3276 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3277
3278 /* We insert our reads as late as possible since they're reading the
3279 * result of a SEND, which has massive latency.
3280 */
3281 if (scan_inst->dst.file == VGRF &&
3282 scan_inst->dst.nr >= first_write_grf &&
3283 scan_inst->dst.nr < first_write_grf + write_len &&
3284 needs_dep[scan_inst->dst.nr - first_write_grf]) {
3285 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3286 scan_inst->dst.nr);
3287 needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3288 }
3289
3290 /* Continue the loop only if we haven't resolved all the dependencies */
3291 int i;
3292 for (i = 0; i < write_len; i++) {
3293 if (needs_dep[i])
3294 break;
3295 }
3296 if (i == write_len)
3297 return;
3298 }
3299 }
3300
3301 void
3302 elk_fs_visitor::insert_gfx4_send_dependency_workarounds()
3303 {
3304 if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
3305 return;
3306
3307 bool progress = false;
3308
3309 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
3310 if (inst->mlen != 0 && inst->dst.file == VGRF) {
3311 insert_gfx4_pre_send_dependency_workarounds(block, inst);
3312 insert_gfx4_post_send_dependency_workarounds(block, inst);
3313 progress = true;
3314 }
3315 }
3316
3317 if (progress)
3318 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3319 }
3320
3321 bool
3322 elk_fs_visitor::lower_load_payload()
3323 {
3324 bool progress = false;
3325
3326 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3327 if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
3328 continue;
3329
3330 assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3331 assert(inst->saturate == false);
3332 elk_fs_reg dst = inst->dst;
3333
3334 /* Get rid of COMPR4. We'll add it back in if we need it */
3335 if (dst.file == MRF)
3336 dst.nr = dst.nr & ~ELK_MRF_COMPR4;
3337
3338 const fs_builder ibld(this, block, inst);
3339 const fs_builder ubld = ibld.exec_all();
3340
3341 for (uint8_t i = 0; i < inst->header_size;) {
3342 /* Number of header GRFs to initialize at once with a single MOV
3343 * instruction.
3344 */
3345 const unsigned n =
3346 (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3347 inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3348 2 : 1;
3349
3350 if (inst->src[i].file != BAD_FILE)
3351 ubld.group(8 * n, 0).MOV(retype(dst, ELK_REGISTER_TYPE_UD),
3352 retype(inst->src[i], ELK_REGISTER_TYPE_UD));
3353
3354 dst = byte_offset(dst, n * REG_SIZE);
3355 i += n;
3356 }
3357
3358 if (inst->dst.file == MRF && (inst->dst.nr & ELK_MRF_COMPR4) &&
3359 inst->exec_size > 8) {
3360 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3361 * a straightforward copy. Instead, the result of the
3362 * LOAD_PAYLOAD is treated as interleaved and the first four
3363 * non-header sources are unpacked as:
3364 *
3365 * m + 0: r0
3366 * m + 1: g0
3367 * m + 2: b0
3368 * m + 3: a0
3369 * m + 4: r1
3370 * m + 5: g1
3371 * m + 6: b1
3372 * m + 7: a1
3373 *
3374 * This is used for gen <= 5 fb writes.
3375 */
3376 assert(inst->exec_size == 16);
3377 assert(inst->header_size + 4 <= inst->sources);
3378 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3379 if (inst->src[i].file != BAD_FILE) {
3380 if (devinfo->has_compr4) {
3381 elk_fs_reg compr4_dst = retype(dst, inst->src[i].type);
3382 compr4_dst.nr |= ELK_MRF_COMPR4;
3383 ibld.MOV(compr4_dst, inst->src[i]);
3384 } else {
3385 /* Platform doesn't have COMPR4. We have to fake it */
3386 elk_fs_reg mov_dst = retype(dst, inst->src[i].type);
3387 ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
3388 mov_dst.nr += 4;
3389 ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
3390 }
3391 }
3392
3393 dst.nr++;
3394 }
3395
3396 /* The loop above only ever incremented us through the first set
3397 * of 4 registers. However, thanks to the magic of COMPR4, we
3398 * actually wrote to the first 8 registers, so we need to take
3399 * that into account now.
3400 */
3401 dst.nr += 4;
3402
3403 /* The COMPR4 code took care of the first 4 sources. We'll let
3404 * the regular path handle any remaining sources. Yes, we are
3405 * modifying the instruction but we're about to delete it so
3406 * this really doesn't hurt anything.
3407 */
3408 inst->header_size += 4;
3409 }
3410
3411 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3412 dst.type = inst->src[i].type;
3413 if (inst->src[i].file != BAD_FILE) {
3414 ibld.MOV(dst, inst->src[i]);
3415 }
3416 dst = offset(dst, ibld, 1);
3417 }
3418
3419 inst->remove(block);
3420 progress = true;
3421 }
3422
3423 if (progress)
3424 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3425
3426 return progress;
3427 }
3428
3429 /**
3430 * Factor an unsigned 32-bit integer.
3431 *
3432 * Attempts to factor \c x into two values that are at most 0xFFFF. If no
3433 * such factorization is possible, either because the value is too large or is
3434 * prime, both \c result_a and \c result_b will be zero.
3435 */
3436 static void
3437 factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
3438 {
3439 /* This is necessary to prevent various opportunities for division by zero
3440 * below.
3441 */
3442 assert(x > 0xffff);
3443
3444 /* This represents the actual expected constraints on the input. Namely,
3445 * both the upper and lower words should be > 1.
3446 */
3447 assert(x >= 0x00020002);
3448
3449 *result_a = 0;
3450 *result_b = 0;
3451
3452 /* The value is too large to factor with the constraints. */
3453 if (x > (0xffffu * 0xffffu))
3454 return;
3455
3456 /* A non-prime number will have the form p*q*d where p is some prime
3457 * number, q > 1, and 1 <= d <= q. To meet the constraints of this
3458 * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p).
3459 * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally,
3460 * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
3461 *
3462 * The observation is that finding the largest possible value of p reduces
3463 * the possible range of d. After selecting p, all values of d in this range
3464 * are tested until a factorization is found. The size of the range of
3465 * possible values of d sets an upper bound on the run time of the
3466 * function.
3467 */
3468 static const uint16_t primes[256] = {
3469 2, 3, 5, 7, 11, 13, 17, 19,
3470 23, 29, 31, 37, 41, 43, 47, 53,
3471 59, 61, 67, 71, 73, 79, 83, 89,
3472 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */
3473 137, 139, 149, 151, 157, 163, 167, 173,
3474 179, 181, 191, 193, 197, 199, 211, 223,
3475 227, 229, 233, 239, 241, 251, 257, 263,
3476 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */
3477 313, 317, 331, 337, 347, 349, 353, 359,
3478 367, 373, 379, 383, 389, 397, 401, 409,
3479 419, 421, 431, 433, 439, 443, 449, 457,
3480 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */
3481 509, 521, 523, 541, 547, 557, 563, 569,
3482 571, 577, 587, 593, 599, 601, 607, 613,
3483 617, 619, 631, 641, 643, 647, 653, 659,
3484 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */
3485 727, 733, 739, 743, 751, 757, 761, 769,
3486 773, 787, 797, 809, 811, 821, 823, 827,
3487 829, 839, 853, 857, 859, 863, 877, 881,
3488 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */
3489 947, 953, 967, 971, 977, 983, 991, 997,
3490 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
3491 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
3492 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */
3493 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
3494 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
3495 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
3496 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */
3497 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
3498 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
3499 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
3500 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */
3501 };
3502
3503 unsigned p;
3504 unsigned x_div_p;
3505
3506 for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
3507 p = primes[i];
3508 x_div_p = x / p;
3509
3510 if ((x_div_p * p) == x)
3511 break;
3512 }
3513
3514 /* A prime factor was not found. */
3515 if (x_div_p * p != x)
3516 return;
3517
3518 /* Terminate early if d=1 is a solution. */
3519 if (x_div_p < 0x10000) {
3520 *result_a = x_div_p;
3521 *result_b = p;
3522 return;
3523 }
3524
3525 /* Pick the maximum possible value for 'd'. It's important that the loop
3526 * below executes while d <= max_d because max_d is a valid value. Having
3527 * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
3528 * incorrectly reported as not being factorable. The problem would occur
3529 * with any value that is the product of two primes in the table and one prime
3530 * not in the table.
3531 */
3532 const unsigned max_d = 0xffff / p;
3533
3534 /* Pick an initial value of 'd' that (combined with rejecting too large
3535 * values above) guarantees that 'q' will always be small enough.
3536 * DIV_ROUND_UP is used to prevent 'd' from being zero.
3537 */
3538 for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
3539 unsigned q = x_div_p / d;
3540
3541 if ((q * d) == x_div_p) {
3542 assert(p * d * q == x);
3543 assert((p * d) < 0x10000);
3544
3545 *result_a = q;
3546 *result_b = p * d;
3547 break;
3548 }
3549
3550 /* Since every value of 'd' is tried, as soon as 'd' is larger
3551 * than 'q', we're just re-testing combinations that have
3552 * already been tested.
3553 */
3554 if (d > q)
3555 break;
3556 }
3557 }
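/* Worked example (illustrative): for the value mentioned above,
 * x = 0x063b0c83 = 1627 * 1367 * 47, the largest table prime dividing x is
 * p = 1367, so x_div_p = 76469 = 1627 * 47. The loop then finds d = 47,
 * giving q = 1627, and the function returns:
 *
 *    *result_a = 1627;        (q)
 *    *result_b = 1367 * 47;   (p * d == 64249, still below 0x10000)
 *
 * both of which fit in 16 bits as required.
 */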
3558
3559 void
3560 elk_fs_visitor::lower_mul_dword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3561 {
3562 const fs_builder ibld(this, block, inst);
3563
3564 /* It is correct to use inst->src[1].d at both ends of the comparison.
3565 * Using .ud in the UINT16_MAX comparison would cause any negative value to
3566 * fail the check.
3567 */
3568 if (inst->src[1].file == IMM &&
3569 (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
3570 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3571 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3572 * src1 are used.
3573 *
3574 * If multiplying by an immediate value that fits in 16-bits, do a
3575 * single MUL instruction with that value in the proper location.
3576 */
3577 const bool ud = (inst->src[1].d >= 0);
3578 if (devinfo->ver < 7) {
3579 elk_fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
3580 ibld.MOV(imm, inst->src[1]);
3581 ibld.MUL(inst->dst, imm, inst->src[0]);
3582 } else {
3583 ibld.MUL(inst->dst, inst->src[0],
3584 ud ? elk_imm_uw(inst->src[1].ud)
3585 : elk_imm_w(inst->src[1].d));
3586 }
3587 } else {
3588 /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
3589 * do 32-bit integer multiplication in one instruction, but instead
3590 * must do a sequence (which actually calculates a 64-bit result):
3591 *
3592 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3593 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3594 * mov(8) g2<1>D acc0<8,8,1>D
3595 *
3596 * But on Gen > 6, the ability to use the second accumulator register
3597 * (acc1) for non-float data types was removed, preventing a simple
3598 * implementation in SIMD16. A 16-channel result can be calculated by
3599 * executing the three instructions twice in SIMD8, once with quarter
3600 * control of 1Q for the first eight channels and again with 2Q for
3601 * the second eight channels.
3602 *
3603 * Which accumulator register is implicitly accessed (by AccWrEnable
3604 * for instance) is determined by the quarter control. Unfortunately
3605 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3606 * implicit accumulator access by an instruction with 2Q will access
3607 * acc1 regardless of whether the data type is usable in acc1.
3608 *
3609 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3610 * integer data types.
3611 *
3612 * Since we only want the low 32-bits of the result, we can do two
3613 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3614 * adjust the high result and add them (like the mach is doing):
3615 *
3616 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3617 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3618 * shl(8) g9<1>D g8<8,8,1>D 16D
3619 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3620 *
3621 * We avoid the shl instruction by realizing that we only want to add
3622 * the low 16-bits of the "high" result to the high 16-bits of the
3623 * "low" result and using proper regioning on the add:
3624 *
3625 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3626 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3627 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3628 *
3629 * Since it does not use the (single) accumulator register, we can
3630 * schedule multi-component multiplications much better.
3631 */
3632
3633 bool needs_mov = false;
3634 elk_fs_reg orig_dst = inst->dst;
3635
3636 /* Get a new VGRF for the "low" 32x16-bit multiplication result if
3637 * reusing the original destination is impossible due to hardware
3638 * restrictions, source/destination overlap, or it being the null
3639 * register.
3640 */
3641 elk_fs_reg low = inst->dst;
3642 if (orig_dst.is_null() || orig_dst.file == MRF ||
3643 regions_overlap(inst->dst, inst->size_written,
3644 inst->src[0], inst->size_read(0)) ||
3645 regions_overlap(inst->dst, inst->size_written,
3646 inst->src[1], inst->size_read(1)) ||
3647 inst->dst.stride >= 4) {
3648 needs_mov = true;
3649 low = elk_fs_reg(VGRF, alloc.allocate(regs_written(inst)),
3650 inst->dst.type);
3651 }
3652
3653 /* Get a new VGRF but keep the same stride as inst->dst */
3654 elk_fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
3655 high.stride = inst->dst.stride;
3656 high.offset = inst->dst.offset % REG_SIZE;
3657
3658 bool do_addition = true;
3659 if (devinfo->ver >= 7) {
3660 if (inst->src[1].abs)
3661 lower_src_modifiers(this, block, inst, 1);
3662
3663 if (inst->src[1].file == IMM) {
3664 unsigned a;
3665 unsigned b;
3666
3667 /* If the immediate value (src1) can be factored into two values, A and
3668 * B, that each fit in 16-bits, the multiplication result can
3669 * instead be calculated as (src0 * (A * B)) = ((src0 * A) * B).
3670 * This saves an operation (the addition) and a temporary register
3671 * (high).
3672 *
3673 * Skip the optimization if either the high word or the low word
3674 * is 0 or 1. In these conditions, at least one of the
3675 * multiplications generated by the straightforward method will be
3676 * eliminated anyway.
3677 */
3678 if (inst->src[1].ud > 0x0001ffff &&
3679 (inst->src[1].ud & 0xffff) > 1) {
3680 factor_uint32(inst->src[1].ud, &a, &b);
3681
3682 if (a != 0) {
3683 ibld.MUL(low, inst->src[0], elk_imm_uw(a));
3684 ibld.MUL(low, low, elk_imm_uw(b));
3685 do_addition = false;
3686 }
3687 }
3688
3689 if (do_addition) {
3690 ibld.MUL(low, inst->src[0],
3691 elk_imm_uw(inst->src[1].ud & 0xffff));
3692 ibld.MUL(high, inst->src[0],
3693 elk_imm_uw(inst->src[1].ud >> 16));
3694 }
3695 } else {
3696 ibld.MUL(low, inst->src[0],
3697 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3698 ibld.MUL(high, inst->src[0],
3699 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 1));
3700 }
3701 } else {
3702 if (inst->src[0].abs)
3703 lower_src_modifiers(this, block, inst, 0);
3704
3705 ibld.MUL(low, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 0),
3706 inst->src[1]);
3707 ibld.MUL(high, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 1),
3708 inst->src[1]);
3709 }
3710
3711 if (do_addition) {
3712 ibld.ADD(subscript(low, ELK_REGISTER_TYPE_UW, 1),
3713 subscript(low, ELK_REGISTER_TYPE_UW, 1),
3714 subscript(high, ELK_REGISTER_TYPE_UW, 0));
3715 }
3716
3717 if (needs_mov || inst->conditional_mod)
3718 set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
3719 }
3720 }
3721
3722 void
3723 elk_fs_visitor::lower_mul_qword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3724 {
3725 const fs_builder ibld(this, block, inst);
3726
3727 /* Considering two 64-bit integers ab and cd where each letter          ab
3728 * corresponds to 32 bits, we get a 128-bit result WXYZ. We            * cd
3729 * only need to provide the YZ part of the result.                  -------
3730 *                                                                       BD
3731 * Only BD needs to be 64 bits. For AD and BC we only care            + AD
3732 * about the lower 32 bits (since they are part of the upper          + BC
3733 * 32 bits of our result). AC is not needed since it starts           + AC
3734 * on the 65th bit of the result.                                   -------
3735 *                                                                     WXYZ
3736 */
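/* In other words, a scalar sketch of the computation that follows, for
 * illustration only (lo32()/hi32() are hypothetical helpers taking the low
 * and high 32 bits of a 64-bit value):
 *
 *    uint64_t b = lo32(src0), a = hi32(src0);
 *    uint64_t d = lo32(src1), c = hi32(src1);
 *    uint64_t bd = b * d;                 // full 64-bit product
 *    uint32_t ad = (uint32_t)(a * d);     // low 32 bits only
 *    uint32_t bc = (uint32_t)(b * c);     // low 32 bits only
 *    uint64_t yz = bd + ((uint64_t)(ad + bc) << 32);
 *
 * which equals the low 64 bits of src0 * src1, since the AC term only
 * contributes to bits 64 and above.
 */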
3737 unsigned int q_regs = regs_written(inst);
3738 unsigned int d_regs = (q_regs + 1) / 2;
3739
3740 elk_fs_reg bd(VGRF, alloc.allocate(q_regs), ELK_REGISTER_TYPE_UQ);
3741 elk_fs_reg ad(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3742 elk_fs_reg bc(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3743
3744 /* Here we need the full 64 bit result for 32b * 32b. */
3745 if (devinfo->has_integer_dword_mul) {
3746 ibld.MUL(bd, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3747 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3748 } else {
3749 elk_fs_reg bd_high(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3750 elk_fs_reg bd_low(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3751 const unsigned acc_width = reg_unit(devinfo) * 8;
3752 elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), ELK_REGISTER_TYPE_UD),
3753 inst->group % acc_width);
3754
3755 elk_fs_inst *mul = ibld.MUL(acc,
3756 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3757 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3758 mul->writes_accumulator = true;
3759
3760 ibld.MACH(bd_high, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3761 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3762 ibld.MOV(bd_low, acc);
3763
3764 ibld.UNDEF(bd);
3765 ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 0), bd_low);
3766 ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 1), bd_high);
3767 }
3768
3769 ibld.MUL(ad, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
3770 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3771 ibld.MUL(bc, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3772 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1));
3773
3774 ibld.ADD(ad, ad, bc);
3775 ibld.ADD(subscript(bd, ELK_REGISTER_TYPE_UD, 1),
3776 subscript(bd, ELK_REGISTER_TYPE_UD, 1), ad);
3777
3778 if (devinfo->has_64bit_int) {
3779 ibld.MOV(inst->dst, bd);
3780 } else {
3781 if (!inst->is_partial_write())
3782 ibld.emit_undef_for_dst(inst);
3783 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
3784 subscript(bd, ELK_REGISTER_TYPE_UD, 0));
3785 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
3786 subscript(bd, ELK_REGISTER_TYPE_UD, 1));
3787 }
3788 }
3789
3790 void
3791 elk_fs_visitor::lower_mulh_inst(elk_fs_inst *inst, elk_bblock_t *block)
3792 {
3793 const fs_builder ibld(this, block, inst);
3794
3795 /* According to the BDW+ BSpec page for the "Multiply Accumulate
3796 * High" instruction:
3797 *
3798 * "An added preliminary mov is required for source modification on
3799 * src1:
3800 * mov (8) r3.0<1>:d -r3<8;8,1>:d
3801 * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
3802 * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
3803 */
3804 if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
3805 lower_src_modifiers(this, block, inst, 1);
3806
3807 /* Should have been lowered to 8-wide. */
3808 assert(inst->exec_size <= get_lowered_simd_width(this, inst));
3809 const unsigned acc_width = reg_unit(devinfo) * 8;
3810 const elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), inst->dst.type),
3811 inst->group % acc_width);
3812 elk_fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3813 elk_fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3814
3815 if (devinfo->ver >= 8) {
3816 /* Until Gfx8, integer multiplies read 32-bits from one source,
3817 * and 16-bits from the other, relying on the MACH instruction
3818 * to generate the high bits of the result.
3819 *
3820 * On Gfx8, the multiply instruction does a full 32x32-bit
3821 * multiply, but in order to do a 64-bit multiply we can simulate
3822 * the previous behavior and then use a MACH instruction.
3823 */
3824 assert(mul->src[1].type == ELK_REGISTER_TYPE_D ||
3825 mul->src[1].type == ELK_REGISTER_TYPE_UD);
3826 mul->src[1].type = ELK_REGISTER_TYPE_UW;
3827 mul->src[1].stride *= 2;
3828
3829 if (mul->src[1].file == IMM) {
3830 mul->src[1] = elk_imm_uw(mul->src[1].ud);
3831 }
3832 } else if (devinfo->verx10 == 70 &&
3833 inst->group > 0) {
3834 /* Among other things the quarter control bits influence which
3835 * accumulator register is used by the hardware for instructions
3836 * that access the accumulator implicitly (e.g. MACH). A
3837 * second-half instruction would normally map to acc1, which
3838 * doesn't exist on Gfx7 and up (the hardware does emulate it for
3839 * floating-point instructions *only* by taking advantage of the
3840 * extra precision of acc0 not normally used for floating point
3841 * arithmetic).
3842 *
3843 * HSW and up are careful enough not to try to access an
3844 * accumulator register that doesn't exist, but on earlier Gfx7
3845 * hardware we need to make sure that the quarter control bits are
3846 * zero to avoid non-deterministic behaviour and emit an extra MOV
3847 * to get the result masked correctly according to the current
3848 * channel enables.
3849 */
3850 mach->group = 0;
3851 mach->force_writemask_all = true;
3852 mach->dst = ibld.vgrf(inst->dst.type);
3853 ibld.MOV(inst->dst, mach->dst);
3854 }
3855 }
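/* For reference, the operation implemented above is simply the high 32 bits
 * of a 32x32 multiply; a scalar sketch (illustration only, unsigned case):
 *
 *    uint32_t mulh(uint32_t a, uint32_t b)
 *    {
 *       return (uint32_t)(((uint64_t)a * b) >> 32);
 *    }
 *
 * (the signed variant is analogous with int64_t), realized on the EU as the
 * MUL/MACH pair going through the accumulator.
 */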
3856
3857 bool
3858 elk_fs_visitor::lower_integer_multiplication()
3859 {
3860 bool progress = false;
3861
3862 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3863 if (inst->opcode == ELK_OPCODE_MUL) {
3864 /* If the instruction is already in a form that does not need lowering,
3865 * return early.
3866 */
3867 if (devinfo->ver >= 7) {
3868 if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
3869 continue;
3870 } else {
3871 if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
3872 continue;
3873 }
3874
3875 if ((inst->dst.type == ELK_REGISTER_TYPE_Q ||
3876 inst->dst.type == ELK_REGISTER_TYPE_UQ) &&
3877 (inst->src[0].type == ELK_REGISTER_TYPE_Q ||
3878 inst->src[0].type == ELK_REGISTER_TYPE_UQ) &&
3879 (inst->src[1].type == ELK_REGISTER_TYPE_Q ||
3880 inst->src[1].type == ELK_REGISTER_TYPE_UQ)) {
3881 lower_mul_qword_inst(inst, block);
3882 inst->remove(block);
3883 progress = true;
3884 } else if (!inst->dst.is_accumulator() &&
3885 (inst->dst.type == ELK_REGISTER_TYPE_D ||
3886 inst->dst.type == ELK_REGISTER_TYPE_UD) &&
3887 !devinfo->has_integer_dword_mul) {
3888 lower_mul_dword_inst(inst, block);
3889 inst->remove(block);
3890 progress = true;
3891 }
3892 } else if (inst->opcode == ELK_SHADER_OPCODE_MULH) {
3893 lower_mulh_inst(inst, block);
3894 inst->remove(block);
3895 progress = true;
3896 }
3897
3898 }
3899
3900 if (progress)
3901 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3902
3903 return progress;
3904 }
3905
3906 bool
3907 elk_fs_visitor::lower_minmax()
3908 {
3909 assert(devinfo->ver < 6);
3910
3911 bool progress = false;
3912
3913 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3914 const fs_builder ibld(this, block, inst);
3915
3916 if (inst->opcode == ELK_OPCODE_SEL &&
3917 inst->predicate == ELK_PREDICATE_NONE) {
3918 /* If src1 is an immediate value that is not NaN, then it can't be
3919 * NaN. In that case, emit CMP because it is much better for cmod
3920 * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
3921 * support HF or DF, so it is not necessary to check for those.
3922 */
3923 if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
3924 (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
3925 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
3926 inst->conditional_mod);
3927 } else {
3928 ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
3929 inst->conditional_mod);
3930 }
3931 inst->predicate = ELK_PREDICATE_NORMAL;
3932 inst->conditional_mod = ELK_CONDITIONAL_NONE;
3933
3934 progress = true;
3935 }
3936 }
3937
3938 if (progress)
3939 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3940
3941 return progress;
3942 }
3943
3944 bool
3945 elk_fs_visitor::lower_sub_sat()
3946 {
3947 bool progress = false;
3948
3949 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3950 const fs_builder ibld(this, block, inst);
3951
3952 if (inst->opcode == ELK_SHADER_OPCODE_USUB_SAT ||
3953 inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
3954 /* The fundamental problem is the hardware performs source negation
3955 * at the bit width of the source. If the source is 0x80000000D, the
3956 * negation is 0x80000000D. As a result, subtractSaturate(0,
3957 * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
3958 * are at least three ways to resolve this:
3959 *
3960 * 1. Use the accumulator for the negated source. The accumulator is
3961 * 33 bits, so our source 0x80000000 is sign-extended to
3962 * 0x180000000, the negation of which is 0x080000000. This
3963 * doesn't help for 64-bit integers (which are already bigger than
3964 * 33 bits). There are also only 8 accumulators, so SIMD16 or
3965 * SIMD32 instructions would have to be split into multiple SIMD8
3966 * instructions.
3967 *
3968 * 2. Use slightly different math. For any n-bit value x, we know (x
3969 * >> 1) != -(x >> 1). We can use this fact to only do
3970 * subtractions involving (x >> 1). subtractSaturate(a, b) ==
3971 * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
3972 *
3973 * 3. For unsigned sources, it is sufficient to replace the
3974 * subtractSaturate with (a > b) ? a - b : 0.
3975 *
3976 * It may also be possible to use the SUBB instruction. This
3977 * implicitly writes the accumulator, so it could only be used in the
3978 * same situations as #1 above. It is further limited by only
3979 * allowing UD sources.
3980 */
3981 if (inst->exec_size == 8 && inst->src[0].type != ELK_REGISTER_TYPE_Q &&
3982 inst->src[0].type != ELK_REGISTER_TYPE_UQ) {
3983 elk_fs_reg acc(ARF, ELK_ARF_ACCUMULATOR, inst->src[1].type);
3984
3985 ibld.MOV(acc, inst->src[1]);
3986 elk_fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
3987 add->saturate = true;
3988 add->src[0].negate = true;
3989 } else if (inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
3990 /* tmp = src1 >> 1;
3991 * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
3992 */
3993 elk_fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
3994 elk_fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
3995 elk_fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
3996 elk_fs_inst *add;
3997
3998 ibld.SHR(tmp1, inst->src[1], elk_imm_d(1));
3999
4000 add = ibld.ADD(tmp2, inst->src[1], tmp1);
4001 add->src[1].negate = true;
4002
4003 add = ibld.ADD(tmp3, inst->src[0], tmp1);
4004 add->src[1].negate = true;
4005 add->saturate = true;
4006
4007 add = ibld.ADD(inst->dst, tmp3, tmp2);
4008 add->src[1].negate = true;
4009 add->saturate = true;
4010 } else {
4011 /* a > b ? a - b : 0 */
4012 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4013 ELK_CONDITIONAL_G);
4014
4015 elk_fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4016 add->src[1].negate = !add->src[1].negate;
4017
4018 ibld.SEL(inst->dst, inst->dst, elk_imm_ud(0))
4019 ->predicate = ELK_PREDICATE_NORMAL;
4020 }
4021
4022 inst->remove(block);
4023 progress = true;
4024 }
4025 }
4026
4027 if (progress)
4028 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4029
4030 return progress;
4031 }
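/* For reference, scalar sketches of the saturating subtractions being
 * lowered above (illustration only):
 *
 *    uint32_t usub_sat(uint32_t a, uint32_t b)
 *    {
 *       return a > b ? a - b : 0;
 *    }
 *
 *    int32_t isub_sat(int32_t a, int32_t b)
 *    {
 *       int64_t d = (int64_t)a - b;
 *       return d < INT32_MIN ? INT32_MIN : d > INT32_MAX ? INT32_MAX : (int32_t)d;
 *    }
 *
 * The tricky case is isub_sat(0, INT32_MIN), which must saturate to
 * INT32_MAX even though -INT32_MIN is not representable in 32 bits.
 */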
4032
4033 /**
4034 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
4035 * by discard. Due to the layout of the sample mask in the fragment shader
4036 * thread payload, \p bld is required to have a dispatch_width() not greater
4037 * than 16 for fragment shaders.
4038 */
4039 elk_fs_reg
4040 elk_sample_mask_reg(const fs_builder &bld)
4041 {
4042 const elk_fs_visitor &s = *bld.shader;
4043
4044 if (s.stage != MESA_SHADER_FRAGMENT) {
4045 return elk_imm_ud(0xffffffff);
4046 } else if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4047 assert(bld.dispatch_width() <= 16);
4048 return elk_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
4049 } else {
4050 assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4051 return retype(elk_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4052 ELK_REGISTER_TYPE_UW);
4053 }
4054 }
4055
4056 uint32_t
4057 elk_fb_write_msg_control(const elk_fs_inst *inst,
4058 const struct elk_wm_prog_data *prog_data)
4059 {
4060 uint32_t mctl;
4061
4062 if (inst->opcode == ELK_FS_OPCODE_REP_FB_WRITE) {
4063 assert(inst->group == 0 && inst->exec_size == 16);
4064 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4065 } else if (prog_data->dual_src_blend) {
4066 assert(inst->exec_size == 8);
4067
4068 if (inst->group % 16 == 0)
4069 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4070 else if (inst->group % 16 == 8)
4071 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4072 else
4073 unreachable("Invalid dual-source FB write instruction group");
4074 } else {
4075 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4076
4077 if (inst->exec_size == 16)
4078 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4079 else if (inst->exec_size == 8)
4080 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4081 else
4082 unreachable("Invalid FB write execution size");
4083 }
4084
4085 return mctl;
4086 }
4087
4088 /**
4089 * Predicate the specified instruction on the sample mask.
4090 */
4091 void
4092 elk_emit_predicate_on_sample_mask(const fs_builder &bld, elk_fs_inst *inst)
4093 {
4094 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
4095 bld.group() == inst->group &&
4096 bld.dispatch_width() == inst->exec_size);
4097
4098 const elk_fs_visitor &s = *bld.shader;
4099 const elk_fs_reg sample_mask = elk_sample_mask_reg(bld);
4100 const unsigned subreg = sample_mask_flag_subreg(s);
4101
4102 if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4103 assert(sample_mask.file == ARF &&
4104 sample_mask.nr == elk_flag_subreg(subreg).nr &&
4105 sample_mask.subnr == elk_flag_subreg(
4106 subreg + inst->group / 16).subnr);
4107 } else {
4108 bld.group(1, 0).exec_all()
4109 .MOV(elk_flag_subreg(subreg + inst->group / 16), sample_mask);
4110 }
4111
4112 if (inst->predicate) {
4113 assert(inst->predicate == ELK_PREDICATE_NORMAL);
4114 assert(!inst->predicate_inverse);
4115 assert(inst->flag_subreg == 0);
4116 /* Combine the sample mask with the existing predicate by using a
4117 * vertical predication mode.
4118 */
4119 inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
4120 } else {
4121 inst->flag_subreg = subreg;
4122 inst->predicate = ELK_PREDICATE_NORMAL;
4123 inst->predicate_inverse = false;
4124 }
4125 }
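/* Typical usage (a sketch, not taken from a specific call site; srcs is
 * assumed to be the usual logical-message source array): predicate a
 * side-effecting message so channels killed by discard do not execute it,
 * e.g.
 *
 *    elk_fs_inst *write =
 *       bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
 *                elk_fs_reg(), srcs, ARRAY_SIZE(srcs));
 *    elk_emit_predicate_on_sample_mask(bld, write);
 */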
4126
4127 static bool
4128 is_mixed_float_with_fp32_dst(const elk_fs_inst *inst)
4129 {
4130 /* This opcode sometimes uses :W type on the source even if the operand is
4131 * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
4132 */
4133 if (inst->opcode == ELK_OPCODE_F16TO32)
4134 return true;
4135
4136 if (inst->dst.type != ELK_REGISTER_TYPE_F)
4137 return false;
4138
4139 for (int i = 0; i < inst->sources; i++) {
4140 if (inst->src[i].type == ELK_REGISTER_TYPE_HF)
4141 return true;
4142 }
4143
4144 return false;
4145 }
4146
4147 static bool
4148 is_mixed_float_with_packed_fp16_dst(const elk_fs_inst *inst)
4149 {
4150 /* This opcode sometimes uses :W type on the destination even if the
4151 * destination is a :HF, because in gfx7 there is no support for :HF, and
4152 * thus it uses :W.
4153 */
4154 if (inst->opcode == ELK_OPCODE_F32TO16 &&
4155 inst->dst.stride == 1)
4156 return true;
4157
4158 if (inst->dst.type != ELK_REGISTER_TYPE_HF ||
4159 inst->dst.stride != 1)
4160 return false;
4161
4162 for (int i = 0; i < inst->sources; i++) {
4163 if (inst->src[i].type == ELK_REGISTER_TYPE_F)
4164 return true;
4165 }
4166
4167 return false;
4168 }
4169
4170 /**
4171 * Get the closest allowed SIMD width for instruction \p inst accounting for
4172 * some common regioning and execution control restrictions that apply to FPU
4173 * instructions. These restrictions don't necessarily have any relevance to
4174 * instructions not executed by the FPU pipeline like extended math, control
4175 * flow or send message instructions.
4176 *
4177 * For virtual opcodes it's really up to the instruction -- In some cases
4178 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
4179 * instructions) it may simplify virtual instruction lowering if we can
4180 * enforce FPU-like regioning restrictions already on the virtual instruction,
4181 * in other cases (e.g. virtual send-like instructions) this may be
4182 * excessively restrictive.
4183 */
4184 static unsigned
4185 get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
4186 const elk_fs_inst *inst)
4187 {
4188 const struct elk_compiler *compiler = shader->compiler;
4189 const struct intel_device_info *devinfo = compiler->devinfo;
4190
4191 /* Maximum execution size representable in the instruction controls. */
4192 unsigned max_width = MIN2(32, inst->exec_size);
4193
4194 /* According to the PRMs:
4195 * "A. In Direct Addressing mode, a source cannot span more than 2
4196 * adjacent GRF registers.
4197 * B. A destination cannot span more than 2 adjacent GRF registers."
4198 *
4199 * Look for the source or destination with the largest register region
4200 * which is the one that is going to limit the overall execution size of
4201 * the instruction due to this rule.
4202 */
4203 unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4204
4205 for (unsigned i = 0; i < inst->sources; i++)
4206 reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
4207
4208 /* Calculate the maximum execution size of the instruction based on the
4209 * factor by which it goes over the hardware limit of 2 GRFs.
4210 */
4211 const unsigned max_reg_count = 2 * reg_unit(devinfo);
4212 if (reg_count > max_reg_count)
4213 max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
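/* Worked example (a sketch, assuming a 32-byte GRF and reg_unit == 1 as on
 * the platforms this backend targets): a SIMD16 operation with a 64-bit
 * destination writes 16 * 8 = 128 bytes = 4 GRFs, so reg_count = 4 exceeds
 * max_reg_count = 2 and max_width becomes 16 / DIV_ROUND_UP(4, 2) = 8.
 */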
4214
4215 /* According to the IVB PRMs:
4216 * "When destination spans two registers, the source MUST span two
4217 * registers. The exception to the above rule:
4218 *
4219 * - When source is scalar, the source registers are not incremented.
4220 * - When source is packed integer Word and destination is packed
4221 * integer DWord, the source register is not incremented but the
4222 * source sub register is incremented."
4223 *
4224 * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
4225 * restrictions. The code below intentionally doesn't check whether the
4226 * destination type is integer because empirically the hardware doesn't
4227 * seem to care what the actual type is as long as it's dword-aligned.
4228 *
4229 * HSW PRMs also add a note to the second exception:
4230 * "When lower 8 channels are disabled, the sub register of source1
4231 * operand is not incremented. If the lower 8 channels are expected
4232 * to be disabled, say by predication, the instruction must be split
4233 * into pair of simd8 operations."
4234 *
4235 * We can't reliably know if the channels won't be disabled due to,
4236 * for example, IMASK. So, play it safe and disallow packed-word exception
4237 * for src1.
4238 */
4239 if (devinfo->ver < 8) {
4240 for (unsigned i = 0; i < inst->sources; i++) {
4241 /* IVB implements DF scalars as <0;2,1> regions. */
4242 const bool is_scalar_exception = is_uniform(inst->src[i]) &&
4243 (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
4244 const bool is_packed_word_exception = i != 1 &&
4245 type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
4246 type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
4247
4248 /* We check size_read(i) against size_written instead of REG_SIZE
4249 * because we want to properly handle SIMD32. In SIMD32, you can end
4250 * up with writes to 4 registers and a source that reads 2 registers
4251 * and we may still need to lower all the way to SIMD8 in that case.
4252 */
4253 if (inst->size_written > REG_SIZE &&
4254 inst->size_read(i) != 0 &&
4255 inst->size_read(i) < inst->size_written &&
4256 !is_scalar_exception && !is_packed_word_exception) {
4257 const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4258 max_width = MIN2(max_width, inst->exec_size / reg_count);
4259 }
4260 }
4261 }
4262
4263 if (devinfo->ver < 6) {
4264 /* From the G45 PRM, Volume 4 Page 361:
4265 *
4266 * "Operand Alignment Rule: With the exceptions listed below, a
4267 * source/destination operand in general should be aligned to even
4268 * 256-bit physical register with a region size equal to two 256-bit
4269 * physical registers."
4270 *
4271 * Normally we enforce this by allocating virtual registers to the
4272 * even-aligned class. But we need to handle payload registers.
4273 */
4274 for (unsigned i = 0; i < inst->sources; i++) {
4275 if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
4276 inst->size_read(i) > REG_SIZE) {
4277 max_width = MIN2(max_width, 8);
4278 }
4279 }
4280 }
4281
4282 /* From the IVB PRMs:
4283 * "When an instruction is SIMD32, the low 16 bits of the execution mask
4284 * are applied for both halves of the SIMD32 instruction. If different
4285 * execution mask channels are required, split the instruction into two
4286 * SIMD16 instructions."
4287 *
4288 * There is similar text in the HSW PRMs. Gfx4-6 don't even implement
4289 * 32-wide control flow support in hardware and will behave similarly.
4290 */
4291 if (devinfo->ver < 8 && !inst->force_writemask_all)
4292 max_width = MIN2(max_width, 16);
4293
4294 /* From the IVB PRMs (applies to HSW too):
4295 * "Instructions with condition modifiers must not use SIMD32."
4296 *
4297 * From the BDW PRMs (applies to later hardware too):
4298 * "Ternary instruction with condition modifiers must not use SIMD32."
4299 */
4300 if (inst->conditional_mod && (devinfo->ver < 8 ||
4301 inst->elk_is_3src(compiler)))
4302 max_width = MIN2(max_width, 16);
4303
4304 /* From the IVB PRMs (applies to other devices that don't have the
4305 * intel_device_info::supports_simd16_3src flag set):
4306 * "In Align16 access mode, SIMD16 is not allowed for DW operations and
4307 * SIMD8 is not allowed for DF operations."
4308 */
4309 if (inst->elk_is_3src(compiler) && !devinfo->supports_simd16_3src)
4310 max_width = MIN2(max_width, inst->exec_size / reg_count);
4311
4312 /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
4313 * the 8-bit quarter of the execution mask signals specified in the
4314 * instruction control fields) for the second compressed half of any
4315 * single-precision instruction (for double-precision instructions
4316 * it's hardwired to use NibCtrl+1, at least on HSW), which means that
4317 * the EU will apply the wrong execution controls for the second
4318 * sequential GRF write if the number of channels per GRF is not exactly
4319 * eight in single-precision mode (or four in double-float mode).
4320 *
4321 * In this situation we calculate the maximum size of the split
4322 * instructions so they only ever write to a single register.
4323 */
4324 if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
4325 !inst->force_writemask_all) {
4326 const unsigned channels_per_grf = inst->exec_size /
4327 DIV_ROUND_UP(inst->size_written, REG_SIZE);
4328 const unsigned exec_type_size = get_exec_type_size(inst);
4329 assert(exec_type_size);
4330
4331 /* The hardware shifts exactly 8 channels per compressed half of the
4332 * instruction in single-precision mode and exactly 4 in double-precision.
4333 */
4334 if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
4335 max_width = MIN2(max_width, channels_per_grf);
4336
4337 /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
4338 * because HW applies the same channel enable signals to both halves of
4339 * the compressed instruction which will be just wrong under
4340 * non-uniform control flow.
4341 */
4342 if (devinfo->verx10 == 70 &&
4343 (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
4344 max_width = MIN2(max_width, 4);
4345 }
4346
4347 /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4348 * Float Operations:
4349 *
4350 * "No SIMD16 in mixed mode when destination is f32. Instruction
4351 * execution size must be no more than 8."
4352 *
4353 * FIXME: the simulator doesn't seem to complain if we don't do this and
4354 * empirical testing with existing CTS tests show that they pass just fine
4355 * without implementing this, however, since our interpretation of the PRM
4356 * is that conversion MOVs between HF and F are still mixed-float
4357 * instructions (and therefore subject to this restriction) we decided to
4358 * split them to be safe. Might be useful to do additional investigation to
4359 * lift the restriction if we can ensure that it is safe though, since these
4360 * conversions are common when half-float types are involved since many
4361 * instructions do not support HF types and conversions from/to F are
4362 * required.
4363 */
4364 if (is_mixed_float_with_fp32_dst(inst))
4365 max_width = MIN2(max_width, 8);
4366
4367 /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4368 * Float Operations:
4369 *
4370 * "No SIMD16 in mixed mode when destination is packed f16 for both
4371 * Align1 and Align16."
4372 */
4373 if (is_mixed_float_with_packed_fp16_dst(inst))
4374 max_width = MIN2(max_width, 8);
4375
4376 /* Only power-of-two execution sizes are representable in the instruction
4377 * control fields.
4378 */
4379 return 1 << util_logbase2(max_width);
4380 }
4381
4382 /**
4383 * Get the maximum allowed SIMD width for instruction \p inst accounting for
4384 * various payload size restrictions that apply to sampler message
4385 * instructions.
4386 *
4387 * This is only intended to provide a maximum theoretical bound for the
4388 * execution size of the message based on the number of argument components
4389 * alone, which in most cases will determine whether the SIMD8 or SIMD16
4390 * variant of the message can be used, though some messages may have
4391 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
4392 * the message length to determine the exact SIMD width and argument count,
4393 * which makes a number of sampler message combinations impossible to
4394 * represent).
4395 *
4396 * Note: On platforms with monolithic SIMD16 the possible SIMD widths double,
4397 * changing from (SIMD8, SIMD16) to (SIMD16, SIMD32).
4398 */
4399 static unsigned
4400 get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
4401 const elk_fs_inst *inst)
4402 {
4403 /* If we have a min_lod parameter on anything other than a simple sample
4404 * message, it will push it over 5 arguments and we have to fall back to
4405 * SIMD8.
4406 */
4407 if (inst->opcode != ELK_SHADER_OPCODE_TEX &&
4408 inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
4409 return 8;
4410
4411 /* Calculate the number of coordinate components that have to be present
4412 * assuming that additional arguments follow the texel coordinates in the
4413 * message payload. On IVB+ there is no need for padding, on ILK-SNB we
4414 * need to pad to four or three components depending on the message,
4415 * pre-ILK we need to pad to at most three components.
4416 */
4417 const unsigned req_coord_components =
4418 (devinfo->ver >= 7 ||
4419 !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
4420 (devinfo->ver >= 5 && inst->opcode != ELK_SHADER_OPCODE_TXF_LOGICAL &&
4421 inst->opcode != ELK_SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
4422 3;
4423
4424 /* Calculate the total number of argument components that need to be passed
4425 * to the sampler unit.
4426 */
4427 const unsigned num_payload_components =
4428 MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
4429 req_coord_components) +
4430 inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
4431 inst->components_read(TEX_LOGICAL_SRC_LOD) +
4432 inst->components_read(TEX_LOGICAL_SRC_LOD2) +
4433 inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
4434 (inst->opcode == ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
4435 inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
4436 inst->components_read(TEX_LOGICAL_SRC_MCS);
4437
4438 const unsigned simd_limit = reg_unit(devinfo) *
4439 (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
4440
4441 /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
4442 * maximum message size supported by the sampler, regardless of whether a
4443 * header is provided or not.
4444 */
4445 return MIN2(inst->exec_size, simd_limit);
4446 }
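/* Worked example (a sketch, assuming MAX_SAMPLER_MESSAGE_SIZE == 11 as in
 * this backend): a shadow-compare sample from a cube array with an explicit
 * LOD reads 4 coordinate components + 1 shadow comparator + 1 LOD = 6
 * payload components. Since 6 > 11 / 2, simd_limit is 8 on parts with
 * reg_unit == 1, so the message is lowered to SIMD8.
 */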
4447
4448 /**
4449 * Get the closest native SIMD width supported by the hardware for instruction
4450 * \p inst. The instruction will be left untouched by
4451 * elk_fs_visitor::lower_simd_width() if the returned value is equal to the
4452 * original execution size.
4453 */
4454 static unsigned
4455 get_lowered_simd_width(const elk_fs_visitor *shader, const elk_fs_inst *inst)
4456 {
4457 const struct elk_compiler *compiler = shader->compiler;
4458 const struct intel_device_info *devinfo = compiler->devinfo;
4459
4460 switch (inst->opcode) {
4461 case ELK_OPCODE_MOV:
4462 case ELK_OPCODE_SEL:
4463 case ELK_OPCODE_NOT:
4464 case ELK_OPCODE_AND:
4465 case ELK_OPCODE_OR:
4466 case ELK_OPCODE_XOR:
4467 case ELK_OPCODE_SHR:
4468 case ELK_OPCODE_SHL:
4469 case ELK_OPCODE_ASR:
4470 case ELK_OPCODE_CMPN:
4471 case ELK_OPCODE_CSEL:
4472 case ELK_OPCODE_F32TO16:
4473 case ELK_OPCODE_F16TO32:
4474 case ELK_OPCODE_BFREV:
4475 case ELK_OPCODE_BFE:
4476 case ELK_OPCODE_ADD:
4477 case ELK_OPCODE_MUL:
4478 case ELK_OPCODE_AVG:
4479 case ELK_OPCODE_FRC:
4480 case ELK_OPCODE_RNDU:
4481 case ELK_OPCODE_RNDD:
4482 case ELK_OPCODE_RNDE:
4483 case ELK_OPCODE_RNDZ:
4484 case ELK_OPCODE_LZD:
4485 case ELK_OPCODE_FBH:
4486 case ELK_OPCODE_FBL:
4487 case ELK_OPCODE_CBIT:
4488 case ELK_OPCODE_SAD2:
4489 case ELK_OPCODE_MAD:
4490 case ELK_OPCODE_LRP:
4491 case ELK_FS_OPCODE_PACK:
4492 case ELK_SHADER_OPCODE_SEL_EXEC:
4493 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
4494 case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
4495 return get_fpu_lowered_simd_width(shader, inst);
4496
4497 case ELK_OPCODE_CMP: {
4498 /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
4499 * when the destination is a GRF the dependency-clear bit on the flag
4500 * register is cleared early.
4501 *
4502 * Suggested workarounds are to disable coissuing CMP instructions
4503 * or to split CMP(16) instructions into two CMP(8) instructions.
4504 *
4505 * We choose to split into CMP(8) instructions since disabling
4506 * coissuing would affect CMP instructions not otherwise affected by
4507 * the errata.
4508 */
4509 const unsigned max_width = (devinfo->verx10 == 70 &&
4510 !inst->dst.is_null() ? 8 : ~0);
4511 return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst));
4512 }
4513 case ELK_OPCODE_BFI1:
4514 case ELK_OPCODE_BFI2:
4515 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
4516 * should
4517 * "Force BFI instructions to be executed always in SIMD8."
4518 */
4519 return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
4520 get_fpu_lowered_simd_width(shader, inst));
4521
4522 case ELK_OPCODE_IF:
4523 assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
4524 return inst->exec_size;
4525
4526 case ELK_SHADER_OPCODE_RCP:
4527 case ELK_SHADER_OPCODE_RSQ:
4528 case ELK_SHADER_OPCODE_SQRT:
4529 case ELK_SHADER_OPCODE_EXP2:
4530 case ELK_SHADER_OPCODE_LOG2:
4531 case ELK_SHADER_OPCODE_SIN:
4532 case ELK_SHADER_OPCODE_COS: {
4533 /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
4534 * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
4535 */
4536 if (devinfo->ver == 6 || devinfo->verx10 == 40)
4537 return MIN2(8, inst->exec_size);
4538 if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4539 return MIN2(8, inst->exec_size);
4540 return MIN2(16, inst->exec_size);
4541 }
4542
4543 case ELK_SHADER_OPCODE_POW: {
4544 /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
4545 * to SIMD8 with half-float
4546 */
4547 if (devinfo->ver < 7)
4548 return MIN2(8, inst->exec_size);
4549 if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4550 return MIN2(8, inst->exec_size);
4551 return MIN2(16, inst->exec_size);
4552 }
4553
4554 case ELK_SHADER_OPCODE_USUB_SAT:
4555 case ELK_SHADER_OPCODE_ISUB_SAT:
4556 return get_fpu_lowered_simd_width(shader, inst);
4557
4558 case ELK_SHADER_OPCODE_INT_QUOTIENT:
4559 case ELK_SHADER_OPCODE_INT_REMAINDER:
4560 /* Integer division is limited to SIMD8 on all generations. */
4561 return MIN2(8, inst->exec_size);
4562
4563 case ELK_FS_OPCODE_LINTERP:
4564 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
4565 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
4566 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
4567 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
4568 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
4569 return MIN2(16, inst->exec_size);
4570
4571 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4572 /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
4573 * message used to implement varying pull constant loads, so expand it
4574 * to SIMD16. An alternative with longer message payload length but
4575 * shorter return payload would be to use the SIMD8 sampler message that
4576 * takes (header, u, v, r) as parameters instead of (header, u).
4577 */
4578 return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
4579
4580 case ELK_FS_OPCODE_DDX_COARSE:
4581 case ELK_FS_OPCODE_DDX_FINE:
4582 case ELK_FS_OPCODE_DDY_COARSE:
4583 case ELK_FS_OPCODE_DDY_FINE:
4584 /* The implementation of this virtual opcode may require emitting
4585 * compressed Align16 instructions, which are severely limited on some
4586 * generations.
4587 *
4588 * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
4589 * Region Restrictions):
4590 *
4591 * "In Align16 access mode, SIMD16 is not allowed for DW operations
4592 * and SIMD8 is not allowed for DF operations."
4593 *
4594 * In this context, "DW operations" means "operations acting on 32-bit
4595 * values", so it includes operations on floats.
4596 *
4597 * Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3
4598 * (Instruction Compression -> Rules and Restrictions):
4599 *
4600 * "A compressed instruction must be in Align1 access mode. Align16
4601 * mode instructions cannot be compressed."
4602 *
4603 * Similar text exists in the g45 PRM.
4604 *
4605 * Empirically, compressed align16 instructions using odd register
4606 * numbers don't appear to work on Sandybridge either.
4607 */
4608 return (devinfo->ver == 4 || devinfo->ver == 6 ||
4609 (devinfo->verx10 == 70) ?
4610 MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
4611
4612 case ELK_SHADER_OPCODE_MULH:
4613 /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4614 * is 8-wide on Gfx7+.
4615 */
4616 return (devinfo->ver >= 7 ? 8 :
4617 get_fpu_lowered_simd_width(shader, inst));
4618
4619 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
4620 /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
4621 * here.
4622 */
4623 assert(devinfo->ver != 6 ||
4624 inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4625 inst->exec_size == 8);
4626 /* Dual-source FB writes are unsupported in SIMD16 mode. */
4627 return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4628 8 : MIN2(16, inst->exec_size));
4629
4630 case ELK_SHADER_OPCODE_TEX_LOGICAL:
4631 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
4632 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
4633 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
4634 case ELK_SHADER_OPCODE_LOD_LOGICAL:
4635 case ELK_SHADER_OPCODE_TG4_LOGICAL:
4636 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
4637 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4638 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4639 return get_sampler_lowered_simd_width(devinfo, inst);
4640
4641 /* On gfx12 parameters are fixed to 16-bit values and therefore they all
4642 * always fit regardless of the execution size.
4643 */
4644 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
4645 return MIN2(16, inst->exec_size);
4646
4647 case ELK_SHADER_OPCODE_TXD_LOGICAL:
4648 /* TXD is unsupported in SIMD16 mode prior to Xe2. SIMD32 is still
4649 * unsupported on Xe2.
4650 */
4651 return 8;
4652
4653 case ELK_SHADER_OPCODE_TXL_LOGICAL:
4654 case ELK_FS_OPCODE_TXB_LOGICAL:
4655 /* Only one execution size is representable pre-ILK depending on whether
4656 * the shadow reference argument is present.
4657 */
4658 if (devinfo->ver == 4)
4659 return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
4660 else
4661 return get_sampler_lowered_simd_width(devinfo, inst);
4662
4663 case ELK_SHADER_OPCODE_TXF_LOGICAL:
4664 case ELK_SHADER_OPCODE_TXS_LOGICAL:
4665 /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4666 * messages. Use SIMD16 instead.
4667 */
4668 if (devinfo->ver == 4)
4669 return 16;
4670 else
4671 return get_sampler_lowered_simd_width(devinfo, inst);
4672
4673 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4674 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4675 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4676 return 8;
4677
4678 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4679 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4680 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4681 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
4682 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
4683 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
4684 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
4685 return MIN2(16, inst->exec_size);
4686
4687 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
4688 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
4689 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
4690 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
4691 return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
4692
4693 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
4694 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
4695 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
4696 assert(inst->exec_size <= 16);
4697 return inst->exec_size;
4698
4699 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
4700 return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
4701
4702 case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
4703 case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
4704 return MIN2(8, inst->exec_size);
4705
4706 case ELK_SHADER_OPCODE_QUAD_SWIZZLE: {
4707 const unsigned swiz = inst->src[1].ud;
4708 return (is_uniform(inst->src[0]) ?
4709 get_fpu_lowered_simd_width(shader, inst) :
4710 type_sz(inst->src[0].type) == 4 ? 8 :
4711 swiz == ELK_SWIZZLE_XYXY || swiz == ELK_SWIZZLE_ZWZW ? 4 :
4712 get_fpu_lowered_simd_width(shader, inst));
4713 }
4714 case ELK_SHADER_OPCODE_MOV_INDIRECT: {
4715 /* From IVB and HSW PRMs:
4716 *
4717 * "2.When the destination requires two registers and the sources are
4718 * indirect, the sources must use 1x1 regioning mode.
4719 *
4720 * In case of DF instructions in HSW/IVB, the exec_size is limited by
4721 * the EU decompression logic not handling VxH indirect addressing
4722 * correctly.
4723 */
4724 const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
4725 /* Prior to Broadwell, we only have 8 address subregisters. */
4726 return MIN3(devinfo->ver >= 8 ? 16 : 8,
4727 max_size / (inst->dst.stride * type_sz(inst->dst.type)),
4728 inst->exec_size);
4729 }
4730
4731 case ELK_SHADER_OPCODE_LOAD_PAYLOAD: {
4732 const unsigned reg_count =
4733 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
4734
4735 if (reg_count > 2) {
4736 /* Only LOAD_PAYLOAD instructions with per-channel destination region
4737 * can be easily lowered (which excludes headers and heterogeneous
4738 * types).
4739 */
4740 assert(!inst->header_size);
4741 for (unsigned i = 0; i < inst->sources; i++)
4742 assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
4743 inst->src[i].file == BAD_FILE);
4744
4745 return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4746 } else {
4747 return inst->exec_size;
4748 }
4749 }
4750 default:
4751 return inst->exec_size;
4752 }
4753 }
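/* For example, a SIMD16 ELK_SHADER_OPCODE_INT_QUOTIENT returns 8 here, so
 * lower_simd_width() below will split it into two SIMD8 halves covering
 * channel groups 0-7 and 8-15 (a sketch of the intended behaviour).
 */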
4754
4755 /**
4756 * Return true if splitting out the group of channels of instruction \p inst
4757 * given by lbld.group() requires allocating a temporary for the i-th source
4758 * of the lowered instruction.
4759 */
4760 static inline bool
4761 needs_src_copy(const fs_builder &lbld, const elk_fs_inst *inst, unsigned i)
4762 {
4763 return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
4764 (inst->components_read(i) == 1 &&
4765 lbld.dispatch_width() <= inst->exec_size)) ||
4766 (inst->flags_written(lbld.shader->devinfo) &
4767 flag_mask(inst->src[i], type_sz(inst->src[i].type)));
4768 }
4769
4770 /**
4771 * Extract the data that would be consumed by the channel group given by
4772 * lbld.group() from the i-th source region of instruction \p inst and return
4773 * it as result in packed form.
4774 */
4775 static elk_fs_reg
4776 emit_unzip(const fs_builder &lbld, elk_fs_inst *inst, unsigned i)
4777 {
4778 assert(lbld.group() >= inst->group);
4779
4780 /* Specified channel group from the source region. */
4781 const elk_fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
4782
4783 if (needs_src_copy(lbld, inst, i)) {
4784 /* Use a builder of the right width to perform the copy, avoiding
4785 * uninitialized data if the lowered execution size is greater than the
4786 * original execution size of the instruction.
4787 */
4788 const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
4789 inst->exec_size), 0);
4790 const elk_fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
4791
4792 for (unsigned k = 0; k < inst->components_read(i); ++k)
4793 cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
4794
4795 return tmp;
4796
4797 } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
4798 /* The source is invariant for all dispatch_width-wide groups of the
4799 * original region.
4800 */
4801 return inst->src[i];
4802
4803 } else {
4804 /* We can just point the lowered instruction at the right channel group
4805 * from the original region.
4806 */
4807 return src;
4808 }
4809 }
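/* For example (sketch): when a SIMD16 instruction is split into two SIMD8
 * halves and source i is a non-uniform region with two components, the
 * second half (lbld.group() == 8) gets a fresh SIMD8 temporary into which
 * channels 8-15 of component 0 and of component 1 are packed back to back;
 * a single-component source is simply re-pointed at channels 8-15 of the
 * original region.
 */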
4810
4811 /**
4812 * Return true if splitting out the group of channels of instruction \p inst
4813 * given by lbld.group() requires allocating a temporary for the destination
4814 * of the lowered instruction and copying the data back to the original
4815 * destination region.
4816 */
4817 static inline bool
4818 needs_dst_copy(const fs_builder &lbld, const elk_fs_inst *inst)
4819 {
4820 if (inst->dst.is_null())
4821 return false;
4822
4823 /* If the instruction writes more than one component we'll have to shuffle
4824 * the results of multiple lowered instructions in order to make sure that
4825 * they end up arranged correctly in the original destination region.
4826 */
4827 if (inst->size_written > inst->dst.component_size(inst->exec_size))
4828 return true;
4829
4830 /* If the lowered execution size is larger than the original the result of
4831 * the instruction won't fit in the original destination, so we'll have to
4832 * allocate a temporary in any case.
4833 */
4834 if (lbld.dispatch_width() > inst->exec_size)
4835 return true;
4836
4837 for (unsigned i = 0; i < inst->sources; i++) {
4838 /* If we already made a copy of the source for other reasons there won't
4839 * be any overlap with the destination.
4840 */
4841 if (needs_src_copy(lbld, inst, i))
4842 continue;
4843
4844 /* In order to keep the logic simple we emit a copy whenever the
4845 * destination region doesn't exactly match an overlapping source, since
4846 * that may mean the source and destination are not aligned group by
4847 * group, which could cause one of the lowered instructions to overwrite
4848 * the data read from the same source by other lowered instructions.
4849 */
4850 if (regions_overlap(inst->dst, inst->size_written,
4851 inst->src[i], inst->size_read(i)) &&
4852 !inst->dst.equals(inst->src[i]))
4853 return true;
4854 }
4855
4856 return false;
4857 }
4858
4859 /**
4860 * Insert data from a packed temporary into the channel group given by
4861 * lbld.group() of the destination region of instruction \p inst and return
4862 * the temporary as result. Any copy instructions that are required for
4863 * unzipping the previous value (in the case of partial writes) will be
4864 * inserted using \p lbld_before and any copy instructions required for
4865 * zipping up the destination of \p inst will be inserted using \p lbld_after.
4866 */
4867 static elk_fs_reg
4868 emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
4869 elk_fs_inst *inst)
4870 {
4871 assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
4872 assert(lbld_before.group() == lbld_after.group());
4873 assert(lbld_after.group() >= inst->group);
4874
4875 const struct intel_device_info *devinfo = lbld_before.shader->devinfo;
4876
4877 /* Specified channel group from the destination region. */
4878 const elk_fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
4879
4880 if (!needs_dst_copy(lbld_after, inst)) {
4881 /* No need to allocate a temporary for the lowered instruction, just
4882 * take the right group of channels from the original region.
4883 */
4884 return dst;
4885 }
4886
4887 /* Deal with the residency data part later */
4888 const unsigned residency_size = inst->has_sampler_residency() ?
4889 (reg_unit(devinfo) * REG_SIZE) : 0;
4890 const unsigned dst_size = (inst->size_written - residency_size) /
4891 inst->dst.component_size(inst->exec_size);
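   /* Illustrative example (assuming 32-byte GRFs and no residency data): a
    * SIMD16 texture fetch returning four 32-bit components has
    * size_written == 256 bytes and component_size(16) == 64 bytes, so
    * dst_size == 4, i.e. the number of logical components to zip back.
    */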
4892
4893 const elk_fs_reg tmp = lbld_after.vgrf(inst->dst.type,
4894 dst_size + inst->has_sampler_residency());
4895
4896 if (inst->predicate) {
4897 /* Handle predication by copying the original contents of the
4898 * destination into the temporary before emitting the lowered
4899 * instruction.
4900 */
4901 const fs_builder gbld_before =
4902 lbld_before.group(MIN2(lbld_before.dispatch_width(),
4903 inst->exec_size), 0);
4904 for (unsigned k = 0; k < dst_size; ++k) {
4905 gbld_before.MOV(offset(tmp, lbld_before, k),
4906 offset(dst, inst->exec_size, k));
4907 }
4908 }
4909
4910 const fs_builder gbld_after =
4911 lbld_after.group(MIN2(lbld_after.dispatch_width(),
4912 inst->exec_size), 0);
4913 for (unsigned k = 0; k < dst_size; ++k) {
4914 /* Use a builder of the right width to perform the copy avoiding
4915 * uninitialized data if the lowered execution size is greater than the
4916 * original execution size of the instruction.
4917 */
4918 gbld_after.MOV(offset(dst, inst->exec_size, k),
4919 offset(tmp, lbld_after, k));
4920 }
4921
4922 if (inst->has_sampler_residency()) {
4923       /* Sampler messages with residency need special attention. The first
4924        * lane of the last component holds the Pixel Null Mask (bits 0:15)
4925        * and some upper bits we need to discard (bits 16:31). We have to
4926        * build a single 32-bit value for the SIMD32 message out of two
4927        * SIMD16 16-bit values.
4928 */
4929 const fs_builder rbld = gbld_after.exec_all().group(1, 0);
4930 elk_fs_reg local_res_reg = component(
4931 retype(offset(tmp, lbld_before, dst_size),
4932 ELK_REGISTER_TYPE_UW), 0);
4933 elk_fs_reg final_res_reg =
4934 retype(byte_offset(inst->dst,
4935 inst->size_written - residency_size +
4936 gbld_after.group() / 8),
4937 ELK_REGISTER_TYPE_UW);
4938 rbld.MOV(final_res_reg, local_res_reg);
4939 }
4940
4941 return tmp;
4942 }
4943
4944 bool
4945 elk_fs_visitor::lower_simd_width()
4946 {
4947 bool progress = false;
4948
4949 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4950 const unsigned lower_width = get_lowered_simd_width(this, inst);
4951
4952 if (lower_width != inst->exec_size) {
4953          /* Builder matching the original instruction. We may also need to
4954           * emit an instruction wider than the original, so set the execution
4955           * size of the builder to the larger of the two for now to make sure
4956           * both cases can be handled.
4957 */
4958 const unsigned max_width = MAX2(inst->exec_size, lower_width);
4959
4960 const fs_builder bld =
4961 fs_builder(this, MAX2(max_width, dispatch_width)).at_end();
4962 const fs_builder ibld = bld.at(block, inst)
4963 .exec_all(inst->force_writemask_all)
4964 .group(max_width, inst->group / max_width);
4965
4966 /* Split the copies in chunks of the execution width of either the
4967 * original or the lowered instruction, whichever is lower.
4968 */
4969 const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
4970 const unsigned residency_size = inst->has_sampler_residency() ?
4971 (reg_unit(devinfo) * REG_SIZE) : 0;
4972 const unsigned dst_size =
4973 (inst->size_written - residency_size) /
4974 inst->dst.component_size(inst->exec_size);
4975
4976 assert(!inst->writes_accumulator && !inst->mlen);
4977
4978 /* Inserting the zip, unzip, and duplicated instructions in all of
4979 * the right spots is somewhat tricky. All of the unzip and any
4980 * instructions from the zip which unzip the destination prior to
4981 * writing need to happen before all of the per-group instructions
4982 * and the zip instructions need to happen after. In order to sort
4983 * this all out, we insert the unzip instructions before \p inst,
4984 * insert the per-group instructions after \p inst (i.e. before
4985 * inst->next), and insert the zip instructions before the
4986 * instruction after \p inst. Since we are inserting instructions
4987 * after \p inst, inst->next is a moving target and we need to save
4988 * it off here so that we insert the zip instructions in the right
4989 * place.
4990 *
4991 * Since we're inserting split instructions after after_inst, the
4992 * instructions will end up in the reverse order that we insert them.
4993 * However, certain render target writes require that the low group
4994 * instructions come before the high group. From the Ivy Bridge PRM
4995 * Vol. 4, Pt. 1, Section 3.9.11:
4996 *
4997 * "If multiple SIMD8 Dual Source messages are delivered by the
4998 * pixel shader thread, each SIMD8_DUALSRC_LO message must be
4999 * issued before the SIMD8_DUALSRC_HI message with the same Slot
5000 * Group Select setting."
5001 *
5002 * And, from Section 3.9.11.1 of the same PRM:
5003 *
5004 * "When SIMD32 or SIMD16 PS threads send render target writes
5005 * with multiple SIMD8 and SIMD16 messages, the following must
5006 * hold:
5007 *
5008 * All the slots (as described above) must have a corresponding
5009 * render target write irrespective of the slot's validity. A slot
5010 * is considered valid when at least one sample is enabled. For
5011 * example, a SIMD16 PS thread must send two SIMD8 render target
5012 * writes to cover all the slots.
5013 *
5014 * PS thread must send SIMD render target write messages with
5015 * increasing slot numbers. For example, SIMD16 thread has
5016 * Slot[15:0] and if two SIMD8 render target writes are used, the
5017 * first SIMD8 render target write must send Slot[7:0] and the
5018 * next one must send Slot[15:8]."
5019 *
5020 * In order to make low group instructions come before high group
5021 * instructions (this is required for some render target writes), we
5022 * split from the highest group to lowest.
5023 */
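         /* As a rough sketch, for n == 2 the stream around the original
          * instruction ends up as: the unzip copies, then the group 0 copy of
          * the instruction, then the group 1 copy, then the zip copies that
          * move the temporaries back into the original destination; the
          * original instruction itself is removed afterwards.
          */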
5024 exec_node *const after_inst = inst->next;
5025 for (int i = n - 1; i >= 0; i--) {
5026 /* Emit a copy of the original instruction with the lowered width.
5027 * If the EOT flag was set throw it away except for the last
5028 * instruction to avoid killing the thread prematurely.
5029 */
5030 elk_fs_inst split_inst = *inst;
5031 split_inst.exec_size = lower_width;
5032 split_inst.eot = inst->eot && i == int(n - 1);
5033
5034 /* Select the correct channel enables for the i-th group, then
5035 * transform the sources and destination and emit the lowered
5036 * instruction.
5037 */
5038 const fs_builder lbld = ibld.group(lower_width, i);
5039
5040 for (unsigned j = 0; j < inst->sources; j++)
5041 split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
5042
5043 split_inst.dst = emit_zip(lbld.at(block, inst),
5044 lbld.at(block, after_inst), inst);
5045 split_inst.size_written =
5046 split_inst.dst.component_size(lower_width) * dst_size +
5047 residency_size;
5048
5049 lbld.at(block, inst->next).emit(split_inst);
5050 }
5051
5052 inst->remove(block);
5053 progress = true;
5054 }
5055 }
5056
5057 if (progress)
5058 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5059
5060 return progress;
5061 }
5062
5063 /**
5064 * Transform barycentric vectors into the interleaved form expected by the PLN
5065 * instruction and returned by the Gfx7+ PI shared function.
5066 *
5067 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
5068 * follows in the register file:
5069 *
5070 * rN+0: X[0-7]
5071 * rN+1: Y[0-7]
5072 * rN+2: X[8-15]
5073 * rN+3: Y[8-15]
5074 *
5075  * There is no need to handle SIMD32 here -- this pass is expected to run after
5076  * SIMD lowering, since SIMD lowering relies on vectors having the standard
5077 * component layout.
5078 */
5079 bool
5080 elk_fs_visitor::lower_barycentrics()
5081 {
5082 const bool has_interleaved_layout = devinfo->has_pln ||
5083 devinfo->ver >= 7;
5084 bool progress = false;
5085
5086 if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
5087 return false;
5088
5089 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5090 if (inst->exec_size < 16)
5091 continue;
5092
5093 const fs_builder ibld(this, block, inst);
5094 const fs_builder ubld = ibld.exec_all().group(8, 0);
5095
5096 switch (inst->opcode) {
5097 case ELK_FS_OPCODE_LINTERP : {
5098 assert(inst->exec_size == 16);
5099 const elk_fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
5100 elk_fs_reg srcs[4];
5101
5102 for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
5103 srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
5104 8 * (i / 2));
5105
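         /* This gathers srcs[] = { X[0-7], Y[0-7], X[8-15], Y[8-15] }, which
          * the LOAD_PAYLOAD below packs into consecutive registers, i.e. the
          * interleaved PLN layout described in the comment above.
          */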
5106 ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
5107
5108 inst->src[0] = tmp;
5109 progress = true;
5110 break;
5111 }
5112 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
5113 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
5114 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
5115 assert(inst->exec_size == 16);
5116 const elk_fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
5117
5118 for (unsigned i = 0; i < 2; i++) {
5119 for (unsigned g = 0; g < inst->exec_size / 8; g++) {
5120 elk_fs_inst *mov = ibld.at(block, inst->next).group(8, g)
5121 .MOV(horiz_offset(offset(inst->dst, ibld, i),
5122 8 * g),
5123 offset(tmp, ubld, 2 * g + i));
5124 mov->predicate = inst->predicate;
5125 mov->predicate_inverse = inst->predicate_inverse;
5126 mov->flag_subreg = inst->flag_subreg;
5127 }
5128 }
5129
5130 inst->dst = tmp;
5131 progress = true;
5132 break;
5133 }
5134 default:
5135 break;
5136 }
5137 }
5138
5139 if (progress)
5140 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5141
5142 return progress;
5143 }
5144
5145 bool
5146 elk_fs_visitor::lower_find_live_channel()
5147 {
5148 bool progress = false;
5149
5150 if (devinfo->ver < 8)
5151 return false;
5152
5153 bool packed_dispatch =
5154 elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data);
5155 bool vmask =
5156 stage == MESA_SHADER_FRAGMENT &&
5157 elk_wm_prog_data(stage_prog_data)->uses_vmask;
5158
5159 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5160 if (inst->opcode != ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL &&
5161 inst->opcode != ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
5162 continue;
5163
5164 bool first = inst->opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL;
5165
5166 /* Getting the first active channel index is easy on Gfx8: Just find
5167 * the first bit set in the execution mask. The register exists on
5168 * HSW already but it reads back as all ones when the current
5169 * instruction has execution masking disabled, so it's kind of
5170 * useless there.
5171 */
5172 elk_fs_reg exec_mask(retype(elk_mask_reg(0), ELK_REGISTER_TYPE_UD));
5173
5174 const fs_builder ibld(this, block, inst);
5175 if (!inst->is_partial_write())
5176 ibld.emit_undef_for_dst(inst);
5177
5178 const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);
5179
5180 /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
5181 * so combine the execution and dispatch masks to obtain the true mask.
5182 *
5183 * If we're looking for the first live channel, and we have packed
5184 * dispatch, we can skip this step, as we know all dispatched channels
5185 * will appear at the front of the mask.
5186 */
5187 if (!(first && packed_dispatch)) {
5188 elk_fs_reg mask = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5189 ubld.UNDEF(mask);
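         /* sr0.2 is understood to hold the dispatch mask (DMask) and sr0.3
          * the vector mask (VMask) on these platforms, which is what the
          * 2/3 selection below picks between.
          */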
5190 ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, mask, elk_imm_ud(vmask ? 3 : 2));
5191
5192 /* Quarter control has the effect of magically shifting the value of
5193 * ce0 so you'll get the first/last active channel relative to the
5194           * specified quarter control as the result.
5195 */
5196 if (inst->group > 0)
5197 ubld.SHR(mask, mask, elk_imm_ud(ALIGN(inst->group, 8)));
5198
5199 ubld.AND(mask, exec_mask, mask);
5200 exec_mask = mask;
5201 }
5202
5203 if (first) {
5204 ubld.FBL(inst->dst, exec_mask);
5205 } else {
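         /* The last live channel is the most significant set bit of the
          * mask, i.e. 31 - LZD(mask), which is what the LZD/ADD pair below
          * computes.
          */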
5206 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 1);
5207 ubld.UNDEF(tmp);
5208 ubld.LZD(tmp, exec_mask);
5209 ubld.ADD(inst->dst, negate(tmp), elk_imm_uw(31));
5210 }
5211
5212 inst->remove(block);
5213 progress = true;
5214 }
5215
5216 if (progress)
5217 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5218
5219 return progress;
5220 }
5221
5222 void
5223 elk_fs_visitor::dump_instructions_to_file(FILE *file) const
5224 {
5225 if (cfg) {
5226 const register_pressure &rp = regpressure_analysis.require();
5227 unsigned ip = 0, max_pressure = 0;
5228 unsigned cf_count = 0;
5229 foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5230 if (inst->is_control_flow_end())
5231 cf_count -= 1;
5232
5233 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5234 fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
5235 for (unsigned i = 0; i < cf_count; i++)
5236 fprintf(file, " ");
5237 dump_instruction(inst, file);
5238 ip++;
5239
5240 if (inst->is_control_flow_begin())
5241 cf_count += 1;
5242 }
5243 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
5244 } else {
5245 int ip = 0;
5246 foreach_in_list(elk_backend_instruction, inst, &instructions) {
5247 fprintf(file, "%4d: ", ip++);
5248 dump_instruction(inst, file);
5249 }
5250 }
5251 }
5252
5253 void
5254 elk_fs_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
5255 {
5256 const elk_fs_inst *inst = (const elk_fs_inst *)be_inst;
5257
5258 if (inst->predicate) {
5259 fprintf(file, "(%cf%d.%d) ",
5260 inst->predicate_inverse ? '-' : '+',
5261 inst->flag_subreg / 2,
5262 inst->flag_subreg % 2);
5263 }
5264
5265 fprintf(file, "%s", elk_instruction_name(&compiler->isa, inst->opcode));
5266 if (inst->saturate)
5267 fprintf(file, ".sat");
5268 if (inst->conditional_mod) {
5269 fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
5270 if (!inst->predicate &&
5271 (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
5272 inst->opcode != ELK_OPCODE_CSEL &&
5273 inst->opcode != ELK_OPCODE_IF &&
5274 inst->opcode != ELK_OPCODE_WHILE))) {
5275 fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
5276 inst->flag_subreg % 2);
5277 }
5278 }
5279 fprintf(file, "(%d) ", inst->exec_size);
5280
5281 if (inst->mlen) {
5282 fprintf(file, "(mlen: %d) ", inst->mlen);
5283 }
5284
5285 if (inst->eot) {
5286 fprintf(file, "(EOT) ");
5287 }
5288
5289 switch (inst->dst.file) {
5290 case VGRF:
5291 fprintf(file, "vgrf%d", inst->dst.nr);
5292 break;
5293 case FIXED_GRF:
5294 fprintf(file, "g%d", inst->dst.nr);
5295 break;
5296 case MRF:
5297 fprintf(file, "m%d", inst->dst.nr);
5298 break;
5299 case BAD_FILE:
5300 fprintf(file, "(null)");
5301 break;
5302 case UNIFORM:
5303 fprintf(file, "***u%d***", inst->dst.nr);
5304 break;
5305 case ATTR:
5306 fprintf(file, "***attr%d***", inst->dst.nr);
5307 break;
5308 case ARF:
5309 switch (inst->dst.nr) {
5310 case ELK_ARF_NULL:
5311 fprintf(file, "null");
5312 break;
5313 case ELK_ARF_ADDRESS:
5314 fprintf(file, "a0.%d", inst->dst.subnr);
5315 break;
5316 case ELK_ARF_ACCUMULATOR:
5317 fprintf(file, "acc%d", inst->dst.subnr);
5318 break;
5319 case ELK_ARF_FLAG:
5320 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5321 break;
5322 default:
5323 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5324 break;
5325 }
5326 break;
5327 case IMM:
5328 unreachable("not reached");
5329 }
5330
5331 if (inst->dst.offset ||
5332 (inst->dst.file == VGRF &&
5333 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
5334 const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
5335 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
5336 inst->dst.offset % reg_size);
5337 }
5338
5339 if (inst->dst.stride != 1)
5340 fprintf(file, "<%u>", inst->dst.stride);
5341 fprintf(file, ":%s, ", elk_reg_type_to_letters(inst->dst.type));
5342
5343 for (int i = 0; i < inst->sources; i++) {
5344 if (inst->src[i].negate)
5345 fprintf(file, "-");
5346 if (inst->src[i].abs)
5347 fprintf(file, "|");
5348 switch (inst->src[i].file) {
5349 case VGRF:
5350 fprintf(file, "vgrf%d", inst->src[i].nr);
5351 break;
5352 case FIXED_GRF:
5353 fprintf(file, "g%d", inst->src[i].nr);
5354 break;
5355 case MRF:
5356 fprintf(file, "***m%d***", inst->src[i].nr);
5357 break;
5358 case ATTR:
5359 fprintf(file, "attr%d", inst->src[i].nr);
5360 break;
5361 case UNIFORM:
5362 fprintf(file, "u%d", inst->src[i].nr);
5363 break;
5364 case BAD_FILE:
5365 fprintf(file, "(null)");
5366 break;
5367 case IMM:
5368 switch (inst->src[i].type) {
5369 case ELK_REGISTER_TYPE_HF:
5370 fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
5371 break;
5372 case ELK_REGISTER_TYPE_F:
5373 fprintf(file, "%-gf", inst->src[i].f);
5374 break;
5375 case ELK_REGISTER_TYPE_DF:
5376 fprintf(file, "%fdf", inst->src[i].df);
5377 break;
5378 case ELK_REGISTER_TYPE_W:
5379 case ELK_REGISTER_TYPE_D:
5380 fprintf(file, "%dd", inst->src[i].d);
5381 break;
5382 case ELK_REGISTER_TYPE_UW:
5383 case ELK_REGISTER_TYPE_UD:
5384 fprintf(file, "%uu", inst->src[i].ud);
5385 break;
5386 case ELK_REGISTER_TYPE_Q:
5387 fprintf(file, "%" PRId64 "q", inst->src[i].d64);
5388 break;
5389 case ELK_REGISTER_TYPE_UQ:
5390 fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
5391 break;
5392 case ELK_REGISTER_TYPE_VF:
5393 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
5394 elk_vf_to_float((inst->src[i].ud >> 0) & 0xff),
5395 elk_vf_to_float((inst->src[i].ud >> 8) & 0xff),
5396 elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
5397 elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
5398 break;
5399 case ELK_REGISTER_TYPE_V:
5400 case ELK_REGISTER_TYPE_UV:
5401 fprintf(file, "%08x%s", inst->src[i].ud,
5402 inst->src[i].type == ELK_REGISTER_TYPE_V ? "V" : "UV");
5403 break;
5404 default:
5405 fprintf(file, "???");
5406 break;
5407 }
5408 break;
5409 case ARF:
5410 switch (inst->src[i].nr) {
5411 case ELK_ARF_NULL:
5412 fprintf(file, "null");
5413 break;
5414 case ELK_ARF_ADDRESS:
5415 fprintf(file, "a0.%d", inst->src[i].subnr);
5416 break;
5417 case ELK_ARF_ACCUMULATOR:
5418 fprintf(file, "acc%d", inst->src[i].subnr);
5419 break;
5420 case ELK_ARF_FLAG:
5421 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5422 break;
5423 default:
5424 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5425 break;
5426 }
5427 break;
5428 }
5429
5430 if (inst->src[i].offset ||
5431 (inst->src[i].file == VGRF &&
5432 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
5433 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
5434 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
5435 inst->src[i].offset % reg_size);
5436 }
5437
5438 if (inst->src[i].abs)
5439 fprintf(file, "|");
5440
5441 if (inst->src[i].file != IMM) {
5442 unsigned stride;
5443 if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
5444 unsigned hstride = inst->src[i].hstride;
5445 stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
5446 } else {
5447 stride = inst->src[i].stride;
5448 }
5449 if (stride != 1)
5450 fprintf(file, "<%u>", stride);
5451
5452 fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
5453 }
5454
5455 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
5456 fprintf(file, ", ");
5457 }
5458
5459 fprintf(file, " ");
5460
5461 if (inst->force_writemask_all)
5462 fprintf(file, "NoMask ");
5463
5464 if (inst->exec_size != dispatch_width)
5465 fprintf(file, "group%d ", inst->group);
5466
5467 fprintf(file, "\n");
5468 }
5469
5470 elk::register_pressure::register_pressure(const elk_fs_visitor *v)
5471 {
5472 const fs_live_variables &live = v->live_analysis.require();
5473 const unsigned num_instructions = v->cfg->num_blocks ?
5474 v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
5475
5476 regs_live_at_ip = new unsigned[num_instructions]();
5477
5478 for (unsigned reg = 0; reg < v->alloc.count; reg++) {
5479 for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
5480 regs_live_at_ip[ip] += v->alloc.sizes[reg];
5481 }
5482
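   /* Payload registers are implicitly live from the start of the program up
    * to their last use, so account for them as well.
    */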
5483 const unsigned payload_count = v->first_non_payload_grf;
5484
5485 int *payload_last_use_ip = new int[payload_count];
5486 v->calculate_payload_ranges(payload_count, payload_last_use_ip);
5487
5488 for (unsigned reg = 0; reg < payload_count; reg++) {
5489 for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
5490 ++regs_live_at_ip[ip];
5491 }
5492
5493 delete[] payload_last_use_ip;
5494 }
5495
5496 elk::register_pressure::~register_pressure()
5497 {
5498 delete[] regs_live_at_ip;
5499 }
5500
5501 void
5502 elk_fs_visitor::invalidate_analysis(elk::analysis_dependency_class c)
5503 {
5504 elk_backend_shader::invalidate_analysis(c);
5505 live_analysis.invalidate(c);
5506 regpressure_analysis.invalidate(c);
5507 }
5508
5509 void
5510 elk_fs_visitor::debug_optimizer(const nir_shader *nir,
5511 const char *pass_name,
5512 int iteration, int pass_num) const
5513 {
5514 if (!elk_should_print_shader(nir, DEBUG_OPTIMIZER))
5515 return;
5516
5517 char *filename;
5518 int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
5519 debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
5520 _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
5521 iteration, pass_num, pass_name);
5522 if (ret == -1)
5523 return;
5524 dump_instructions(filename);
5525 free(filename);
5526 }
5527
5528 void
5529 elk_fs_visitor::optimize()
5530 {
5531 debug_optimizer(nir, "start", 0, 0);
5532
5533 /* Start by validating the shader we currently have. */
5534 validate();
5535
5536 bool progress = false;
5537 int iteration = 0;
5538 int pass_num = 0;
5539
5540 #define OPT(pass, args...) ({ \
5541 pass_num++; \
5542 bool this_progress = pass(args); \
5543 \
5544 if (this_progress) \
5545 debug_optimizer(nir, #pass, iteration, pass_num); \
5546 \
5547 validate(); \
5548 \
5549 progress = progress || this_progress; \
5550 this_progress; \
5551 })
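   /* Note that OPT() evaluates to whether the pass made progress, which is
    * what allows passes to be chained conditionally below, e.g.
    *
    *    if (OPT(opt_copy_propagation))
    *       OPT(opt_algebraic);
    */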
5552
5553 assign_constant_locations();
5554 OPT(lower_constant_loads);
5555
5556 validate();
5557
5558 OPT(split_virtual_grfs);
5559
5560 /* Before anything else, eliminate dead code. The results of some NIR
5561    * instructions may effectively be calculated twice: once when the
5562 * instruction is encountered, and again when the user of that result is
5563 * encountered. Wipe those away before algebraic optimizations and
5564 * especially copy propagation can mix things up.
5565 */
5566 OPT(dead_code_eliminate);
5567
5568 OPT(remove_extra_rounding_modes);
5569
5570 do {
5571 progress = false;
5572 pass_num = 0;
5573 iteration++;
5574
5575 OPT(remove_duplicate_mrf_writes);
5576
5577 OPT(opt_algebraic);
5578 OPT(opt_cse);
5579 OPT(opt_copy_propagation);
5580 OPT(elk_opt_predicated_break, this);
5581 OPT(opt_cmod_propagation);
5582 OPT(dead_code_eliminate);
5583 OPT(opt_peephole_sel);
5584 OPT(elk_dead_control_flow_eliminate, this);
5585 OPT(opt_saturate_propagation);
5586 OPT(register_coalesce);
5587 OPT(compute_to_mrf);
5588 OPT(eliminate_find_live_channel);
5589
5590 OPT(compact_virtual_grfs);
5591 } while (progress);
5592
5593 progress = false;
5594 pass_num = 0;
5595
5596 if (OPT(lower_pack)) {
5597 OPT(register_coalesce);
5598 OPT(dead_code_eliminate);
5599 }
5600
5601 OPT(lower_simd_width);
5602 OPT(lower_barycentrics);
5603 OPT(lower_logical_sends);
5604
5605 /* After logical SEND lowering. */
5606
5607 if (OPT(opt_copy_propagation))
5608 OPT(opt_algebraic);
5609
5610    /* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
5611 * Do this before splitting SENDs.
5612 */
5613 if (devinfo->ver >= 7) {
5614 if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
5615 OPT(opt_algebraic);
5616 }
5617
5618 if (progress) {
5619 if (OPT(opt_copy_propagation))
5620 OPT(opt_algebraic);
5621
5622 /* Run after logical send lowering to give it a chance to CSE the
5623 * LOAD_PAYLOAD instructions created to construct the payloads of
5624 * e.g. texturing messages in cases where it wasn't possible to CSE the
5625 * whole logical instruction.
5626 */
5627 OPT(opt_cse);
5628 OPT(register_coalesce);
5629 OPT(compute_to_mrf);
5630 OPT(dead_code_eliminate);
5631 OPT(remove_duplicate_mrf_writes);
5632 OPT(opt_peephole_sel);
5633 }
5634
5635 OPT(opt_redundant_halt);
5636
5637 if (OPT(lower_load_payload)) {
5638 OPT(split_virtual_grfs);
5639
5640 /* Lower 64 bit MOVs generated by payload lowering. */
5641 if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
5642 OPT(opt_algebraic);
5643
5644 OPT(register_coalesce);
5645 OPT(lower_simd_width);
5646 OPT(compute_to_mrf);
5647 OPT(dead_code_eliminate);
5648 }
5649
5650 OPT(opt_combine_constants);
5651 if (OPT(lower_integer_multiplication)) {
5652 /* If lower_integer_multiplication made progress, it may have produced
5653 * some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
5654 * one more time to clean those up if they exist.
5655 */
5656 OPT(lower_integer_multiplication);
5657 }
5658 OPT(lower_sub_sat);
5659
5660 if (devinfo->ver <= 5 && OPT(lower_minmax)) {
5661 OPT(opt_cmod_propagation);
5662 OPT(opt_cse);
5663 if (OPT(opt_copy_propagation))
5664 OPT(opt_algebraic);
5665 OPT(dead_code_eliminate);
5666 }
5667
5668 progress = false;
5669 OPT(lower_regioning);
5670 if (progress) {
5671 if (OPT(opt_copy_propagation))
5672 OPT(opt_algebraic);
5673 OPT(dead_code_eliminate);
5674 OPT(lower_simd_width);
5675 }
5676
5677 OPT(lower_uniform_pull_constant_loads);
5678
5679 OPT(lower_find_live_channel);
5680
5681 validate();
5682 }
5683
5684 /**
5685  * Three-source instructions must have a GRF/MRF destination register.
5686 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
5687 */
5688 void
5689 elk_fs_visitor::fixup_3src_null_dest()
5690 {
5691 bool progress = false;
5692
5693 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
5694 if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
5695 inst->dst = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5696 inst->dst.type);
5697 progress = true;
5698 }
5699 }
5700
5701 if (progress)
5702 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
5703 DEPENDENCY_VARIABLES);
5704 }
5705
5706 uint32_t
5707 elk_fs_visitor::compute_max_register_pressure()
5708 {
5709 const register_pressure &rp = regpressure_analysis.require();
5710 uint32_t ip = 0, max_pressure = 0;
5711 foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5712 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5713 ip++;
5714 }
5715 return max_pressure;
5716 }
5717
5718 static elk_fs_inst **
5719 save_instruction_order(const struct elk_cfg_t *cfg)
5720 {
5721 /* Before we schedule anything, stash off the instruction order as an array
5722 * of elk_fs_inst *. This way, we can reset it between scheduling passes to
5723 * prevent dependencies between the different scheduling modes.
5724 */
5725 int num_insts = cfg->last_block()->end_ip + 1;
5726 elk_fs_inst **inst_arr = new elk_fs_inst * [num_insts];
5727
5728 int ip = 0;
5729 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
5730 assert(ip >= block->start_ip && ip <= block->end_ip);
5731 inst_arr[ip++] = inst;
5732 }
5733 assert(ip == num_insts);
5734
5735 return inst_arr;
5736 }
5737
5738 static void
5739 restore_instruction_order(struct elk_cfg_t *cfg, elk_fs_inst **inst_arr)
5740 {
5741 ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
5742
5743 int ip = 0;
5744 foreach_block (block, cfg) {
5745 block->instructions.make_empty();
5746
5747 assert(ip == block->start_ip);
5748 for (; ip <= block->end_ip; ip++)
5749 block->instructions.push_tail(inst_arr[ip]);
5750 }
5751 assert(ip == num_insts);
5752 }
5753
5754 void
5755 elk_fs_visitor::allocate_registers(bool allow_spilling)
5756 {
5757 bool allocated;
5758
5759 static const enum instruction_scheduler_mode pre_modes[] = {
5760 SCHEDULE_PRE,
5761 SCHEDULE_PRE_NON_LIFO,
5762 SCHEDULE_NONE,
5763 SCHEDULE_PRE_LIFO,
5764 };
5765
5766 static const char *scheduler_mode_name[] = {
5767 [SCHEDULE_PRE] = "top-down",
5768 [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
5769 [SCHEDULE_PRE_LIFO] = "lifo",
5770 [SCHEDULE_POST] = "post",
5771 [SCHEDULE_NONE] = "none",
5772 };
5773
5774 uint32_t best_register_pressure = UINT32_MAX;
5775 enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
5776
5777 compact_virtual_grfs();
5778
5779 if (needs_register_pressure)
5780 shader_stats.max_register_pressure = compute_max_register_pressure();
5781
5782 debug_optimizer(nir, "pre_register_allocate", 90, 90);
5783
5784 bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
5785
5786 /* Before we schedule anything, stash off the instruction order as an array
5787 * of elk_fs_inst *. This way, we can reset it between scheduling passes to
5788 * prevent dependencies between the different scheduling modes.
5789 */
5790 elk_fs_inst **orig_order = save_instruction_order(cfg);
5791 elk_fs_inst **best_pressure_order = NULL;
5792
5793 void *scheduler_ctx = ralloc_context(NULL);
5794 elk_fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
5795
5796 /* Try each scheduling heuristic to see if it can successfully register
5797 * allocate without spilling. They should be ordered by decreasing
5798 * performance but increasing likelihood of allocating.
5799 */
5800 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5801 enum instruction_scheduler_mode sched_mode = pre_modes[i];
5802
5803 schedule_instructions_pre_ra(sched, sched_mode);
5804 this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
5805
5806 debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
5807
5808 if (0) {
5809 assign_regs_trivial();
5810 allocated = true;
5811 break;
5812 }
5813
5814 /* We should only spill registers on the last scheduling. */
5815 assert(!spilled_any_registers);
5816
5817 allocated = assign_regs(false, spill_all);
5818 if (allocated)
5819 break;
5820
5821 /* Save the maximum register pressure */
5822 uint32_t this_pressure = compute_max_register_pressure();
5823
5824 if (0) {
5825 fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
5826 scheduler_mode_name[sched_mode], this_pressure);
5827 }
5828
5829 if (this_pressure < best_register_pressure) {
5830 best_register_pressure = this_pressure;
5831 best_sched = sched_mode;
5832 delete[] best_pressure_order;
5833 best_pressure_order = save_instruction_order(cfg);
5834 }
5835
5836 /* Reset back to the original order before trying the next mode */
5837 restore_instruction_order(cfg, orig_order);
5838 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
5839 }
5840
5841 ralloc_free(scheduler_ctx);
5842
5843 if (!allocated) {
5844 if (0) {
5845 fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
5846 scheduler_mode_name[best_sched]);
5847 }
5848 restore_instruction_order(cfg, best_pressure_order);
5849 shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
5850
5851 allocated = assign_regs(allow_spilling, spill_all);
5852 }
5853
5854 delete[] orig_order;
5855 delete[] best_pressure_order;
5856
5857 if (!allocated) {
5858 fail("Failure to register allocate. Reduce number of "
5859 "live scalar values to avoid this.");
5860 } else if (spilled_any_registers) {
5861 elk_shader_perf_log(compiler, log_data,
5862 "%s shader triggered register spilling. "
5863 "Try reducing the number of live scalar "
5864 "values to improve performance.\n",
5865 _mesa_shader_stage_to_string(stage));
5866 }
5867
5868 /* This must come after all optimization and register allocation, since
5869 * it inserts dead code that happens to have side effects, and it does
5870 * so based on the actual physical registers in use.
5871 */
5872 insert_gfx4_send_dependency_workarounds();
5873
5874 if (failed)
5875 return;
5876
5877 opt_bank_conflicts();
5878
5879 schedule_instructions_post_ra();
5880
5881 if (last_scratch > 0) {
5882 ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
5883
5884 /* Take the max of any previously compiled variant of the shader. In the
5885 * case of bindless shaders with return parts, this will also take the
5886 * max of all parts.
5887 */
5888 prog_data->total_scratch = MAX2(elk_get_scratch_size(last_scratch),
5889 prog_data->total_scratch);
5890
5891 if (gl_shader_stage_is_compute(stage)) {
5892 if (devinfo->platform == INTEL_PLATFORM_HSW) {
5893 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5894 * field documentation, Haswell supports a minimum of 2kB of
5895 * scratch space for compute shaders, unlike every other stage
5896 * and platform.
5897 */
5898 prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
5899 } else if (devinfo->ver <= 7) {
5900 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5901 * field documentation, platforms prior to Haswell measure scratch
5902 * size linearly with a range of [1kB, 12kB] and 1kB granularity.
5903 */
5904 prog_data->total_scratch = ALIGN(last_scratch, 1024);
5905 max_scratch_size = 12 * 1024;
5906 }
5907 }
5908
5909 /* We currently only support up to 2MB of scratch space. If we
5910 * need to support more eventually, the documentation suggests
5911 * that we could allocate a larger buffer, and partition it out
5912 * ourselves. We'd just have to undo the hardware's address
5913 * calculation by subtracting (FFTID * Per Thread Scratch Space)
5914 * and then add FFTID * (Larger Per Thread Scratch Space).
5915 *
5916 * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
5917 * Thread Group Tracking > Local Memory/Scratch Space.
5918 */
5919 assert(prog_data->total_scratch < max_scratch_size);
5920 }
5921 }
5922
5923 bool
5924 elk_fs_visitor::run_vs()
5925 {
5926 assert(stage == MESA_SHADER_VERTEX);
5927
5928 payload_ = new elk_vs_thread_payload(*this);
5929
5930 nir_to_elk(this);
5931
5932 if (failed)
5933 return false;
5934
5935 emit_urb_writes();
5936
5937 calculate_cfg();
5938
5939 optimize();
5940
5941 assign_curb_setup();
5942 assign_vs_urb_setup();
5943
5944 fixup_3src_null_dest();
5945
5946 allocate_registers(true /* allow_spilling */);
5947
5948 return !failed;
5949 }
5950
5951 void
5952 elk_fs_visitor::set_tcs_invocation_id()
5953 {
5954 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(prog_data);
5955 struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
5956 const fs_builder bld = fs_builder(this).at_end();
5957
5958 const unsigned instance_id_mask = INTEL_MASK(23, 17);
5959 const unsigned instance_id_shift = 17;
5960
5961 elk_fs_reg t = bld.vgrf(ELK_REGISTER_TYPE_UD);
5962 bld.AND(t, elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD)),
5963 elk_imm_ud(instance_id_mask));
5964
5965 invocation_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
5966
5967 if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
5968 /* gl_InvocationID is just the thread number */
5969 bld.SHR(invocation_id, t, elk_imm_ud(instance_id_shift));
5970 return;
5971 }
5972
5973 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
5974
5975 elk_fs_reg channels_uw = bld.vgrf(ELK_REGISTER_TYPE_UW);
5976 elk_fs_reg channels_ud = bld.vgrf(ELK_REGISTER_TYPE_UD);
5977 bld.MOV(channels_uw, elk_fs_reg(elk_imm_uv(0x76543210)));
5978 bld.MOV(channels_ud, channels_uw);
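   /* channels_ud now holds 0..7 per channel (from the 0x76543210 vector
    * immediate); in SINGLE_PATCH mode gl_InvocationID is then
    * instance * 8 + channel.
    */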
5979
5980 if (tcs_prog_data->instances == 1) {
5981 invocation_id = channels_ud;
5982 } else {
5983 elk_fs_reg instance_times_8 = bld.vgrf(ELK_REGISTER_TYPE_UD);
5984 bld.SHR(instance_times_8, t, elk_imm_ud(instance_id_shift - 3));
5985 bld.ADD(invocation_id, instance_times_8, channels_ud);
5986 }
5987 }
5988
5989 void
5990 elk_fs_visitor::emit_tcs_thread_end()
5991 {
5992    /* Try to tag the last URB write with EOT instead of emitting a whole
5993 * separate write just to finish the thread. There isn't guaranteed to
5994 * be one, so this may not succeed.
5995 */
5996 if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
5997 return;
5998
5999 const fs_builder bld = fs_builder(this).at_end();
6000
6001 /* Emit a URB write to end the thread. On Broadwell, we use this to write
6002 * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
6003 * algorithm to set it optimally). On other platforms, we simply write
6004 * zero to a reserved/MBZ patch header DWord which has no consequence.
6005 */
6006 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
6007 srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
6008 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = elk_imm_ud(WRITEMASK_X << 16);
6009 srcs[URB_LOGICAL_SRC_DATA] = elk_imm_ud(0);
6010 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
6011 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL,
6012 reg_undef, srcs, ARRAY_SIZE(srcs));
6013 inst->eot = true;
6014 }
6015
6016 bool
6017 elk_fs_visitor::run_tcs()
6018 {
6019 assert(stage == MESA_SHADER_TESS_CTRL);
6020
6021 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
6022 const fs_builder bld = fs_builder(this).at_end();
6023
6024 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
6025 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
6026
6027 payload_ = new elk_tcs_thread_payload(*this);
6028
6029 /* Initialize gl_InvocationID */
6030 set_tcs_invocation_id();
6031
6032 const bool fix_dispatch_mask =
6033 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
6034 (nir->info.tess.tcs_vertices_out % 8) != 0;
6035
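   /* In SINGLE_PATCH mode each channel handles one TCS invocation, so when
    * the output vertex count isn't a multiple of 8 the trailing channels map
    * to invocations past tcs_vertices_out and have to be masked off with the
    * CMP/IF below.
    */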
6036    /* Fix the dispatch mask */
6037 if (fix_dispatch_mask) {
6038 bld.CMP(bld.null_reg_ud(), invocation_id,
6039 elk_imm_ud(nir->info.tess.tcs_vertices_out), ELK_CONDITIONAL_L);
6040 bld.IF(ELK_PREDICATE_NORMAL);
6041 }
6042
6043 nir_to_elk(this);
6044
6045 if (fix_dispatch_mask) {
6046 bld.emit(ELK_OPCODE_ENDIF);
6047 }
6048
6049 emit_tcs_thread_end();
6050
6051 if (failed)
6052 return false;
6053
6054 calculate_cfg();
6055
6056 optimize();
6057
6058 assign_curb_setup();
6059 assign_tcs_urb_setup();
6060
6061 fixup_3src_null_dest();
6062
6063 allocate_registers(true /* allow_spilling */);
6064
6065 return !failed;
6066 }
6067
6068 bool
6069 elk_fs_visitor::run_tes()
6070 {
6071 assert(stage == MESA_SHADER_TESS_EVAL);
6072
6073 payload_ = new elk_tes_thread_payload(*this);
6074
6075 nir_to_elk(this);
6076
6077 if (failed)
6078 return false;
6079
6080 emit_urb_writes();
6081
6082 calculate_cfg();
6083
6084 optimize();
6085
6086 assign_curb_setup();
6087 assign_tes_urb_setup();
6088
6089 fixup_3src_null_dest();
6090
6091 allocate_registers(true /* allow_spilling */);
6092
6093 return !failed;
6094 }
6095
6096 bool
6097 elk_fs_visitor::run_gs()
6098 {
6099 assert(stage == MESA_SHADER_GEOMETRY);
6100
6101 payload_ = new elk_gs_thread_payload(*this);
6102
6103 this->final_gs_vertex_count = vgrf(glsl_uint_type());
6104
6105 if (gs_compile->control_data_header_size_bits > 0) {
6106 /* Create a VGRF to store accumulated control data bits. */
6107 this->control_data_bits = vgrf(glsl_uint_type());
6108
6109 /* If we're outputting more than 32 control data bits, then EmitVertex()
6110 * will set control_data_bits to 0 after emitting the first vertex.
6111 * Otherwise, we need to initialize it to 0 here.
6112 */
6113 if (gs_compile->control_data_header_size_bits <= 32) {
6114 const fs_builder bld = fs_builder(this).at_end();
6115 const fs_builder abld = bld.annotate("initialize control data bits");
6116 abld.MOV(this->control_data_bits, elk_imm_ud(0u));
6117 }
6118 }
6119
6120 nir_to_elk(this);
6121
6122 emit_gs_thread_end();
6123
6124 if (failed)
6125 return false;
6126
6127 calculate_cfg();
6128
6129 optimize();
6130
6131 assign_curb_setup();
6132 assign_gs_urb_setup();
6133
6134 fixup_3src_null_dest();
6135
6136 allocate_registers(true /* allow_spilling */);
6137
6138 return !failed;
6139 }
6140
6141 bool
6142 elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
6143 {
6144 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(this->prog_data);
6145 elk_wm_prog_key *wm_key = (elk_wm_prog_key *) this->key;
6146 const fs_builder bld = fs_builder(this).at_end();
6147
6148 assert(stage == MESA_SHADER_FRAGMENT);
6149
6150 payload_ = new elk_fs_thread_payload(*this, source_depth_to_render_target,
6151 runtime_check_aads_emit);
6152
6153 if (do_rep_send) {
6154 assert(dispatch_width == 16);
6155 emit_repclear_shader();
6156 } else {
6157 if (nir->info.inputs_read > 0 ||
6158 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
6159 (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
6160 if (devinfo->ver < 6)
6161 emit_interpolation_setup_gfx4();
6162 else
6163 emit_interpolation_setup_gfx6();
6164 }
6165
6166 /* We handle discards by keeping track of the still-live pixels in f0.1.
6167 * Initialize it with the dispatched pixels.
6168 */
6169 if (wm_prog_data->uses_kill) {
6170 const unsigned lower_width = MIN2(dispatch_width, 16);
6171 for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
6172 /* According to the "PS Thread Payload for Normal
6173 * Dispatch" pages on the BSpec, the dispatch mask is
6174 * stored in R1.7/R2.7 on gfx6+.
6175 */
6176 const elk_fs_reg dispatch_mask =
6177 devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) :
6178 elk_vec1_grf(0, 0);
6179 bld.exec_all().group(1, 0)
6180 .MOV(elk_sample_mask_reg(bld.group(lower_width, i)),
6181 retype(dispatch_mask, ELK_REGISTER_TYPE_UW));
6182 }
6183 }
6184
6185 if (nir->info.writes_memory)
6186 wm_prog_data->has_side_effects = true;
6187
6188 nir_to_elk(this);
6189
6190 if (failed)
6191 return false;
6192
6193 if (wm_key->emit_alpha_test)
6194 emit_alpha_test();
6195
6196 emit_fb_writes();
6197
6198 calculate_cfg();
6199
6200 optimize();
6201
6202 assign_curb_setup();
6203
6204 assign_urb_setup();
6205
6206 fixup_3src_null_dest();
6207
6208 allocate_registers(allow_spilling);
6209 }
6210
6211 return !failed;
6212 }
6213
6214 bool
6215 elk_fs_visitor::run_cs(bool allow_spilling)
6216 {
6217 assert(gl_shader_stage_is_compute(stage));
6218 assert(devinfo->ver >= 7);
6219 const fs_builder bld = fs_builder(this).at_end();
6220
6221 payload_ = new elk_cs_thread_payload(*this);
6222
6223 if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
6224 /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
6225 const fs_builder abld = bld.exec_all().group(1, 0);
6226 abld.MOV(retype(elk_sr0_reg(1), ELK_REGISTER_TYPE_UW),
6227 suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW), 1));
6228 }
6229
6230 nir_to_elk(this);
6231
6232 if (failed)
6233 return false;
6234
6235 emit_cs_terminate();
6236
6237 calculate_cfg();
6238
6239 optimize();
6240
6241 assign_curb_setup();
6242
6243 fixup_3src_null_dest();
6244
6245 allocate_registers(allow_spilling);
6246
6247 return !failed;
6248 }
6249
6250 static bool
6251 is_used_in_not_interp_frag_coord(nir_def *def)
6252 {
6253 nir_foreach_use_including_if(src, def) {
6254 if (nir_src_is_if(src))
6255 return true;
6256
6257 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
6258 return true;
6259
6260 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
6261 if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
6262 return true;
6263 }
6264
6265 return false;
6266 }
6267
6268 /**
6269 * Return a bitfield where bit n is set if barycentric interpolation mode n
6270 * (see enum elk_barycentric_mode) is needed by the fragment shader.
6271 *
6272 * We examine the load_barycentric intrinsics rather than looking at input
6273 * variables so that we catch interpolateAtCentroid() messages too, which
6274 * also need the ELK_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
6275 */
6276 static unsigned
6277 elk_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
6278 const nir_shader *shader)
6279 {
6280 unsigned barycentric_interp_modes = 0;
6281
6282 nir_foreach_function_impl(impl, shader) {
6283 nir_foreach_block(block, impl) {
6284 nir_foreach_instr(instr, block) {
6285 if (instr->type != nir_instr_type_intrinsic)
6286 continue;
6287
6288 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6289 switch (intrin->intrinsic) {
6290 case nir_intrinsic_load_barycentric_pixel:
6291 case nir_intrinsic_load_barycentric_centroid:
6292 case nir_intrinsic_load_barycentric_sample:
6293 case nir_intrinsic_load_barycentric_at_sample:
6294 case nir_intrinsic_load_barycentric_at_offset:
6295 break;
6296 default:
6297 continue;
6298 }
6299
6300 /* Ignore WPOS; it doesn't require interpolation. */
6301 if (!is_used_in_not_interp_frag_coord(&intrin->def))
6302 continue;
6303
6304 nir_intrinsic_op bary_op = intrin->intrinsic;
6305 enum elk_barycentric_mode bary =
6306 elk_barycentric_mode(intrin);
6307
6308 barycentric_interp_modes |= 1 << bary;
6309
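         /* On platforms with the unlit-centroid workaround, also request the
          * corresponding pixel-center mode so the payload carries barycentrics
          * that can stand in for the undefined centroid values of pixels with
          * no lit samples.
          */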
6310 if (devinfo->needs_unlit_centroid_workaround &&
6311 bary_op == nir_intrinsic_load_barycentric_centroid)
6312 barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
6313 }
6314 }
6315 }
6316
6317 return barycentric_interp_modes;
6318 }
6319
6320 static void
6321 elk_compute_flat_inputs(struct elk_wm_prog_data *prog_data,
6322 const nir_shader *shader)
6323 {
6324 prog_data->flat_inputs = 0;
6325
6326 nir_foreach_shader_in_variable(var, shader) {
6327 /* flat shading */
6328 if (var->data.interpolation != INTERP_MODE_FLAT)
6329 continue;
6330
6331 if (var->data.per_primitive)
6332 continue;
6333
6334 unsigned slots = glsl_count_attribute_slots(var->type, false);
6335 for (unsigned s = 0; s < slots; s++) {
6336 int input_index = prog_data->urb_setup[var->data.location + s];
6337
6338 if (input_index >= 0)
6339 prog_data->flat_inputs |= 1 << input_index;
6340 }
6341 }
6342 }
6343
6344 static uint8_t
6345 computed_depth_mode(const nir_shader *shader)
6346 {
6347 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
6348 switch (shader->info.fs.depth_layout) {
6349 case FRAG_DEPTH_LAYOUT_NONE:
6350 case FRAG_DEPTH_LAYOUT_ANY:
6351 return ELK_PSCDEPTH_ON;
6352 case FRAG_DEPTH_LAYOUT_GREATER:
6353 return ELK_PSCDEPTH_ON_GE;
6354 case FRAG_DEPTH_LAYOUT_LESS:
6355 return ELK_PSCDEPTH_ON_LE;
6356 case FRAG_DEPTH_LAYOUT_UNCHANGED:
6357 /* We initially set this to OFF, but having the shader write the
6358 * depth means we allocate register space in the SEND message. The
6359 * difference between the SEND register count and the OFF state
6360 * programming makes the HW hang.
6361 *
6362 * Removing the depth writes also leads to test failures. So use
6363 * LesserThanOrEqual, which fits writing the same value
6364 * (unchanged/equal).
6365 *
6366 */
6367 return ELK_PSCDEPTH_ON_LE;
6368 }
6369 }
6370 return ELK_PSCDEPTH_OFF;
6371 }
6372
6373 /**
6374 * Move load_interpolated_input with simple (payload-based) barycentric modes
6375 * to the top of the program so we don't emit multiple PLNs for the same input.
6376 *
6377 * This works around CSE not being able to handle non-dominating cases
6378 * such as:
6379 *
6380 * if (...) {
6381 * interpolate input
6382 * } else {
6383 * interpolate the same exact input
6384 * }
6385 *
6386 * This should be replaced by global value numbering someday.
6387 */
6388 bool
6389 elk_nir_move_interpolation_to_top(nir_shader *nir)
6390 {
6391 bool progress = false;
6392
6393 nir_foreach_function_impl(impl, nir) {
6394 nir_block *top = nir_start_block(impl);
6395 nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
6396 bool impl_progress = false;
6397
6398 for (nir_block *block = nir_block_cf_tree_next(top);
6399 block != NULL;
6400 block = nir_block_cf_tree_next(block)) {
6401
6402 nir_foreach_instr_safe(instr, block) {
6403 if (instr->type != nir_instr_type_intrinsic)
6404 continue;
6405
6406 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6407 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
6408 continue;
6409 nir_intrinsic_instr *bary_intrinsic =
6410 nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
6411 nir_intrinsic_op op = bary_intrinsic->intrinsic;
6412
6413 /* Leave interpolateAtSample/Offset() where they are. */
6414 if (op == nir_intrinsic_load_barycentric_at_sample ||
6415 op == nir_intrinsic_load_barycentric_at_offset)
6416 continue;
6417
6418 nir_instr *move[3] = {
6419 &bary_intrinsic->instr,
6420 intrin->src[1].ssa->parent_instr,
6421 instr
6422 };
6423
6424 for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
6425 if (move[i]->block != top) {
6426 nir_instr_move(cursor, move[i]);
6427 impl_progress = true;
6428 }
6429 }
6430 }
6431 }
6432
6433 progress = progress || impl_progress;
6434
6435 nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
6436 : nir_metadata_all);
6437 }
6438
6439 return progress;
6440 }
6441
6442 static void
6443 elk_nir_populate_wm_prog_data(nir_shader *shader,
6444 const struct intel_device_info *devinfo,
6445 const struct elk_wm_prog_key *key,
6446 struct elk_wm_prog_data *prog_data)
6447 {
6448    /* key->emit_alpha_test means simulating alpha testing via discards,
6449 * so the shader definitely kills pixels.
6450 */
6451 prog_data->uses_kill = shader->info.fs.uses_discard ||
6452 key->emit_alpha_test;
6453 prog_data->uses_omask = !key->ignore_sample_mask_out &&
6454 (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
6455 prog_data->color_outputs_written = key->color_outputs_valid;
6456 prog_data->computed_depth_mode = computed_depth_mode(shader);
6457 prog_data->computed_stencil =
6458 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
6459
6460 prog_data->sample_shading =
6461 shader->info.fs.uses_sample_shading ||
6462 shader->info.outputs_read;
6463
6464 assert(key->multisample_fbo != ELK_NEVER ||
6465 key->persample_interp == ELK_NEVER);
6466
6467 prog_data->persample_dispatch = key->persample_interp;
6468 if (prog_data->sample_shading)
6469 prog_data->persample_dispatch = ELK_ALWAYS;
6470
6471 /* We can only persample dispatch if we have a multisample FBO */
6472 prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
6473 key->multisample_fbo);
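   /* persample_dispatch is a tri-state value; assuming the usual
    * NEVER < SOMETIMES < ALWAYS ordering of the enum, the MIN2 above simply
    * clamps it to what the multisample FBO state allows.
    */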
6474
6475 /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
6476 * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
6477 * to definitively tell whether alpha_to_coverage is on or off.
6478 */
6479 prog_data->alpha_to_coverage = key->alpha_to_coverage;
6480 assert(prog_data->alpha_to_coverage != ELK_SOMETIMES ||
6481 prog_data->persample_dispatch == ELK_SOMETIMES);
6482
6483 if (devinfo->ver >= 6) {
6484 prog_data->uses_sample_mask =
6485 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
6486
6487 /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
6488 *
6489 * "MSDISPMODE_PERSAMPLE is required in order to select
6490 * POSOFFSET_SAMPLE"
6491 *
6492 * So we can only really get sample positions if we are doing real
6493 * per-sample dispatch. If we need gl_SamplePosition and we don't have
6494 * persample dispatch, we hard-code it to 0.5.
6495 */
6496 prog_data->uses_pos_offset =
6497 prog_data->persample_dispatch != ELK_NEVER &&
6498 (BITSET_TEST(shader->info.system_values_read,
6499 SYSTEM_VALUE_SAMPLE_POS) ||
6500 BITSET_TEST(shader->info.system_values_read,
6501 SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
6502 }
6503
6504 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
6505 prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
6506 prog_data->inner_coverage = shader->info.fs.inner_coverage;
6507
6508 prog_data->barycentric_interp_modes =
6509 elk_compute_barycentric_interp_modes(devinfo, shader);
6510
6511 /* From the BDW PRM documentation for 3DSTATE_WM:
6512 *
6513 * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
6514 * Sample or Non- perspective Sample barycentric coordinates."
6515 *
6516     * So clean up any potentially set sample barycentric mode when not doing
6517     * per-sample dispatch.
6518 */
6519 if (prog_data->persample_dispatch == ELK_NEVER) {
6520 prog_data->barycentric_interp_modes &=
6521 ~BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
6522 }
6523
6524 prog_data->uses_nonperspective_interp_modes |=
6525 (prog_data->barycentric_interp_modes &
6526 ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
6527
6528 /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
6529 * Message Descriptor :
6530 *
6531 * "Message Type. Specifies the type of message being sent when
6532 * pixel-rate evaluation is requested :
6533 *
6534 * Format = U2
6535 * 0: Per Message Offset (eval_snapped with immediate offset)
6536 * 1: Sample Position Offset (eval_sindex)
6537 * 2: Centroid Position Offset (eval_centroid)
6538 * 3: Per Slot Offset (eval_snapped with register offset)
6539 *
6540 * Message Type. Specifies the type of message being sent when
6541 * coarse-rate evaluation is requested :
6542 *
6543 * Format = U2
6544 * 0: Coarse to Pixel Mapping Message (internal message)
6545 * 1: Reserved
6546 * 2: Coarse Centroid Position (eval_centroid)
6547 * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
6548 *
6549 * The Sample Position Offset is marked as reserved for coarse rate
6550 * evaluation and leads to hangs if we try to use it. So disable coarse
6551 * pixel shading if we have any intrinsic that will result in a pixel
6552 * interpolater message at sample.
6553 */
6554 intel_nir_pulls_at_sample(shader);
6555
6556 /* We choose to always enable VMask prior to XeHP, as it would cause
6557 * us to lose out on the eliminate_find_live_channel() optimization.
6558 */
6559 prog_data->uses_vmask = true;
6560
6561 prog_data->uses_src_w =
6562 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6563 prog_data->uses_src_depth =
6564 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6565
6566 calculate_urb_setup(devinfo, key, prog_data, shader);
6567 elk_compute_flat_inputs(prog_data, shader);
6568 }
6569
6570 /**
6571 * Pre-gfx6, the register file of the EUs was shared between threads,
6572 * and each thread used some subset allocated on a 16-register block
6573 * granularity. The unit states wanted these block counts.
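 *
 * For example, 44 GRFs round up to 48, giving 48 / 16 - 1 == 2, which
 * appears to be a zero-based count of 16-register blocks.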
6574 */
6575 static inline int
6576 elk_register_blocks(int reg_count)
6577 {
6578 return ALIGN(reg_count, 16) / 16 - 1;
6579 }
6580
6581 const unsigned *
6582 elk_compile_fs(const struct elk_compiler *compiler,
6583 struct elk_compile_fs_params *params)
6584 {
6585 struct nir_shader *nir = params->base.nir;
6586 const struct elk_wm_prog_key *key = params->key;
6587 struct elk_wm_prog_data *prog_data = params->prog_data;
6588 bool allow_spilling = params->allow_spilling;
6589 const bool debug_enabled =
6590 elk_should_print_shader(nir, params->base.debug_flag ?
6591 params->base.debug_flag : DEBUG_WM);
6592
6593 prog_data->base.stage = MESA_SHADER_FRAGMENT;
6594 prog_data->base.total_scratch = 0;
6595
6596 const struct intel_device_info *devinfo = compiler->devinfo;
6597 const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
6598
6599 elk_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
6600 elk_nir_lower_fs_inputs(nir, devinfo, key);
6601 elk_nir_lower_fs_outputs(nir);
6602
6603 if (devinfo->ver < 6)
6604 elk_setup_vue_interpolation(params->vue_map, nir, prog_data);
6605
6606 /* From the SKL PRM, Volume 7, "Alpha Coverage":
6607 * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
6608 * hardware, regardless of the state setting for this feature."
6609 */
6610 if (devinfo->ver > 6 && key->alpha_to_coverage != ELK_NEVER) {
6611 /* Run constant fold optimization in order to get the correct source
6612 * offset to determine render target 0 store instruction in
6613 * emit_alpha_to_coverage pass.
6614 */
6615 NIR_PASS(_, nir, nir_opt_constant_folding);
6616 NIR_PASS(_, nir, elk_nir_lower_alpha_to_coverage, key, prog_data);
6617 }
6618
6619 NIR_PASS(_, nir, elk_nir_move_interpolation_to_top);
6620 elk_postprocess_nir(nir, compiler, debug_enabled,
6621 key->base.robust_flags);
6622
6623 elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
6624
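   /* Compile strategy, as implemented below: always try SIMD8 first, then
    * attempt SIMD16 and SIMD32 variants as long as the narrower compile did
    * not spill.  Wider variants reuse the SIMD8 uniform layout, and a SIMD32
    * binary is only kept if its estimated throughput beats the narrower ones
    * (or INTEL_DEBUG(DEBUG_DO32) forces it).
    */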
   std::unique_ptr<elk_fs_visitor> v8, v16, v32, vmulti;
   elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
   float throughput = 0;
   bool has_spilled = false;

   v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
                                         prog_data, nir, 8,
                                         params->base.stats != NULL,
                                         debug_enabled);
   if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
      params->base.error_str = ralloc_strdup(params->base.mem_ctx,
                                             v8->fail_msg);
      return NULL;
   } else if (INTEL_SIMD(FS, 8)) {
      simd8_cfg = v8->cfg;

      assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
      prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);

      prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
      const performance &perf = v8->performance_analysis.require();
      throughput = MAX2(throughput, perf.throughput);
      has_spilled = v8->spilled_any_registers;
      allow_spilling = false;
   }

   /* Limit dispatch width to simd8 with dual source blending on gfx8.
    * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
    */
   if (devinfo->ver == 8 && prog_data->dual_src_blend &&
       INTEL_SIMD(FS, 8)) {
      assert(!params->use_rep_send);
      v8->limit_dispatch_width(8, "gfx8 workaround: "
                                  "using SIMD8 when dual src blending.\n");
   }

   if (!has_spilled &&
       (!v8 || v8->max_dispatch_width >= 16) &&
       (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
      /* Try a SIMD16 compile */
      v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
                                             prog_data, nir, 16,
                                             params->base.stats != NULL,
                                             debug_enabled);
      if (v8)
         v16->import_uniforms(v8.get());
      if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
         elk_shader_perf_log(compiler, params->base.log_data,
                             "SIMD16 shader failed to compile: %s\n",
                             v16->fail_msg);
      } else {
         simd16_cfg = v16->cfg;

         assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
         prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);

         prog_data->reg_blocks_16 = elk_register_blocks(v16->grf_used);
         const performance &perf = v16->performance_analysis.require();
         throughput = MAX2(throughput, perf.throughput);
         has_spilled = v16->spilled_any_registers;
         allow_spilling = false;
      }
   }

   const bool simd16_failed = v16 && !simd16_cfg;

   /* Currently, the compiler only supports SIMD32 on SNB+ */
   if (!has_spilled &&
       (!v8 || v8->max_dispatch_width >= 32) &&
       (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
       devinfo->ver >= 6 && !simd16_failed &&
       INTEL_SIMD(FS, 32)) {
      /* Try a SIMD32 compile */
      v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
                                             prog_data, nir, 32,
                                             params->base.stats != NULL,
                                             debug_enabled);
      if (v8)
         v32->import_uniforms(v8.get());
      else if (v16)
         v32->import_uniforms(v16.get());

      if (!v32->run_fs(allow_spilling, false)) {
         elk_shader_perf_log(compiler, params->base.log_data,
                             "SIMD32 shader failed to compile: %s\n",
                             v32->fail_msg);
      } else {
         const performance &perf = v32->performance_analysis.require();

         if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
            elk_shader_perf_log(compiler, params->base.log_data,
                                "SIMD32 shader inefficient\n");
         } else {
            simd32_cfg = v32->cfg;

            assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
            prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);

            prog_data->reg_blocks_32 = elk_register_blocks(v32->grf_used);
            throughput = MAX2(throughput, perf.throughput);
         }
      }
   }

   /* When the caller requests a repclear shader, they want SIMD16-only */
   if (params->use_rep_send)
      simd8_cfg = NULL;

   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
    * at the top to select the shader. We've never implemented that.
    * Instead, we just give them exactly one shader and we pick the widest one
    * available.
    */
   if (compiler->devinfo->ver < 5) {
      if (simd32_cfg || simd16_cfg)
         simd8_cfg = NULL;
      if (simd32_cfg)
         simd16_cfg = NULL;
   }

   /* If computed depth is enabled SNB only allows SIMD8. */
   if (compiler->devinfo->ver == 6 &&
       prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF)
      assert(simd16_cfg == NULL && simd32_cfg == NULL);

   if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
      /* Iron Lake and earlier only have one Dispatch GRF start field. Make
       * the data available in the base prog data struct for convenience.
       */
      if (simd16_cfg) {
         prog_data->base.dispatch_grf_start_reg =
            prog_data->dispatch_grf_start_reg_16;
      } else if (simd32_cfg) {
         prog_data->base.dispatch_grf_start_reg =
            prog_data->dispatch_grf_start_reg_32;
      }
   }

   elk_fs_generator g(compiler, &params->base, &prog_data->base,
                      v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);

   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s fragment shader %s",
                                     nir->info.label ?
                                        nir->info.label : "unnamed",
                                     nir->info.name));
   }

   struct elk_compile_stats *stats = params->base.stats;
   uint32_t max_dispatch_width = 0;

   if (simd8_cfg) {
      prog_data->dispatch_8 = true;
      g.generate_code(simd8_cfg, 8, v8->shader_stats,
                      v8->performance_analysis.require(), stats);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 8;
   }

   if (simd16_cfg) {
      prog_data->dispatch_16 = true;
      prog_data->prog_offset_16 = g.generate_code(
         simd16_cfg, 16, v16->shader_stats,
         v16->performance_analysis.require(), stats);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 16;
   }

   if (simd32_cfg) {
      prog_data->dispatch_32 = true;
      prog_data->prog_offset_32 = g.generate_code(
         simd32_cfg, 32, v32->shader_stats,
         v32->performance_analysis.require(), stats);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 32;
   }

   for (struct elk_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
      s->max_dispatch_width = max_dispatch_width;

   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}

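/**
 * Total push constant size in bytes for a CS dispatch: one copy of the
 * cross-thread block plus one per-thread block for each of \p threads
 * hardware threads.
 */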
unsigned
elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
                             unsigned threads)
{
   assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
   assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
   return cs_prog_data->push.per_thread.size * threads +
          cs_prog_data->push.cross_thread.size;
}

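/* Convert a push constant block size in dwords into GRF terms: each 256-bit
 * register holds 8 dwords, so regs = DIV_ROUND_UP(dwords, 8) and the byte
 * size is regs * 32.
 */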
static void
fill_push_const_block_info(struct elk_push_const_block *block, unsigned dwords)
{
   block->dwords = dwords;
   block->regs = DIV_ROUND_UP(dwords, 8);
   block->size = block->regs * 32;
}

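/* Split the CS push constants into a cross-thread block (shared by all
 * threads of the workgroup) and a per-thread block.  When the hardware
 * supports cross-thread constants (HSW+), everything except the GRF holding
 * the subgroup ID is made cross-thread; otherwise all constants are
 * per-thread.
 */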
static void
cs_fill_push_const_info(const struct intel_device_info *devinfo,
                        struct elk_cs_prog_data *cs_prog_data)
{
   const struct elk_stage_prog_data *prog_data = &cs_prog_data->base;
   int subgroup_id_index = elk_get_subgroup_id_param_index(devinfo, prog_data);
   bool cross_thread_supported = devinfo->verx10 >= 75;

   /* The thread ID should be stored in the last param dword */
   assert(subgroup_id_index == -1 ||
          subgroup_id_index == (int)prog_data->nr_params - 1);

   unsigned cross_thread_dwords, per_thread_dwords;
   if (!cross_thread_supported) {
      cross_thread_dwords = 0u;
      per_thread_dwords = prog_data->nr_params;
   } else if (subgroup_id_index >= 0) {
      /* Fill all but the last register with cross-thread payload */
      cross_thread_dwords = 8 * (subgroup_id_index / 8);
      per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
      assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
   } else {
      /* Fill all data using cross-thread payload */
      cross_thread_dwords = prog_data->nr_params;
      per_thread_dwords = 0u;
   }

   fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
   fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);

   assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
          cs_prog_data->push.per_thread.size == 0);
   assert(cs_prog_data->push.cross_thread.dwords +
          cs_prog_data->push.per_thread.dwords ==
          prog_data->nr_params);
}

static bool
filter_simd(const nir_instr *instr, const void * /* options */)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
   case nir_intrinsic_load_simd_width_intel:
   case nir_intrinsic_load_subgroup_id:
      return true;

   default:
      return false;
   }
}

static nir_def *
lower_simd(nir_builder *b, nir_instr *instr, void *options)
{
   uintptr_t simd_width = (uintptr_t)options;

   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
   case nir_intrinsic_load_simd_width_intel:
      return nir_imm_int(b, simd_width);

   case nir_intrinsic_load_subgroup_id:
      /* If the whole workgroup fits in one thread, we can lower subgroup_id
       * to a constant zero.
       */
      if (!b->shader->info.workgroup_size_variable) {
         unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
                                         b->shader->info.workgroup_size[1] *
                                         b->shader->info.workgroup_size[2];
         if (local_workgroup_size <= simd_width)
            return nir_imm_int(b, 0);
      }
      return NULL;

   default:
      return NULL;
   }
}

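/* Lower SIMD-width-dependent intrinsics for a fixed dispatch width:
 * load_simd_width_intel becomes an immediate, and load_subgroup_id becomes
 * zero when a fixed-size workgroup fits entirely in one thread (e.g. a
 * 16x1x1 workgroup compiled at SIMD16).
 */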
bool
elk_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
{
   return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
                                        (void *)(uintptr_t)dispatch_width);
}

const unsigned *
elk_compile_cs(const struct elk_compiler *compiler,
               struct elk_compile_cs_params *params)
{
   const nir_shader *nir = params->base.nir;
   const struct elk_cs_prog_key *key = params->key;
   struct elk_cs_prog_data *prog_data = params->prog_data;

   const bool debug_enabled =
      elk_should_print_shader(nir, params->base.debug_flag ?
                                   params->base.debug_flag : DEBUG_CS);

   prog_data->base.stage = MESA_SHADER_COMPUTE;
   prog_data->base.total_shared = nir->info.shared_size;
   prog_data->base.total_scratch = 0;

   if (!nir->info.workgroup_size_variable) {
      prog_data->local_size[0] = nir->info.workgroup_size[0];
      prog_data->local_size[1] = nir->info.workgroup_size[1];
      prog_data->local_size[2] = nir->info.workgroup_size[2];
   }

   elk_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = prog_data,
      .required_width = elk_required_dispatch_width(&nir->info),
   };

   std::unique_ptr<elk_fs_visitor> v[3];

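   /* Try each of SIMD8, SIMD16 and SIMD32 in turn (index simd compiles at
    * width 8 << simd); the shared simd_state tracks which widths compiled
    * and whether they spilled, and elk_simd_select() picks the variant to
    * use below.
    */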
   for (unsigned simd = 0; simd < 3; simd++) {
      if (!elk_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8u << simd;

      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
      elk_nir_apply_key(shader, compiler, &key->base,
                        dispatch_width);

      NIR_PASS(_, shader, elk_nir_lower_simd, dispatch_width);

      /* Clean up after the local index and ID calculations. */
      NIR_PASS(_, shader, nir_opt_constant_folding);
      NIR_PASS(_, shader, nir_opt_dce);

      elk_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<elk_fs_visitor>(compiler, &params->base,
                                                 &key->base,
                                                 &prog_data->base,
                                                 shader, dispatch_width,
                                                 params->base.stats != NULL,
                                                 debug_enabled);

      const int first = elk_simd_first_compiled(simd_state);
      if (first >= 0)
         v[simd]->import_uniforms(v[first].get());

      const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;

      if (v[simd]->run_cs(allow_spilling)) {
         cs_fill_push_const_info(compiler->devinfo, prog_data);

         elk_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
      } else {
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
         if (simd > 0) {
            elk_shader_perf_log(compiler, params->base.log_data,
                                "SIMD%u shader failed to compile: %s\n",
                                dispatch_width, v[simd]->fail_msg);
         }
      }
   }

   const int selected_simd = elk_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   assert(selected_simd < 3);
   elk_fs_visitor *selected = v[selected_simd].get();

   if (!nir->info.workgroup_size_variable)
      prog_data->prog_mask = 1 << selected_simd;

   elk_fs_generator g(compiler, &params->base, &prog_data->base,
                      selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
   if (unlikely(debug_enabled)) {
      char *name = ralloc_asprintf(params->base.mem_ctx,
                                   "%s compute shader %s",
                                   nir->info.label ?
                                      nir->info.label : "unnamed",
                                   nir->info.name);
      g.enable_debug(name);
   }

   uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);

   struct elk_compile_stats *stats = params->base.stats;
   for (unsigned simd = 0; simd < 3; simd++) {
      if (prog_data->prog_mask & (1u << simd)) {
         assert(v[simd]);
         prog_data->prog_offset[simd] =
            g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
                            v[simd]->performance_analysis.require(), stats);
         if (stats)
            stats->max_dispatch_width = max_dispatch_width;
         stats = stats ? stats + 1 : NULL;
         max_dispatch_width = 8u << simd;
      }
   }

   g.add_const_data(nir->constant_data, nir->constant_data_size);

   return g.get_assembly();
}

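/**
 * Compute the dispatch parameters for a compute workgroup: the SIMD width
 * selected for the workgroup size, the number of hardware threads needed,
 * and the execution mask of the last, possibly partial, thread (right_mask).
 * For example, if SIMD8 is selected for a 20-invocation workgroup, three
 * threads are dispatched and the right mask is 0xf.
 */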
struct intel_cs_dispatch_info
elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct elk_cs_prog_data *prog_data,
                         const unsigned *override_local_size)
{
   struct intel_cs_dispatch_info info = {};

   const unsigned *sizes =
      override_local_size ? override_local_size :
                            prog_data->local_size;

   const int simd = elk_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
   assert(simd >= 0 && simd < 3);

   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size = 8u << simd;
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

   const uint32_t remainder = info.group_size & (info.simd_size - 1);
   if (remainder > 0)
      info.right_mask = ~0u >> (32 - remainder);
   else
      info.right_mask = ~0u >> (32 - info.simd_size);

   return info;
}

uint64_t
elk_bsr(const struct intel_device_info *devinfo,
        uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
{
   assert(offset % 64 == 0);
   assert(simd_size == 8 || simd_size == 16);
   assert(local_arg_offset % 8 == 0);

   return offset |
          SET_BITS(simd_size == 8, 4, 4) |
          SET_BITS(local_arg_offset / 8, 2, 0);
}

/**
 * Test the dispatch mask packing assumptions of
 * elk_stage_has_packed_dispatch(). Call this from e.g. the top of
 * elk_fs_visitor::emit_nir_code() to cause a GPU hang if any shader
 * invocation is executed with an unexpected dispatch mask.
 */
static UNUSED void
elk_fs_test_dispatch_packing(const fs_builder &bld)
{
   const elk_fs_visitor *shader = static_cast<const elk_fs_visitor *>(bld.shader);
   const gl_shader_stage stage = shader->stage;
   const bool uses_vmask =
      stage == MESA_SHADER_FRAGMENT &&
      elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;

   if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
                                     shader->stage_prog_data)) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);
      const elk_fs_reg mask = uses_vmask ? elk_vmask_reg() : elk_dmask_reg();

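      /* Compute tmp = mask & (mask + 1), which is zero iff mask has the
       * form 2^n - 1, i.e. the enabled channels are densely packed starting
       * from channel 0.
       */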
      ubld.ADD(tmp, mask, elk_imm_ud(1));
      ubld.AND(tmp, mask, tmp);

      /* This will loop forever if the dispatch mask doesn't have the expected
       * form '2^n-1', in which case tmp will be non-zero.
       */
      bld.emit(ELK_OPCODE_DO);
      bld.CMP(bld.null_reg_ud(), tmp, elk_imm_ud(0), ELK_CONDITIONAL_NZ);
      set_predicate(ELK_PREDICATE_NORMAL, bld.emit(ELK_OPCODE_WHILE));
   }
}

unsigned
elk_fs_visitor::workgroup_size() const
{
   assert(gl_shader_stage_uses_workgroup(stage));
   const struct elk_cs_prog_data *cs = elk_cs_prog_data(prog_data);
   return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
}

bool
elk_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
{
   return INTEL_DEBUG(debug_flag) &&
          (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
}

namespace elk {
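/* Fetch a value passed in the thread payload as whole GRFs.  regs[] holds one
 * GRF number per SIMD16 half of the dispatch: for SIMD8/SIMD16 the register
 * is used directly, while for SIMD32 the two halves are stitched together
 * into a single contiguous VGRF with LOAD_PAYLOAD.
 */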
elk_fs_reg
fetch_payload_reg(const elk::fs_builder &bld, uint8_t regs[2],
                  elk_reg_type type, unsigned n)
{
   if (!regs[0])
      return elk_fs_reg();

   if (bld.dispatch_width() > 16) {
      const elk_fs_reg tmp = bld.vgrf(type, n);
      const elk::fs_builder hbld = bld.exec_all().group(16, 0);
      const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
      elk_fs_reg *const components = new elk_fs_reg[m * n];

      for (unsigned c = 0; c < n; c++) {
         for (unsigned g = 0; g < m; g++)
            components[c * m + g] =
               offset(retype(elk_vec8_grf(regs[g], 0), type), hbld, c);
      }

      hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);

      delete[] components;
      return tmp;

   } else {
      return elk_fs_reg(retype(elk_vec8_grf(regs[0], 0), type));
   }
}

elk_fs_reg
fetch_barycentric_reg(const elk::fs_builder &bld, uint8_t regs[2])
{
   if (!regs[0])
      return elk_fs_reg();

   const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
   const elk::fs_builder hbld = bld.exec_all().group(8, 0);
   const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
   elk_fs_reg *const components = new elk_fs_reg[2 * m];

   for (unsigned c = 0; c < 2; c++) {
      for (unsigned g = 0; g < m; g++)
         components[c * m + g] = offset(elk_vec8_grf(regs[g / 2], 0),
                                        hbld, c + 2 * (g % 2));
   }

   hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);

   delete[] components;
   return tmp;
}

void
check_dynamic_msaa_flag(const fs_builder &bld,
                        const struct elk_wm_prog_data *wm_prog_data,
                        enum intel_msaa_flags flag)
{
   elk_fs_inst *inst = bld.AND(bld.null_reg_ud(),
                               dynamic_msaa_flags(wm_prog_data),
                               elk_imm_ud(flag));
   inst->conditional_mod = ELK_CONDITIONAL_NZ;
}
}