xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/elk/elk_fs.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file elk_fs.cpp
25  *
26  * This file drives the GLSL IR -> LIR translation, contains the
27  * optimizations on the LIR, and drives the generation of native code
28  * from the LIR.
29  */
30 
31 #include "elk_eu.h"
32 #include "elk_fs.h"
33 #include "elk_fs_builder.h"
34 #include "elk_fs_live_variables.h"
35 #include "elk_nir.h"
36 #include "elk_vec4_gs_visitor.h"
37 #include "elk_cfg.h"
38 #include "elk_dead_control_flow.h"
39 #include "elk_private.h"
40 #include "../intel_nir.h"
41 #include "shader_enums.h"
42 #include "dev/intel_debug.h"
43 #include "dev/intel_wa.h"
44 #include "compiler/glsl_types.h"
45 #include "compiler/nir/nir_builder.h"
46 #include "util/u_math.h"
47 
48 #include <memory>
49 
50 using namespace elk;
51 
52 static unsigned get_lowered_simd_width(const elk_fs_visitor *shader,
53                                        const elk_fs_inst *inst);
54 
55 void
56 elk_fs_inst::init(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
57               const elk_fs_reg *src, unsigned sources)
58 {
59    memset((void*)this, 0, sizeof(*this));
60 
61    this->src = new elk_fs_reg[MAX2(sources, 3)];
62    for (unsigned i = 0; i < sources; i++)
63       this->src[i] = src[i];
64 
65    this->opcode = opcode;
66    this->dst = dst;
67    this->sources = sources;
68    this->exec_size = exec_size;
69    this->base_mrf = -1;
70 
71    assert(dst.file != IMM && dst.file != UNIFORM);
72 
73    assert(this->exec_size != 0);
74 
75    this->conditional_mod = ELK_CONDITIONAL_NONE;
76 
77    /* This will be the case for almost all instructions. */
78    switch (dst.file) {
79    case VGRF:
80    case ARF:
81    case FIXED_GRF:
82    case MRF:
83    case ATTR:
84       this->size_written = dst.component_size(exec_size);
85       break;
86    case BAD_FILE:
87       this->size_written = 0;
88       break;
89    case IMM:
90    case UNIFORM:
91       unreachable("Invalid destination register file");
92    }
93 
94    this->writes_accumulator = false;
95 }
96 
97 elk_fs_inst::elk_fs_inst()
98 {
99    init(ELK_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101 
102 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size)
103 {
104    init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106 
107 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst)
108 {
109    init(opcode, exec_size, dst, NULL, 0);
110 }
111 
112 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
113                  const elk_fs_reg &src0)
114 {
115    const elk_fs_reg src[1] = { src0 };
116    init(opcode, exec_size, dst, src, 1);
117 }
118 
119 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
120                  const elk_fs_reg &src0, const elk_fs_reg &src1)
121 {
122    const elk_fs_reg src[2] = { src0, src1 };
123    init(opcode, exec_size, dst, src, 2);
124 }
125 
126 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
127                  const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2)
128 {
129    const elk_fs_reg src[3] = { src0, src1, src2 };
130    init(opcode, exec_size, dst, src, 3);
131 }
132 
133 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
134                  const elk_fs_reg src[], unsigned sources)
135 {
136    init(opcode, exec_width, dst, src, sources);
137 }
138 
139 elk_fs_inst::elk_fs_inst(const elk_fs_inst &that)
140 {
141    memcpy((void*)this, &that, sizeof(that));
142 
143    this->src = new elk_fs_reg[MAX2(that.sources, 3)];
144 
145    for (unsigned i = 0; i < that.sources; i++)
146       this->src[i] = that.src[i];
147 }
148 
149 elk_fs_inst::~elk_fs_inst()
150 {
151    delete[] this->src;
152 }
153 
154 void
155 elk_fs_inst::resize_sources(uint8_t num_sources)
156 {
157    if (this->sources != num_sources) {
158       elk_fs_reg *src = new elk_fs_reg[MAX2(num_sources, 3)];
159 
160       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161          src[i] = this->src[i];
162 
163       delete[] this->src;
164       this->src = src;
165       this->sources = num_sources;
166    }
167 }
168 
169 void
170 elk_fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171                                        const elk_fs_reg &dst,
172                                        const elk_fs_reg &surface,
173                                        const elk_fs_reg &surface_handle,
174                                        const elk_fs_reg &varying_offset,
175                                        uint32_t const_offset,
176                                        uint8_t alignment,
177                                        unsigned components)
178 {
179    assert(components <= 4);
180 
181    /* We have our constant surface use a pitch of 4 bytes, so our index can
182     * be any component of a vector, and then we load 4 contiguous
183     * components starting from that.  TODO: Support loading fewer than 4.
184     */
185    elk_fs_reg total_offset = vgrf(glsl_uint_type());
186    bld.ADD(total_offset, varying_offset, elk_imm_ud(const_offset));
187 
188    /* The pull load message will load a vec4 (16 bytes). If we are loading
189     * a double this means we are only loading 2 elements worth of data.
190     * We also want to use a 32-bit data type for the dst of the load operation
191     * so other parts of the driver don't get confused about the size of the
192     * result.
193     */
194    elk_fs_reg vec4_result = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
195 
196    elk_fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
197    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
198    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
199    srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
200    srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = elk_imm_ud(alignment);
201 
202    elk_fs_inst *inst = bld.emit(ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
203                             vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
204    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
205 
206    elk_shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
207 }
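
/* Illustrative walk-through (hypothetical values): with components == 2,
 * const_offset == 8 and a varying_offset that holds 16 at run time, the ADD
 * above produces a total_offset of 24, the logical pull load still reads a
 * full 16-byte vec4 into vec4_result, and the final shuffle copies just the
 * first two 32-bit components into dst.
 */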
208 
209 /**
210  * A helper for MOV generation for fixing up broken hardware SEND dependency
211  * handling.
212  */
213 void
214 elk_fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
215 {
216    /* The caller always wants an uncompressed instruction, both to emit the
217     * minimal extra dependencies and to avoid having to align its regs to 2.
218     */
219    const fs_builder ubld = bld.annotate("send dependency resolve")
220                               .quarter(0);
221 
222    ubld.MOV(ubld.null_reg_f(), elk_fs_reg(VGRF, grf, ELK_REGISTER_TYPE_F));
223 }
224 
225 bool
226 elk_fs_inst::is_send_from_grf() const
227 {
228    switch (opcode) {
229    case ELK_SHADER_OPCODE_SEND:
230    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
231    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
232    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
233    case ELK_SHADER_OPCODE_INTERLOCK:
234    case ELK_SHADER_OPCODE_MEMORY_FENCE:
235    case ELK_SHADER_OPCODE_BARRIER:
236       return true;
237    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238       return src[1].file == VGRF;
239    case ELK_FS_OPCODE_FB_WRITE:
240       return src[0].file == VGRF;
241    default:
242       return false;
243    }
244 }
245 
246 bool
247 elk_fs_inst::is_control_source(unsigned arg) const
248 {
249    switch (opcode) {
250    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
251    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
252       return arg == 0;
253 
254    case ELK_SHADER_OPCODE_BROADCAST:
255    case ELK_SHADER_OPCODE_SHUFFLE:
256    case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
257    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
258    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
259    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
260       return arg == 1;
261 
262    case ELK_SHADER_OPCODE_MOV_INDIRECT:
263    case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
264    case ELK_SHADER_OPCODE_TEX:
265    case ELK_FS_OPCODE_TXB:
266    case ELK_SHADER_OPCODE_TXD:
267    case ELK_SHADER_OPCODE_TXF:
268    case ELK_SHADER_OPCODE_TXF_LZ:
269    case ELK_SHADER_OPCODE_TXF_CMS:
270    case ELK_SHADER_OPCODE_TXF_CMS_W:
271    case ELK_SHADER_OPCODE_TXF_UMS:
272    case ELK_SHADER_OPCODE_TXF_MCS:
273    case ELK_SHADER_OPCODE_TXL:
274    case ELK_SHADER_OPCODE_TXL_LZ:
275    case ELK_SHADER_OPCODE_TXS:
276    case ELK_SHADER_OPCODE_LOD:
277    case ELK_SHADER_OPCODE_TG4:
278    case ELK_SHADER_OPCODE_TG4_OFFSET:
279    case ELK_SHADER_OPCODE_SAMPLEINFO:
280       return arg == 1 || arg == 2;
281 
282    case ELK_SHADER_OPCODE_SEND:
283       return arg == 0;
284 
285    default:
286       return false;
287    }
288 }
289 
290 bool
291 elk_fs_inst::is_payload(unsigned arg) const
292 {
293    switch (opcode) {
294    case ELK_FS_OPCODE_FB_WRITE:
295    case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
296    case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
297    case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
298    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
299    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
300    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
301    case ELK_SHADER_OPCODE_INTERLOCK:
302    case ELK_SHADER_OPCODE_MEMORY_FENCE:
303    case ELK_SHADER_OPCODE_BARRIER:
304    case ELK_SHADER_OPCODE_TEX:
305    case ELK_FS_OPCODE_TXB:
306    case ELK_SHADER_OPCODE_TXD:
307    case ELK_SHADER_OPCODE_TXF:
308    case ELK_SHADER_OPCODE_TXF_LZ:
309    case ELK_SHADER_OPCODE_TXF_CMS:
310    case ELK_SHADER_OPCODE_TXF_CMS_W:
311    case ELK_SHADER_OPCODE_TXF_UMS:
312    case ELK_SHADER_OPCODE_TXF_MCS:
313    case ELK_SHADER_OPCODE_TXL:
314    case ELK_SHADER_OPCODE_TXL_LZ:
315    case ELK_SHADER_OPCODE_TXS:
316    case ELK_SHADER_OPCODE_LOD:
317    case ELK_SHADER_OPCODE_TG4:
318    case ELK_SHADER_OPCODE_TG4_OFFSET:
319    case ELK_SHADER_OPCODE_SAMPLEINFO:
320       return arg == 0;
321 
322    case ELK_SHADER_OPCODE_SEND:
323       return arg == 1;
324 
325    default:
326       return false;
327    }
328 }
329 
330 /**
331  * Returns true if this instruction's sources and destinations cannot
332  * safely be the same register.
333  *
334  * In most cases, a register can be written over safely by the same
335  * instruction that is its last use.  For a single instruction, the
336  * sources are dereferenced before writing of the destination starts
337  * (naturally).
338  *
339  * However, there are a few cases where this can be problematic:
340  *
341  * - Virtual opcodes that translate to multiple instructions in the
342  *   code generator: if src == dst and one instruction writes the
343  *   destination before a later instruction reads the source, then
344  *   src will have been clobbered.
345  *
346  * - SIMD16 compressed instructions with certain regioning (see below).
347  *
348  * The register allocator uses this information to set up conflicts between
349  * GRF sources and the destination.
350  */
351 bool
352 elk_fs_inst::has_source_and_destination_hazard() const
353 {
354    switch (opcode) {
355    case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
356       /* Multiple partial writes to the destination */
357       return true;
358    case ELK_SHADER_OPCODE_SHUFFLE:
359       /* This instruction returns an arbitrary channel from the source and
360        * gets split into smaller instructions in the generator.  It's possible
361        * that one of the instructions will read from a channel corresponding
362        * to an earlier instruction.
363        */
364    case ELK_SHADER_OPCODE_SEL_EXEC:
365       /* This is implemented as
366        *
367        * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
368        * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
369        *
370        * Because the source is only read in the second instruction, the first
371        * may stomp all over it.
372        */
373       return true;
374    case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
375       switch (src[1].ud) {
376       case ELK_SWIZZLE_XXXX:
377       case ELK_SWIZZLE_YYYY:
378       case ELK_SWIZZLE_ZZZZ:
379       case ELK_SWIZZLE_WWWW:
380       case ELK_SWIZZLE_XXZZ:
381       case ELK_SWIZZLE_YYWW:
382       case ELK_SWIZZLE_XYXY:
383       case ELK_SWIZZLE_ZWZW:
384          /* These can be implemented as a single Align1 region on all
385           * platforms, so there's never a hazard between source and
386           * destination.  C.f. elk_fs_generator::generate_quad_swizzle().
387           */
388          return false;
389       default:
390          return !is_uniform(src[0]);
391       }
392    default:
393       /* The SIMD16 compressed instruction
394        *
395        * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
396        *
397        * is actually decoded in hardware as:
398        *
399        * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
400        * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
401        *
402        * Which is safe.  However, if we have uniform accesses
403        * happening, we get into trouble:
404        *
405        * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
406        * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
407        *
408        * Now our destination for the first instruction overwrote the
409        * second instruction's src0, and we get garbage for those 8
410        * pixels.  There's a similar issue for the pre-gfx6
411        * pixel_x/pixel_y, which are registers of 16-bit values and thus
412        * would get stomped by the first decode as well.
413        */
414       if (exec_size == 16) {
415          for (int i = 0; i < sources; i++) {
416             if (src[i].file == VGRF && (src[i].stride == 0 ||
417                                         src[i].type == ELK_REGISTER_TYPE_UW ||
418                                         src[i].type == ELK_REGISTER_TYPE_W ||
419                                         src[i].type == ELK_REGISTER_TYPE_UB ||
420                                         src[i].type == ELK_REGISTER_TYPE_B)) {
421                return true;
422             }
423          }
424       }
425       return false;
426    }
427 }
428 
429 bool
430 elk_fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
431 {
432    if (devinfo->ver == 6 && is_math())
433       return false;
434 
435    if (is_send_from_grf())
436       return false;
437 
438    return elk_backend_instruction::can_do_source_mods();
439 }
440 
441 bool
442 elk_fs_inst::can_do_cmod()
443 {
444    if (!elk_backend_instruction::can_do_cmod())
445       return false;
446 
447    /* The accumulator result appears to get used for the conditional modifier
448     * generation.  When negating a UD value, there is a 33rd bit generated for
449     * the sign in the accumulator value, so now you can't check, for example,
450     * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
451     */
452    for (unsigned i = 0; i < sources; i++) {
453       if (elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
454          return false;
455    }
456 
457    return true;
458 }
459 
460 bool
461 elk_fs_inst::can_change_types() const
462 {
463    return dst.type == src[0].type &&
464           !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
465           (opcode == ELK_OPCODE_MOV ||
466            (opcode == ELK_OPCODE_SEL &&
467             dst.type == src[1].type &&
468             predicate != ELK_PREDICATE_NONE &&
469             !src[1].abs && !src[1].negate && src[1].file != ATTR));
470 }
471 
472 void
473 elk_fs_reg::init()
474 {
475    memset((void*)this, 0, sizeof(*this));
476    type = ELK_REGISTER_TYPE_UD;
477    stride = 1;
478 }
479 
480 /** Generic unset register constructor. */
481 elk_fs_reg::elk_fs_reg()
482 {
483    init();
484    this->file = BAD_FILE;
485 }
486 
487 elk_fs_reg::elk_fs_reg(struct ::elk_reg reg) :
488    elk_backend_reg(reg)
489 {
490    this->offset = 0;
491    this->stride = 1;
492    if (this->file == IMM &&
493        (this->type != ELK_REGISTER_TYPE_V &&
494         this->type != ELK_REGISTER_TYPE_UV &&
495         this->type != ELK_REGISTER_TYPE_VF)) {
496       this->stride = 0;
497    }
498 }
499 
500 bool
501 elk_fs_reg::equals(const elk_fs_reg &r) const
502 {
503    return (this->elk_backend_reg::equals(r) &&
504            stride == r.stride);
505 }
506 
507 bool
508 elk_fs_reg::negative_equals(const elk_fs_reg &r) const
509 {
510    return (this->elk_backend_reg::negative_equals(r) &&
511            stride == r.stride);
512 }
513 
514 bool
515 elk_fs_reg::is_contiguous() const
516 {
517    switch (file) {
518    case ARF:
519    case FIXED_GRF:
520       return hstride == ELK_HORIZONTAL_STRIDE_1 &&
521              vstride == width + hstride;
522    case MRF:
523    case VGRF:
524    case ATTR:
525       return stride == 1;
526    case UNIFORM:
527    case IMM:
528    case BAD_FILE:
529       return true;
530    }
531 
532    unreachable("Invalid register file");
533 }
534 
535 unsigned
536 elk_fs_reg::component_size(unsigned width) const
537 {
538    if (file == ARF || file == FIXED_GRF) {
539       const unsigned w = MIN2(width, 1u << this->width);
540       const unsigned h = width >> this->width;
541       const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
542       const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
543       assert(w > 0);
544       return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
545    } else {
546       return MAX2(width * stride, 1) * type_sz(type);
547    }
548 }
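
/* Worked example: for a FIXED_GRF float source with the usual <8;8,1>:F
 * region (hardware encodings width == 3, vstride == 4, hstride == 1, and
 * type_sz == 4), component_size(16) computes w == 8, h == 2, vs == 8 and
 * hs == 1, giving ((2 - 1) * 8 + (8 - 1) * 1 + 1) * 4 == 64 bytes, i.e. two
 * full GRFs.
 */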
549 
550 void
551 elk_fs_visitor::vfail(const char *format, va_list va)
552 {
553    char *msg;
554 
555    if (failed)
556       return;
557 
558    failed = true;
559 
560    msg = ralloc_vasprintf(mem_ctx, format, va);
561    msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
562          dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
563 
564    this->fail_msg = msg;
565 
566    if (unlikely(debug_enabled)) {
567       fprintf(stderr, "%s",  msg);
568    }
569 }
570 
571 void
572 elk_fs_visitor::fail(const char *format, ...)
573 {
574    va_list va;
575 
576    va_start(va, format);
577    vfail(format, va);
578    va_end(va);
579 }
580 
581 /**
582  * Mark this program as impossible to compile with dispatch width greater
583  * than n.
584  *
585  * During the SIMD8 compile (which happens first), we can detect and flag
586  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
587  * SIMD16+ compile altogether.
588  *
589  * During a compile of dispatch width greater than n (if one happens anyway),
590  * this just calls fail().
591  */
592 void
593 elk_fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
594 {
595    if (dispatch_width > n) {
596       fail("%s", msg);
597    } else {
598       max_dispatch_width = MIN2(max_dispatch_width, n);
599       elk_shader_perf_log(compiler, log_data,
600                           "Shader dispatch width limited to SIMD%d: %s\n",
601                           n, msg);
602    }
603 }
604 
605 /**
606  * Returns true if the instruction has a flag that means it won't
607  * update an entire destination register.
608  *
609  * For example, dead code elimination and live variable analysis want to know
610  * when a write to a variable screens off any preceding values that were in
611  * it.
612  */
613 bool
614 elk_fs_inst::is_partial_write() const
615 {
616    if (this->predicate && !this->predicate_trivial &&
617        this->opcode != ELK_OPCODE_SEL)
618       return true;
619 
620    if (this->dst.offset % REG_SIZE != 0)
621       return true;
622 
623    /* SEND instructions always write whole registers */
624    if (this->opcode == ELK_SHADER_OPCODE_SEND)
625       return false;
626 
627    /* Special case UNDEF since a lot of places in the backend do things like this:
628     *
629     *  fs_builder ubld = bld.exec_all().group(1, 0);
630     *  elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
631     *  ubld.UNDEF(tmp); <- partial write, even though the whole register is concerned
632     */
633    if (this->opcode == ELK_SHADER_OPCODE_UNDEF) {
634       assert(this->dst.is_contiguous());
635       return this->size_written < 32;
636    }
637 
638    return this->exec_size * type_sz(this->dst.type) < 32 ||
639           !this->dst.is_contiguous();
640 }
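
/* For illustration: an unpredicated SIMD8 MOV to a register-aligned,
 * contiguous D-type destination writes 8 * 4 == 32 bytes and is a full
 * register write, while the same MOV to a W-type destination writes only
 * 16 bytes and therefore counts as a partial write.
 */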
641 
642 unsigned
643 elk_fs_inst::components_read(unsigned i) const
644 {
645    /* Return zero if the source is not present. */
646    if (src[i].file == BAD_FILE)
647       return 0;
648 
649    switch (opcode) {
650    case ELK_FS_OPCODE_LINTERP:
651       if (i == 0)
652          return 2;
653       else
654          return 1;
655 
656    case ELK_FS_OPCODE_PIXEL_X:
657    case ELK_FS_OPCODE_PIXEL_Y:
658       assert(i < 2);
659       if (i == 0)
660          return 2;
661       else
662          return 1;
663 
664    case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
665       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
666       /* First/second FB write color. */
667       if (i < 2)
668          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
669       else
670          return 1;
671 
672    case ELK_SHADER_OPCODE_TEX_LOGICAL:
673    case ELK_SHADER_OPCODE_TXD_LOGICAL:
674    case ELK_SHADER_OPCODE_TXF_LOGICAL:
675    case ELK_SHADER_OPCODE_TXL_LOGICAL:
676    case ELK_SHADER_OPCODE_TXS_LOGICAL:
677    case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
678    case ELK_FS_OPCODE_TXB_LOGICAL:
679    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
680    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
681    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
682    case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
683    case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
684    case ELK_SHADER_OPCODE_LOD_LOGICAL:
685    case ELK_SHADER_OPCODE_TG4_LOGICAL:
686    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
687    case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
688       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
689              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
690              src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
691       /* Texture coordinates. */
692       if (i == TEX_LOGICAL_SRC_COORDINATE)
693          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
694       /* Texture derivatives. */
695       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
696                opcode == ELK_SHADER_OPCODE_TXD_LOGICAL)
697          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
698       /* Texture offset. */
699       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
700          return 2;
701       /* MCS */
702       else if (i == TEX_LOGICAL_SRC_MCS) {
703          if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL)
704             return 2;
705          else if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
706             return 4;
707          else
708             return 1;
709       } else
710          return 1;
711 
712    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
713    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
714       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
715       /* Surface coordinates. */
716       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
717          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
718       /* Surface operation source (ignored for reads). */
719       else if (i == SURFACE_LOGICAL_SRC_DATA)
720          return 0;
721       else
722          return 1;
723 
724    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
725    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
726       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
727              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
728       /* Surface coordinates. */
729       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
730          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
731       /* Surface operation source. */
732       else if (i == SURFACE_LOGICAL_SRC_DATA)
733          return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
734       else
735          return 1;
736 
737    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
738    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
739    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
740       assert(src[A64_LOGICAL_ARG].file == IMM);
741       return 1;
742 
743    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
744       assert(src[A64_LOGICAL_ARG].file == IMM);
745       if (i == A64_LOGICAL_SRC) { /* data to write */
746          const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
747          assert(comps > 0);
748          return comps;
749       } else {
750          return 1;
751       }
752 
753    case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
754       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
755       return 1;
756 
757    case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
758       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
759       if (i == SURFACE_LOGICAL_SRC_DATA) {
760          const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
761          assert(comps > 0);
762          return comps;
763       } else {
764          return 1;
765       }
766 
767    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
768       assert(src[A64_LOGICAL_ARG].file == IMM);
769       return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
770 
771    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
772       assert(src[A64_LOGICAL_ARG].file == IMM);
773       return i == A64_LOGICAL_SRC ?
774              lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
775 
776    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
777    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
778       /* Scattered logical opcodes use the following params:
779        * src[0] Surface coordinates
780        * src[1] Surface operation source (ignored for reads)
781        * src[2] Surface
782        * src[3] IMM with always 1 dimension.
783        * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
784        */
785       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
786              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
787       return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
788 
789    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
790    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
791       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
792              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
793       return 1;
794 
795    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
796    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
797       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
798              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
799       const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
800       /* Surface coordinates. */
801       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
802          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
803       /* Surface operation source. */
804       else if (i == SURFACE_LOGICAL_SRC_DATA)
805          return lsc_op_num_data_values(op);
806       else
807          return 1;
808    }
809    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
810       return (i == 0 ? 2 : 1);
811 
812    case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
813       assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
814 
815       if (i == URB_LOGICAL_SRC_DATA)
816          return src[URB_LOGICAL_SRC_COMPONENTS].ud;
817       else
818          return 1;
819 
820    default:
821       return 1;
822    }
823 }
824 
825 unsigned
826 elk_fs_inst::size_read(int arg) const
827 {
828    switch (opcode) {
829    case ELK_SHADER_OPCODE_SEND:
830       if (arg == 1) {
831          return mlen * REG_SIZE;
832       }
833       break;
834 
835    case ELK_FS_OPCODE_FB_WRITE:
836    case ELK_FS_OPCODE_REP_FB_WRITE:
837       if (arg == 0) {
838          if (base_mrf >= 0)
839             return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
840          else
841             return mlen * REG_SIZE;
842       }
843       break;
844 
845    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847       if (arg == 0)
848          return mlen * REG_SIZE;
849       break;
850 
851    case ELK_FS_OPCODE_SET_SAMPLE_ID:
852       if (arg == 1)
853          return 1;
854       break;
855 
856    case ELK_FS_OPCODE_LINTERP:
857       if (arg == 1)
858          return 16;
859       break;
860 
861    case ELK_SHADER_OPCODE_LOAD_PAYLOAD:
862       if (arg < this->header_size)
863          return retype(src[arg], ELK_REGISTER_TYPE_UD).component_size(8);
864       break;
865 
866    case ELK_CS_OPCODE_CS_TERMINATE:
867    case ELK_SHADER_OPCODE_BARRIER:
868       return REG_SIZE;
869 
870    case ELK_SHADER_OPCODE_MOV_INDIRECT:
871       if (arg == 0) {
872          assert(src[2].file == IMM);
873          return src[2].ud;
874       }
875       break;
876 
877    case ELK_SHADER_OPCODE_TEX:
878    case ELK_FS_OPCODE_TXB:
879    case ELK_SHADER_OPCODE_TXD:
880    case ELK_SHADER_OPCODE_TXF:
881    case ELK_SHADER_OPCODE_TXF_LZ:
882    case ELK_SHADER_OPCODE_TXF_CMS:
883    case ELK_SHADER_OPCODE_TXF_CMS_W:
884    case ELK_SHADER_OPCODE_TXF_UMS:
885    case ELK_SHADER_OPCODE_TXF_MCS:
886    case ELK_SHADER_OPCODE_TXL:
887    case ELK_SHADER_OPCODE_TXL_LZ:
888    case ELK_SHADER_OPCODE_TXS:
889    case ELK_SHADER_OPCODE_LOD:
890    case ELK_SHADER_OPCODE_TG4:
891    case ELK_SHADER_OPCODE_TG4_OFFSET:
892    case ELK_SHADER_OPCODE_SAMPLEINFO:
893       if (arg == 0 && src[0].file == VGRF)
894          return mlen * REG_SIZE;
895       break;
896 
897    default:
898       break;
899    }
900 
901    switch (src[arg].file) {
902    case UNIFORM:
903    case IMM:
904       return components_read(arg) * type_sz(src[arg].type);
905    case BAD_FILE:
906    case ARF:
907    case FIXED_GRF:
908    case VGRF:
909    case ATTR:
910       return components_read(arg) * src[arg].component_size(exec_size);
911    case MRF:
912       unreachable("MRF registers are not allowed as sources");
913    }
914    return 0;
915 }
916 
917 namespace {
918    unsigned
919    predicate_width(const intel_device_info *devinfo, elk_predicate predicate)
920    {
921       switch (predicate) {
922       case ELK_PREDICATE_NONE:            return 1;
923       case ELK_PREDICATE_NORMAL:          return 1;
924       case ELK_PREDICATE_ALIGN1_ANY2H:    return 2;
925       case ELK_PREDICATE_ALIGN1_ALL2H:    return 2;
926       case ELK_PREDICATE_ALIGN1_ANY4H:    return 4;
927       case ELK_PREDICATE_ALIGN1_ALL4H:    return 4;
928       case ELK_PREDICATE_ALIGN1_ANY8H:    return 8;
929       case ELK_PREDICATE_ALIGN1_ALL8H:    return 8;
930       case ELK_PREDICATE_ALIGN1_ANY16H:   return 16;
931       case ELK_PREDICATE_ALIGN1_ALL16H:   return 16;
932       case ELK_PREDICATE_ALIGN1_ANY32H:   return 32;
933       case ELK_PREDICATE_ALIGN1_ALL32H:   return 32;
934       default: unreachable("Unsupported predicate");
935       }
936    }
937 
938    /* Return the subset of flag registers that an instruction could
939     * potentially read or write based on the execution controls and flag
940     * subregister number of the instruction.
941     */
942    unsigned
943    flag_mask(const elk_fs_inst *inst, unsigned width)
944    {
945       assert(util_is_power_of_two_nonzero(width));
946       const unsigned start = (inst->flag_subreg * 16 + inst->group) &
947                              ~(width - 1);
948       const unsigned end = start + ALIGN(inst->exec_size, width);
949       return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
950    }
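
   /* Worked example: a SIMD16 instruction in group 0 with flag_subreg == 1
    * and width == 16 yields start == 16 and end == 32, so the returned mask
    * is 0xc, i.e. both bytes of f0.1.
    */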
951 
952    unsigned
953    bit_mask(unsigned n)
954    {
955       return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
956    }
957 
958    unsigned
959    flag_mask(const elk_fs_reg &r, unsigned sz)
960    {
961       if (r.file == ARF) {
962          const unsigned start = (r.nr - ELK_ARF_FLAG) * 4 + r.subnr;
963          const unsigned end = start + sz;
964          return bit_mask(end) & ~bit_mask(start);
965       } else {
966          return 0;
967       }
968    }
969 }
970 
971 unsigned
972 elk_fs_inst::flags_read(const intel_device_info *devinfo) const
973 {
974    if (predicate == ELK_PREDICATE_ALIGN1_ANYV ||
975        predicate == ELK_PREDICATE_ALIGN1_ALLV) {
976       /* The vertical predication modes combine corresponding bits from
977        * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
978        */
979       const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
980       return flag_mask(this, 1) << shift | flag_mask(this, 1);
981    } else if (predicate) {
982       return flag_mask(this, predicate_width(devinfo, predicate));
983    } else {
984       unsigned mask = 0;
985       for (int i = 0; i < sources; i++) {
986          mask |= flag_mask(src[i], size_read(i));
987       }
988       return mask;
989    }
990 }
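
/* For illustration: a SIMD8 instruction in group 0 with flag_subreg == 0 and
 * an ANYV predicate returns (0x1 << 4) | 0x1 == 0x11 on Gfx7+, i.e. the low
 * byte of f0.0 combined with the low byte of f1.0.
 */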
991 
992 unsigned
993 elk_fs_inst::flags_written(const intel_device_info *devinfo) const
994 {
995    /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
996     * using a separate cmpn and sel instruction.  This lowering occurs in
997     * fs_visitor::lower_minmax, which is called very, very late.
998     */
999    if ((conditional_mod && ((opcode != ELK_OPCODE_SEL || devinfo->ver <= 5) &&
1000                             opcode != ELK_OPCODE_CSEL &&
1001                             opcode != ELK_OPCODE_IF &&
1002                             opcode != ELK_OPCODE_WHILE)) ||
1003        opcode == ELK_FS_OPCODE_FB_WRITE) {
1004       return flag_mask(this, 1);
1005    } else if (opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1006               opcode == ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1007               opcode == ELK_FS_OPCODE_LOAD_LIVE_CHANNELS) {
1008       return flag_mask(this, 32);
1009    } else {
1010       return flag_mask(dst, size_written);
1011    }
1012 }
1013 
1014 /**
1015  * Returns how many MRFs an FS opcode will write over.
1016  *
1017  * Note that this is not the 0 or 1 implied writes in an actual gen
1018  * instruction -- the FS opcodes often generate MOVs in addition.
1019  */
1020 unsigned
1021 elk_fs_inst::implied_mrf_writes() const
1022 {
1023    if (mlen == 0)
1024       return 0;
1025 
1026    if (base_mrf == -1)
1027       return 0;
1028 
1029    switch (opcode) {
1030    case ELK_SHADER_OPCODE_RCP:
1031    case ELK_SHADER_OPCODE_RSQ:
1032    case ELK_SHADER_OPCODE_SQRT:
1033    case ELK_SHADER_OPCODE_EXP2:
1034    case ELK_SHADER_OPCODE_LOG2:
1035    case ELK_SHADER_OPCODE_SIN:
1036    case ELK_SHADER_OPCODE_COS:
1037       return 1 * exec_size / 8;
1038    case ELK_SHADER_OPCODE_POW:
1039    case ELK_SHADER_OPCODE_INT_QUOTIENT:
1040    case ELK_SHADER_OPCODE_INT_REMAINDER:
1041       return 2 * exec_size / 8;
1042    case ELK_SHADER_OPCODE_TEX:
1043    case ELK_FS_OPCODE_TXB:
1044    case ELK_SHADER_OPCODE_TXD:
1045    case ELK_SHADER_OPCODE_TXF:
1046    case ELK_SHADER_OPCODE_TXF_CMS:
1047    case ELK_SHADER_OPCODE_TXF_MCS:
1048    case ELK_SHADER_OPCODE_TG4:
1049    case ELK_SHADER_OPCODE_TG4_OFFSET:
1050    case ELK_SHADER_OPCODE_TXL:
1051    case ELK_SHADER_OPCODE_TXS:
1052    case ELK_SHADER_OPCODE_LOD:
1053    case ELK_SHADER_OPCODE_SAMPLEINFO:
1054       return 1;
1055    case ELK_FS_OPCODE_FB_WRITE:
1056    case ELK_FS_OPCODE_REP_FB_WRITE:
1057       return src[0].file == BAD_FILE ? 0 : 2;
1058    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1059    case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1060       return 1;
1061    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1062       return mlen;
1063    case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1064       return mlen;
1065    default:
1066       unreachable("not reached");
1067    }
1068 }
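
/* For example, a SIMD16 ELK_SHADER_OPCODE_POW that actually uses MRFs
 * (mlen != 0 and base_mrf != -1) implies 2 * 16 / 8 == 4 MRF writes, while a
 * SIMD8 ELK_SHADER_OPCODE_SIN implies just one.
 */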
1069 
1070 bool
1071 elk_fs_inst::has_sampler_residency() const
1072 {
1073    switch (opcode) {
1074    case ELK_SHADER_OPCODE_TEX_LOGICAL:
1075    case ELK_FS_OPCODE_TXB_LOGICAL:
1076    case ELK_SHADER_OPCODE_TXL_LOGICAL:
1077    case ELK_SHADER_OPCODE_TXD_LOGICAL:
1078    case ELK_SHADER_OPCODE_TXF_LOGICAL:
1079    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1080    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1081    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1082    case ELK_SHADER_OPCODE_TXS_LOGICAL:
1083    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1084    case ELK_SHADER_OPCODE_TG4_LOGICAL:
1085       assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1086       return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1087    default:
1088       return false;
1089    }
1090 }
1091 
1092 elk_fs_reg
1093 elk_fs_visitor::vgrf(const glsl_type *const type)
1094 {
1095    int reg_width = dispatch_width / 8;
1096    return elk_fs_reg(VGRF,
1097                  alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1098                  elk_type_for_base_type(type));
1099 }
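
/* For illustration: at SIMD16 dispatch (reg_width == 2), a vec4 float type
 * counts as 4 dword slots, so this allocates a VGRF spanning 4 * 2 == 8
 * registers.
 */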
1100 
1101 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr)
1102 {
1103    init();
1104    this->file = file;
1105    this->nr = nr;
1106    this->type = ELK_REGISTER_TYPE_F;
1107    this->stride = (file == UNIFORM ? 0 : 1);
1108 }
1109 
1110 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type)
1111 {
1112    init();
1113    this->file = file;
1114    this->nr = nr;
1115    this->type = type;
1116    this->stride = (file == UNIFORM ? 0 : 1);
1117 }
1118 
1119 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
1120  * This brings in those uniform definitions.
1121  */
1122 void
1123 elk_fs_visitor::import_uniforms(elk_fs_visitor *v)
1124 {
1125    this->push_constant_loc = v->push_constant_loc;
1126    this->uniforms = v->uniforms;
1127 }
1128 
1129 enum elk_barycentric_mode
1130 elk_barycentric_mode(nir_intrinsic_instr *intr)
1131 {
1132    const glsl_interp_mode mode =
1133       (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1134 
1135    /* Barycentric modes don't make sense for flat inputs. */
1136    assert(mode != INTERP_MODE_FLAT);
1137 
1138    unsigned bary;
1139    switch (intr->intrinsic) {
1140    case nir_intrinsic_load_barycentric_pixel:
1141    case nir_intrinsic_load_barycentric_at_offset:
1142       bary = ELK_BARYCENTRIC_PERSPECTIVE_PIXEL;
1143       break;
1144    case nir_intrinsic_load_barycentric_centroid:
1145       bary = ELK_BARYCENTRIC_PERSPECTIVE_CENTROID;
1146       break;
1147    case nir_intrinsic_load_barycentric_sample:
1148    case nir_intrinsic_load_barycentric_at_sample:
1149       bary = ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1150       break;
1151    default:
1152       unreachable("invalid intrinsic");
1153    }
1154 
1155    if (mode == INTERP_MODE_NOPERSPECTIVE)
1156       bary += 3;
1157 
1158    return (enum elk_barycentric_mode) bary;
1159 }
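
/* Note that the "bary += 3" above relies on the elk_barycentric_mode enum
 * laying out the three NONPERSPECTIVE modes immediately after their
 * PERSPECTIVE counterparts, so e.g. nir_intrinsic_load_barycentric_centroid
 * with INTERP_MODE_NOPERSPECTIVE maps to
 * ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID.
 */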
1160 
1161 /**
1162  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1163  */
1164 static enum elk_barycentric_mode
1165 centroid_to_pixel(enum elk_barycentric_mode bary)
1166 {
1167    assert(bary == ELK_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1168           bary == ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1169    return (enum elk_barycentric_mode) ((unsigned) bary - 1);
1170 }
1171 
1172 /**
1173  * Walk backwards from the end of the program looking for a URB write that
1174  * isn't in control flow, and mark it with EOT.
1175  *
1176  * Return true if successful or false if a separate EOT write is needed.
1177  */
1178 bool
1179 elk_fs_visitor::mark_last_urb_write_with_eot()
1180 {
1181    foreach_in_list_reverse(elk_fs_inst, prev, &this->instructions) {
1182       if (prev->opcode == ELK_SHADER_OPCODE_URB_WRITE_LOGICAL) {
1183          prev->eot = true;
1184 
1185          /* Delete now dead instructions. */
1186          foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1187             if (dead == prev)
1188                break;
1189             dead->remove();
1190          }
1191          return true;
1192       } else if (prev->is_control_flow() || prev->has_side_effects()) {
1193          break;
1194       }
1195    }
1196 
1197    return false;
1198 }
1199 
1200 void
1201 elk_fs_visitor::emit_gs_thread_end()
1202 {
1203    assert(stage == MESA_SHADER_GEOMETRY);
1204 
1205    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
1206 
1207    if (gs_compile->control_data_header_size_bits > 0) {
1208       emit_gs_control_data_bits(this->final_gs_vertex_count);
1209    }
1210 
1211    const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1212    elk_fs_inst *inst;
1213 
1214    if (gs_prog_data->static_vertex_count != -1) {
1215       /* Try and tag the last URB write with EOT instead of emitting a whole
1216        * separate write just to finish the thread.
1217        */
1218       if (mark_last_urb_write_with_eot())
1219          return;
1220 
1221       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1222       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1223       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(0);
1224       inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1225                        srcs, ARRAY_SIZE(srcs));
1226    } else {
1227       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1228       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1229       srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1230       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
1231       inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1232                        srcs, ARRAY_SIZE(srcs));
1233    }
1234    inst->eot = true;
1235    inst->offset = 0;
1236 }
1237 
1238 void
1239 elk_fs_visitor::assign_curb_setup()
1240 {
1241    unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1242 
1243    unsigned ubo_push_length = 0;
1244    unsigned ubo_push_start[4];
1245    for (int i = 0; i < 4; i++) {
1246       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1247       ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1248    }
1249 
1250    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1251 
1252    uint64_t used = 0;
1253 
1254    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1255    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1256       for (unsigned int i = 0; i < inst->sources; i++) {
1257 	 if (inst->src[i].file == UNIFORM) {
1258             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1259             int constant_nr;
1260             if (inst->src[i].nr >= UBO_START) {
1261                /* constant_nr is in 32-bit units, the rest are in bytes */
1262                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1263                              inst->src[i].offset / 4;
1264             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1265                constant_nr = push_constant_loc[uniform_nr];
1266             } else {
1267                /* Section 5.11 of the OpenGL 4.1 spec says:
1268                 * "Out-of-bounds reads return undefined values, which include
1269                 *  values from other variables of the active program or zero."
1270                 * Just return the first push constant.
1271                 */
1272                constant_nr = 0;
1273             }
1274 
1275             assert(constant_nr / 8 < 64);
1276             used |= BITFIELD64_BIT(constant_nr / 8);
1277 
1278 	    struct elk_reg elk_reg = elk_vec1_grf(payload().num_regs +
1279 						  constant_nr / 8,
1280 						  constant_nr % 8);
1281             elk_reg.abs = inst->src[i].abs;
1282             elk_reg.negate = inst->src[i].negate;
1283 
1284             assert(inst->src[i].stride == 0);
1285             inst->src[i] = byte_offset(
1286                retype(elk_reg, inst->src[i].type),
1287                inst->src[i].offset % 4);
1288 	 }
1289       }
1290    }
1291 
1292    uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1293    if (want_zero) {
1294       fs_builder ubld = fs_builder(this, 8).exec_all().at(
1295          cfg->first_block(), cfg->first_block()->start());
1296 
1297       /* push_reg_mask_param is in 32-bit units */
1298       unsigned mask_param = stage_prog_data->push_reg_mask_param;
1299       struct elk_reg mask = elk_vec1_grf(payload().num_regs + mask_param / 8,
1300                                                               mask_param % 8);
1301 
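      /* The loop below expands the mask word by word: the two SHLs move a
       * different bit of each 16-bit mask word into the sign position of each
       * W lane, and the ASR by 15 then replicates that sign bit so every lane
       * of b32 holds either 0 or ~0.  ANDing a push constant register with
       * the matching lane therefore clears it whenever its mask bit is unset.
       */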
1302       elk_fs_reg b32;
1303       for (unsigned i = 0; i < 64; i++) {
1304          if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1305             elk_fs_reg shifted = ubld.vgrf(ELK_REGISTER_TYPE_W, 2);
1306             ubld.SHL(horiz_offset(shifted, 8),
1307                      byte_offset(retype(mask, ELK_REGISTER_TYPE_W), i / 8),
1308                      elk_imm_v(0x01234567));
1309             ubld.SHL(shifted, horiz_offset(shifted, 8), elk_imm_w(8));
1310 
1311             fs_builder ubld16 = ubld.group(16, 0);
1312             b32 = ubld16.vgrf(ELK_REGISTER_TYPE_D);
1313             ubld16.group(16, 0).ASR(b32, shifted, elk_imm_w(15));
1314          }
1315 
1316          if (want_zero & BITFIELD64_BIT(i)) {
1317             assert(i < prog_data->curb_read_length);
1318             struct elk_reg push_reg =
1319                retype(elk_vec8_grf(payload().num_regs + i, 0),
1320                       ELK_REGISTER_TYPE_D);
1321 
1322             ubld.AND(push_reg, push_reg, component(b32, i % 16));
1323          }
1324       }
1325 
1326       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1327    }
1328 
1329    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1330    this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1331 }
1332 
1333 /*
1334  * Build up an array of indices into the urb_setup array that
1335  * references the active entries of the urb_setup array.
1336  * Used to accelerate walking the active entries of the urb_setup array
1337  * on each upload.
1338  */
1339 void
1340 elk_compute_urb_setup_index(struct elk_wm_prog_data *wm_prog_data)
1341 {
1342    /* Make sure uint8_t is sufficient */
1343    STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1344    uint8_t index = 0;
1345    for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1346       if (wm_prog_data->urb_setup[attr] >= 0) {
1347          wm_prog_data->urb_setup_attribs[index++] = attr;
1348       }
1349    }
1350    wm_prog_data->urb_setup_attribs_count = index;
1351 }
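
/* For example, if only VARYING_SLOT_POS and VARYING_SLOT_VAR0 end up with a
 * non-negative urb_setup[] entry, urb_setup_attribs becomes
 * { VARYING_SLOT_POS, VARYING_SLOT_VAR0 } and urb_setup_attribs_count is 2.
 */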
1352 
1353 static void
1354 calculate_urb_setup(const struct intel_device_info *devinfo,
1355                     const struct elk_wm_prog_key *key,
1356                     struct elk_wm_prog_data *prog_data,
1357                     const nir_shader *nir)
1358 {
1359    memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1360    memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1361 
1362    int urb_next = 0; /* in vec4s */
1363 
1364    const uint64_t inputs_read =
1365       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1366 
1367    /* Figure out where each of the incoming setup attributes lands. */
1368    if (devinfo->ver >= 6) {
1369       assert(!nir->info.per_primitive_inputs);
1370 
1371       uint64_t vue_header_bits =
1372          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1373 
1374       uint64_t unique_fs_attrs = inputs_read & ELK_FS_VARYING_INPUT_MASK;
1375 
1376       /* VUE header fields all live in the same URB slot, so we pass them
1377        * as a single FS input attribute.  We want to only count them once.
1378        */
1379       if (inputs_read & vue_header_bits) {
1380          unique_fs_attrs &= ~vue_header_bits;
1381          unique_fs_attrs |= VARYING_BIT_PSIZ;
1382       }
1383 
1384       if (util_bitcount64(unique_fs_attrs) <= 16) {
1385          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1386           * first 16 varying inputs, so we can put them wherever we want.
1387           * Just put them in order.
1388           *
1389           * This is useful because it means that (a) inputs not used by the
1390           * fragment shader won't take up valuable register space, and (b) we
1391           * won't have to recompile the fragment shader if it gets paired with
1392           * a different vertex (or geometry) shader.
1393           *
1394           * VUE header fields share the same FS input attribute.
1395           */
1396          if (inputs_read & vue_header_bits) {
1397             if (inputs_read & VARYING_BIT_PSIZ)
1398                prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1399             if (inputs_read & VARYING_BIT_LAYER)
1400                prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1401             if (inputs_read & VARYING_BIT_VIEWPORT)
1402                prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1403 
1404             urb_next++;
1405          }
1406 
1407          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1408             if (inputs_read & ELK_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1409                 BITFIELD64_BIT(i)) {
1410                prog_data->urb_setup[i] = urb_next++;
1411             }
1412          }
1413       } else {
1414          /* We have enough input varyings that the SF/SBE pipeline stage can't
1415           * arbitrarily rearrange them to suit our whim; we have to put them
1416           * in an order that matches the output of the previous pipeline stage
1417           * (geometry or vertex shader).
1418           */
1419 
1420          /* Re-compute the VUE map here in the case that the one coming from
1421           * geometry has more than one position slot (used for Primitive
1422           * Replication).
1423           */
1424          struct intel_vue_map prev_stage_vue_map;
1425          elk_compute_vue_map(devinfo, &prev_stage_vue_map,
1426                              key->input_slots_valid,
1427                              nir->info.separate_shader, 1);
1428 
1429          int first_slot =
1430             elk_compute_first_urb_slot_required(inputs_read,
1431                                                 &prev_stage_vue_map);
1432 
1433          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435               slot++) {
1436             int varying = prev_stage_vue_map.slot_to_varying[slot];
1437             if (varying != ELK_VARYING_SLOT_PAD &&
1438                 (inputs_read & ELK_FS_VARYING_INPUT_MASK &
1439                  BITFIELD64_BIT(varying))) {
1440                prog_data->urb_setup[varying] = slot - first_slot;
1441             }
1442          }
1443          urb_next = prev_stage_vue_map.num_slots - first_slot;
1444       }
1445    } else {
1446       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1447       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1448          /* Point size is packed into the header, not as a general attribute */
1449          if (i == VARYING_SLOT_PSIZ)
1450             continue;
1451 
1452 	 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1453 	    /* The back color slot is skipped when the front color is
1454 	     * also written to.  In addition, some slots can be
1455 	     * written in the vertex shader and not read in the
1456 	     * fragment shader.  So the register number must always be
1457 	     * incremented, mapped or not.
1458 	     */
1459 	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1460 	       prog_data->urb_setup[i] = urb_next;
1461             urb_next++;
1462 	 }
1463       }
1464 
1465       /*
1466        * It's an FS-only attribute, and we did the interpolation for this
1467        * attribute in the SF thread.  So count it here, too.
1468        *
1469        * See compile_sf_prog() for more info.
1470        */
1471       if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1472          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1473    }
1474 
1475    prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1476    prog_data->inputs = inputs_read;
1477 
1478    elk_compute_urb_setup_index(prog_data);
1479 }
1480 
1481 void
1482 elk_fs_visitor::assign_urb_setup()
1483 {
1484    assert(stage == MESA_SHADER_FRAGMENT);
1485    struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
1486 
1487    int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1488 
1489    /* Offset all the urb_setup[] index by the actual position of the
1490     * setup regs, now that the location of the constants has been chosen.
1491     */
1492    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1493       for (int i = 0; i < inst->sources; i++) {
1494          if (inst->src[i].file == ATTR) {
1495             /* ATTR elk_fs_reg::nr in the FS is in units of logical scalar
1496              * inputs each of which consumes 16B on Gfx4-Gfx12.  In
1497              * single polygon mode this leads to the following layout
1498              * of the vertex setup plane parameters in the ATTR
1499              * register file:
1500              *
1501              *  elk_fs_reg::nr   Input   Comp0  Comp1  Comp2  Comp3
1502              *      0       Attr0.x  a1-a0  a2-a0   N/A    a0
1503              *      1       Attr0.y  a1-a0  a2-a0   N/A    a0
1504              *      2       Attr0.z  a1-a0  a2-a0   N/A    a0
1505              *      3       Attr0.w  a1-a0  a2-a0   N/A    a0
1506              *      4       Attr1.x  a1-a0  a2-a0   N/A    a0
1507              *     ...
1508              */
1509             const unsigned param_width = 1;
1510 
1511             /* Size of a single scalar component of a plane parameter
1512              * in bytes.
1513              */
1514             const unsigned chan_sz = 4;
1515             struct elk_reg reg;
1516 
1517             /* Calculate the base register on the thread payload of
1518              * either the block of vertex setup data or the block of
1519              * per-primitive constant data depending on whether we're
1520              * accessing a primitive or vertex input.  Also calculate
1521              * the index of the input within that block.
1522              */
1523             const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1524             const unsigned base = urb_start +
1525                (per_prim ? 0 :
1526                 ALIGN(prog_data->num_per_primitive_inputs / 2,
1527                       reg_unit(devinfo)));
1528             const unsigned idx = per_prim ? inst->src[i].nr :
1529                inst->src[i].nr - prog_data->num_per_primitive_inputs;
1530 
1531             /* Translate the offset within the param_width-wide
1532              * representation described above into an offset and a
1533              * grf, which contains the plane parameters for the first
1534              * polygon processed by the thread.
1535              *
1536              * Earlier platforms and per-primitive block pack 2 logical
1537              * input components per 32B register.
1538              */
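            /* Illustrative example, assuming no per-primitive inputs and
             * REG_SIZE == 32: an ATTR source with nr == 5 and offset == 8
             * gives idx == 5, so it lands in grf == base + 2 at
             * delta == 16 + 8 == 24 bytes.
             */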
1539             const unsigned grf = base + idx / 2;
1540             assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1541             const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1542                inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1543                inst->src[i].offset % chan_sz;
1544             reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1545                               delta);
1546 
1547             const unsigned width = inst->src[i].stride == 0 ?
1548                1 : MIN2(inst->exec_size, 8);
1549             reg = stride(reg, width * inst->src[i].stride,
1550                          width, inst->src[i].stride);
1551 
1552             reg.abs = inst->src[i].abs;
1553             reg.negate = inst->src[i].negate;
1554             inst->src[i] = reg;
1555          }
1556       }
1557    }
1558 
1559    /* Each attribute is 4 setup channels, each of which is half a reg,
1560     * but they may be replicated multiple times for multipolygon
1561     * dispatch.
1562     */
1563    this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1564 
1565    /* Unlike regular attributes, per-primitive attributes have all 4 channels
1566     * in the same slot, so each GRF can store two slots.
1567     */
1568    assert(prog_data->num_per_primitive_inputs % 2 == 0);
1569    this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
1570 }
1571 
1572 void
1573 elk_fs_visitor::convert_attr_sources_to_hw_regs(elk_fs_inst *inst)
1574 {
1575    for (int i = 0; i < inst->sources; i++) {
1576       if (inst->src[i].file == ATTR) {
1577          assert(inst->src[i].nr == 0);
1578          int grf = payload().num_regs +
1579                    prog_data->curb_read_length +
1580                    inst->src[i].offset / REG_SIZE;
1581 
1582          /* As explained in elk_reg_from_fs_reg, from the Haswell PRM:
1583           *
1584           * VertStride must be used to cross GRF register boundaries. This
1585           * rule implies that elements within a 'Width' cannot cross GRF
1586           * boundaries.
1587           *
1588           * So, for registers that are large enough, we have to split the exec
1589           * size in two and trust the compression state to sort it out.
1590           */
1591          unsigned total_size = inst->exec_size *
1592                                inst->src[i].stride *
1593                                type_sz(inst->src[i].type);
1594 
1595          assert(total_size <= 2 * REG_SIZE);
1596          const unsigned exec_size =
1597             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1598 
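         /* For example, a SIMD16 float source with stride 1 reads
          * 16 * 4 == 64 bytes and spans two GRFs, so the region below is
          * built with exec_size 8 and instruction compression covers the
          * second half.
          */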
1599          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1600          struct elk_reg reg =
1601             stride(byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1602                                inst->src[i].offset % REG_SIZE),
1603                    exec_size * inst->src[i].stride,
1604                    width, inst->src[i].stride);
1605          reg.abs = inst->src[i].abs;
1606          reg.negate = inst->src[i].negate;
1607 
1608          inst->src[i] = reg;
1609       }
1610    }
1611 }
1612 
1613 void
1614 elk_fs_visitor::assign_vs_urb_setup()
1615 {
1616    struct elk_vs_prog_data *vs_prog_data = elk_vs_prog_data(prog_data);
1617 
1618    assert(stage == MESA_SHADER_VERTEX);
1619 
1620    /* Each attribute is 4 regs. */
1621    this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1622 
1623    assert(vs_prog_data->base.urb_read_length <= 15);
1624 
1625    /* Rewrite all ATTR file references to the hw grf that they land in. */
1626    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1627       convert_attr_sources_to_hw_regs(inst);
1628    }
1629 }
1630 
1631 void
1632 elk_fs_visitor::assign_tcs_urb_setup()
1633 {
1634    assert(stage == MESA_SHADER_TESS_CTRL);
1635 
1636    /* Rewrite all ATTR file references to HW_REGs. */
1637    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1638       convert_attr_sources_to_hw_regs(inst);
1639    }
1640 }
1641 
1642 void
1643 elk_fs_visitor::assign_tes_urb_setup()
1644 {
1645    assert(stage == MESA_SHADER_TESS_EVAL);
1646 
1647    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1648 
1649    first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1650 
1651    /* Rewrite all ATTR file references to HW_REGs. */
1652    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1653       convert_attr_sources_to_hw_regs(inst);
1654    }
1655 }
1656 
1657 void
1658 elk_fs_visitor::assign_gs_urb_setup()
1659 {
1660    assert(stage == MESA_SHADER_GEOMETRY);
1661 
1662    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1663 
1664    first_non_payload_grf +=
1665       8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1666 
1667    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1668       /* Rewrite all ATTR file references to GRFs. */
1669       convert_attr_sources_to_hw_regs(inst);
1670    }
1671 }
1672 
1673 
1674 /**
1675  * Split large virtual GRFs into separate components if we can.
1676  *
1677  * This pass aggressively splits VGRFs into the smallest chunks possible,
1678  * down to single registers if it can.  If no VGRFs can be split, we return
1679  * false so this pass can safely be used inside an optimization loop.  We
1680  * want to split, because virtual GRFs are what we register allocate and
1681  * spill (due to contiguousness requirements for some instructions), and
1682  * they're what we naturally generate in the codegen process, but most
1683  * virtual GRFs don't actually need to be contiguous sets of GRFs.  If we
1684  * split, we'll end up with reduced live intervals and better dead code
1685  * elimination and coalescing.
1686  */
1687 bool
1688 elk_fs_visitor::split_virtual_grfs()
1689 {
1690    /* Compact the register file so we eliminate dead vgrfs.  This pass
1691     * only defines split points for live registers, so overly large dead
1692     * registers would hit assertions later.
1693     */
1694    compact_virtual_grfs();
1695 
1696    unsigned num_vars = this->alloc.count;
1697 
1698    /* Count the total number of registers */
1699    unsigned reg_count = 0;
1700    unsigned vgrf_to_reg[num_vars];
1701    for (unsigned i = 0; i < num_vars; i++) {
1702       vgrf_to_reg[i] = reg_count;
1703       reg_count += alloc.sizes[i];
1704    }
1705 
1706    /* An array of "split points".  For each register slot, this indicates
1707     * if this slot can be separated from the previous slot.  Every time an
1708     * instruction uses multiple elements of a register (as a source or
1709     * destination), we mark the used slots as inseparable.  Then we go
1710     * through and split the registers into the smallest pieces we can.
1711     */
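   /* For example, a VGRF spanning four slots that is only ever written and
    * read one GRF at a time keeps all of its interior split points and ends
    * up split into four single-slot VGRFs, while a SIMD16 float access that
    * touches two slots at once clears the split point between them.
    */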
1712    bool *split_points = new bool[reg_count];
1713    memset(split_points, 0, reg_count * sizeof(*split_points));
1714 
1715    /* Mark all used registers as fully splittable */
1716    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1717       if (inst->dst.file == VGRF) {
1718          unsigned reg = vgrf_to_reg[inst->dst.nr];
1719          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1720             split_points[reg + j] = true;
1721       }
1722 
1723       for (unsigned i = 0; i < inst->sources; i++) {
1724          if (inst->src[i].file == VGRF) {
1725             unsigned reg = vgrf_to_reg[inst->src[i].nr];
1726             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1727                split_points[reg + j] = true;
1728          }
1729       }
1730    }
1731 
1732    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1733       /* We fix up undef instructions later */
1734       if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1735          assert(inst->dst.file == VGRF);
1736          continue;
1737       }
1738 
1739       if (inst->dst.file == VGRF) {
1740          unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1741          for (unsigned j = 1; j < regs_written(inst); j++)
1742             split_points[reg + j] = false;
1743       }
1744       for (unsigned i = 0; i < inst->sources; i++) {
1745          if (inst->src[i].file == VGRF) {
1746             unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1747             for (unsigned j = 1; j < regs_read(inst, i); j++)
1748                split_points[reg + j] = false;
1749          }
1750       }
1751    }
1752 
1753    /* Bitset of which registers have been split */
1754    bool *vgrf_has_split = new bool[num_vars];
1755    memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
1756 
1757    unsigned *new_virtual_grf = new unsigned[reg_count];
1758    unsigned *new_reg_offset = new unsigned[reg_count];
1759 
1760    unsigned reg = 0;
1761    bool has_splits = false;
1762    for (unsigned i = 0; i < num_vars; i++) {
1763       /* The first one should always be 0 as a quick sanity check. */
1764       assert(split_points[reg] == false);
1765 
1766       /* j = 0 case */
1767       new_reg_offset[reg] = 0;
1768       reg++;
1769       unsigned offset = 1;
1770 
1771       /* j > 0 case */
1772       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1773          /* If this is a split point, allocate a new virtual GRF covering
1774           * the previous `offset` registers, then reset the offset to 0.
1775           */
1776          if (split_points[reg]) {
1777             has_splits = true;
1778             vgrf_has_split[i] = true;
1779             assert(offset <= MAX_VGRF_SIZE(devinfo));
1780             unsigned grf = alloc.allocate(offset);
1781             for (unsigned k = reg - offset; k < reg; k++)
1782                new_virtual_grf[k] = grf;
1783             offset = 0;
1784          }
1785          new_reg_offset[reg] = offset;
1786          offset++;
1787          reg++;
1788       }
1789 
1790       /* The last one gets the original register number */
1791       assert(offset <= MAX_VGRF_SIZE(devinfo));
1792       alloc.sizes[i] = offset;
1793       for (unsigned k = reg - offset; k < reg; k++)
1794          new_virtual_grf[k] = i;
1795    }
1796    assert(reg == reg_count);
1797 
1798    bool progress;
1799    if (!has_splits) {
1800       progress = false;
1801       goto cleanup;
1802    }
1803 
1804    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
1805       if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1806          assert(inst->dst.file == VGRF);
1807          if (vgrf_has_split[inst->dst.nr]) {
1808             const fs_builder ibld(this, block, inst);
1809             assert(inst->size_written % REG_SIZE == 0);
1810             unsigned reg_offset = inst->dst.offset / REG_SIZE;
1811             unsigned size_written = 0;
1812             while (size_written < inst->size_written) {
1813                reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
1814                elk_fs_inst *undef =
1815                   ibld.UNDEF(
1816                      byte_offset(elk_fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
1817                                  new_reg_offset[reg] * REG_SIZE));
1818                undef->size_written =
1819                   MIN2(inst->size_written - size_written, undef->size_written);
1820                assert(undef->size_written % REG_SIZE == 0);
1821                size_written += undef->size_written;
1822             }
1823             inst->remove(block);
1824          } else {
1825             reg = vgrf_to_reg[inst->dst.nr];
1826             assert(new_reg_offset[reg] == 0);
1827             assert(new_virtual_grf[reg] == inst->dst.nr);
1828          }
1829          continue;
1830       }
1831 
1832       if (inst->dst.file == VGRF) {
1833          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1834          if (vgrf_has_split[inst->dst.nr]) {
1835             inst->dst.nr = new_virtual_grf[reg];
1836             inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
1837                                inst->dst.offset % REG_SIZE;
1838             assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1839          } else {
1840             assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
1841             assert(new_virtual_grf[reg] == inst->dst.nr);
1842          }
1843       }
1844       for (unsigned i = 0; i < inst->sources; i++) {
1845 	 if (inst->src[i].file != VGRF)
1846             continue;
1847 
1848          reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1849          if (vgrf_has_split[inst->src[i].nr]) {
1850             inst->src[i].nr = new_virtual_grf[reg];
1851             inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
1852                                   inst->src[i].offset % REG_SIZE;
1853             assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1854          } else {
1855             assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
1856             assert(new_virtual_grf[reg] == inst->src[i].nr);
1857          }
1858       }
1859    }
1860    invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1861 
1862    progress = true;
1863 
1864 cleanup:
1865    delete[] split_points;
1866    delete[] vgrf_has_split;
1867    delete[] new_virtual_grf;
1868    delete[] new_reg_offset;
1869 
1870    return progress;
1871 }
1872 
1873 /**
1874  * Remove unused virtual GRFs and compact the vgrf_* arrays.
1875  *
1876  * During code generation, we create tons of temporary variables, many of
1877  * which get immediately killed and are never used again.  Yet, in later
1878  * optimization and analysis passes, such as compute_live_intervals, we need
1879  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1880  * overhead.
1881  */
1882 bool
1883 elk_fs_visitor::compact_virtual_grfs()
1884 {
1885    bool progress = false;
1886    int *remap_table = new int[this->alloc.count];
1887    memset(remap_table, -1, this->alloc.count * sizeof(int));
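   /* A value of -1 in remap_table[] means the VGRF is never referenced and
    * will be dropped by the compaction below.
    */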
1888 
1889    /* Mark which virtual GRFs are used. */
1890    foreach_block_and_inst(block, const elk_fs_inst, inst, cfg) {
1891       if (inst->dst.file == VGRF)
1892          remap_table[inst->dst.nr] = 0;
1893 
1894       for (int i = 0; i < inst->sources; i++) {
1895          if (inst->src[i].file == VGRF)
1896             remap_table[inst->src[i].nr] = 0;
1897       }
1898    }
1899 
1900    /* Compact the GRF arrays. */
1901    int new_index = 0;
1902    for (unsigned i = 0; i < this->alloc.count; i++) {
1903       if (remap_table[i] == -1) {
1904          /* We just found an unused register.  This means that we are
1905           * actually going to compact something.
1906           */
1907          progress = true;
1908       } else {
1909          remap_table[i] = new_index;
1910          alloc.sizes[new_index] = alloc.sizes[i];
1911          invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1912          ++new_index;
1913       }
1914    }
1915 
1916    this->alloc.count = new_index;
1917 
1918    /* Patch all the instructions to use the newly renumbered registers */
1919    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1920       if (inst->dst.file == VGRF)
1921          inst->dst.nr = remap_table[inst->dst.nr];
1922 
1923       for (int i = 0; i < inst->sources; i++) {
1924          if (inst->src[i].file == VGRF)
1925             inst->src[i].nr = remap_table[inst->src[i].nr];
1926       }
1927    }
1928 
1929    /* Patch all the references to delta_xy, since they're used in register
1930     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1931     * think some random VGRF is delta_xy.
1932     */
1933    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1934       if (delta_xy[i].file == VGRF) {
1935          if (remap_table[delta_xy[i].nr] != -1) {
1936             delta_xy[i].nr = remap_table[delta_xy[i].nr];
1937          } else {
1938             delta_xy[i].file = BAD_FILE;
1939          }
1940       }
1941    }
1942 
1943    delete[] remap_table;
1944 
1945    return progress;
1946 }
1947 
1948 int
1949 elk_get_subgroup_id_param_index(const intel_device_info *devinfo,
1950                                 const elk_stage_prog_data *prog_data)
1951 {
1952    if (prog_data->nr_params == 0)
1953       return -1;
1954 
1955    /* The local thread id is always the last parameter in the list */
1956    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1957    if (last_param == ELK_PARAM_BUILTIN_SUBGROUP_ID)
1958       return prog_data->nr_params - 1;
1959 
1960    return -1;
1961 }
1962 
1963 /**
1964  * Assign UNIFORM file registers to either push constants or pull constants.
1965  *
1966  * We allow a fragment shader to have more than the specified minimum
1967  * maximum number of fragment shader uniform components (64).  If
1968  * there are too many of these, they'd fill up all of the register space.
1969  * So, this will push some of them out to the pull constant buffer and
1970  * update the program to load them.
1971  */
1972 void
1973 elk_fs_visitor::assign_constant_locations()
1974 {
1975    /* Only the first compile gets to decide on locations. */
1976    if (push_constant_loc)
1977       return;
1978 
1979    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1980    for (unsigned u = 0; u < uniforms; u++)
1981       push_constant_loc[u] = u;
1982 
1983    /* Now that we know how many regular uniforms we'll push, reduce the
1984     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1985     */
1986    /* For gen4/5:
1987     * Only allow 16 registers (128 uniform components) as push constants.
1988     *
1989     * If changing this value, note the limitation about total_regs in
1990     * elk_curbe.c/crocus_state.c
1991     */
1992    const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
1993    unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1994    for (int i = 0; i < 4; i++) {
1995       struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
1996 
1997       if (push_length + range->length > max_push_length)
1998          range->length = max_push_length - push_length;
1999 
2000       push_length += range->length;
2001    }
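   /* For instance, 40 scalar push params occupy DIV_ROUND_UP(40, 8) == 5
    * registers, which on gen4/5 leaves at most 11 of the 16 push registers
    * for UBO ranges.
    */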
2002    assert(push_length <= max_push_length);
2003 }
2004 
2005 bool
2006 elk_fs_visitor::get_pull_locs(const elk_fs_reg &src,
2007                           unsigned *out_surf_index,
2008                           unsigned *out_pull_index)
2009 {
2010    assert(src.file == UNIFORM);
2011 
2012    if (src.nr < UBO_START)
2013       return false;
2014 
2015    const struct elk_ubo_range *range =
2016       &prog_data->ubo_ranges[src.nr - UBO_START];
2017 
2018    /* If this access is in our (reduced) range, use the push data. */
2019    if (src.offset / 32 < range->length)
2020       return false;
2021 
2022    *out_surf_index = range->block;
2023    *out_pull_index = (32 * range->start + src.offset) / 4;
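   /* E.g. a uniform at src.offset == 68, past the pushed part of a range
    * whose start is 32B block 2, yields pull_index == (64 + 68) / 4 == 33
    * dwords into surface range->block.
    */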
2024 
2025    prog_data->has_ubo_pull = true;
2026 
2027    return true;
2028 }
2029 
2030 /**
2031  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2032  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2033  */
2034 bool
2035 elk_fs_visitor::lower_constant_loads()
2036 {
2037    unsigned index, pull_index;
2038    bool progress = false;
2039 
2040    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
2041       /* Set up the annotation tracking for newly generated instructions. */
2042       const fs_builder ibld(this, block, inst);
2043 
2044       for (int i = 0; i < inst->sources; i++) {
2045 	 if (inst->src[i].file != UNIFORM)
2046 	    continue;
2047 
2048          /* We'll handle this case later */
2049          if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT && i == 0)
2050             continue;
2051 
2052          if (!get_pull_locs(inst->src[i], &index, &pull_index))
2053 	    continue;
2054 
2055          assert(inst->src[i].stride == 0);
2056 
2057          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2058          const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2059          const elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2060          const unsigned base = pull_index * 4;
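         /* base is the byte offset of the constant; the send below fetches
          * the whole 64B block containing it, and the source is then
          * rewritten to point at the right byte within that block.
          */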
2061 
2062          elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
2063          srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = elk_imm_ud(index);
2064          srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]  = elk_imm_ud(base & ~(block_sz - 1));
2065          srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]    = elk_imm_ud(block_sz);
2066 
2067 
2068          ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
2069                    srcs, PULL_UNIFORM_CONSTANT_SRCS);
2070 
2071          /* Rewrite the instruction to use the temporary VGRF. */
2072          inst->src[i].file = VGRF;
2073          inst->src[i].nr = dst.nr;
2074          inst->src[i].offset = (base & (block_sz - 1)) +
2075                                inst->src[i].offset % 4;
2076 
2077          progress = true;
2078       }
2079 
2080       if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT &&
2081           inst->src[0].file == UNIFORM) {
2082 
2083          if (!get_pull_locs(inst->src[0], &index, &pull_index))
2084             continue;
2085 
2086          VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2087                                     elk_imm_ud(index),
2088                                     elk_fs_reg() /* surface_handle */,
2089                                     inst->src[1],
2090                                     pull_index * 4, 4, 1);
2091          inst->remove(block);
2092 
2093          progress = true;
2094       }
2095    }
2096    invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2097 
2098    return progress;
2099 }
2100 
2101 static uint64_t
2102 src_as_uint(const elk_fs_reg &src)
2103 {
2104    assert(src.file == IMM);
2105 
2106    switch (src.type) {
2107    case ELK_REGISTER_TYPE_W:
2108       return (uint64_t)(int16_t)(src.ud & 0xffff);
2109 
2110    case ELK_REGISTER_TYPE_UW:
2111       return (uint64_t)(uint16_t)(src.ud & 0xffff);
2112 
2113    case ELK_REGISTER_TYPE_D:
2114       return (uint64_t)src.d;
2115 
2116    case ELK_REGISTER_TYPE_UD:
2117       return (uint64_t)src.ud;
2118 
2119    case ELK_REGISTER_TYPE_Q:
2120       return src.d64;
2121 
2122    case ELK_REGISTER_TYPE_UQ:
2123       return src.u64;
2124 
2125    default:
2126       unreachable("Invalid integer type.");
2127    }
2128 }
2129 
2130 static elk_fs_reg
2131 elk_imm_for_type(uint64_t value, enum elk_reg_type type)
2132 {
2133    switch (type) {
2134    case ELK_REGISTER_TYPE_W:
2135       return elk_imm_w(value);
2136 
2137    case ELK_REGISTER_TYPE_UW:
2138       return elk_imm_uw(value);
2139 
2140    case ELK_REGISTER_TYPE_D:
2141       return elk_imm_d(value);
2142 
2143    case ELK_REGISTER_TYPE_UD:
2144       return elk_imm_ud(value);
2145 
2146    case ELK_REGISTER_TYPE_Q:
2147       return elk_imm_d(value);
2148 
2149    case ELK_REGISTER_TYPE_UQ:
2150       return elk_imm_uq(value);
2151 
2152    default:
2153       unreachable("Invalid integer type.");
2154    }
2155 }
2156 
2157 bool
2158 elk_fs_visitor::opt_algebraic()
2159 {
2160    bool progress = false;
2161 
2162    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2163       switch (inst->opcode) {
2164       case ELK_OPCODE_MOV:
2165          if (!devinfo->has_64bit_float &&
2166              inst->dst.type == ELK_REGISTER_TYPE_DF) {
2167             assert(inst->dst.type == inst->src[0].type);
2168             assert(!inst->saturate);
2169             assert(!inst->src[0].abs);
2170             assert(!inst->src[0].negate);
2171             const elk::fs_builder ibld(this, block, inst);
2172 
2173             if (!inst->is_partial_write())
2174                ibld.emit_undef_for_dst(inst);
2175 
2176             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 1),
2177                      subscript(inst->src[0], ELK_REGISTER_TYPE_F, 1));
2178             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 0),
2179                      subscript(inst->src[0], ELK_REGISTER_TYPE_F, 0));
2180 
2181             inst->remove(block);
2182             progress = true;
2183          }
2184 
2185          if (!devinfo->has_64bit_int &&
2186              (inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2187               inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2188             assert(inst->dst.type == inst->src[0].type);
2189             assert(!inst->saturate);
2190             assert(!inst->src[0].abs);
2191             assert(!inst->src[0].negate);
2192             const elk::fs_builder ibld(this, block, inst);
2193 
2194             if (!inst->is_partial_write())
2195                ibld.emit_undef_for_dst(inst);
2196 
2197             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2198                      subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1));
2199             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2200                      subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0));
2201 
2202             inst->remove(block);
2203             progress = true;
2204          }
2205 
2206          if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2207               inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2208              inst->dst.is_null() &&
2209              (inst->src[0].abs || inst->src[0].negate)) {
2210             inst->src[0].abs = false;
2211             inst->src[0].negate = false;
2212             progress = true;
2213             break;
2214          }
2215 
2216          if (inst->src[0].file != IMM)
2217             break;
2218 
2219          if (inst->saturate) {
2220             /* Full mixed-type saturates don't happen.  However, we can end up
2221              * with things like:
2222              *
2223              *    mov.sat(8) g21<1>DF       -1F
2224              *
2225              * Other mixed-size-but-same-base-type cases may also be possible.
2226              */
2227             if (inst->dst.type != inst->src[0].type &&
2228                 inst->dst.type != ELK_REGISTER_TYPE_DF &&
2229                 inst->src[0].type != ELK_REGISTER_TYPE_F)
2230                assert(!"unimplemented: saturate mixed types");
2231 
2232             if (elk_saturate_immediate(inst->src[0].type,
2233                                        &inst->src[0].as_elk_reg())) {
2234                inst->saturate = false;
2235                progress = true;
2236             }
2237          }
2238          break;
2239 
2240       case ELK_OPCODE_MUL:
2241          if (inst->src[1].file != IMM)
2242             continue;
2243 
2244          if (elk_reg_type_is_floating_point(inst->src[1].type))
2245             break;
2246 
2247          /* From the BDW PRM, Vol 2a, "mul - Multiply":
2248           *
2249           *    "When multiplying integer datatypes, if src0 is DW and src1
2250           *    is W, irrespective of the destination datatype, the
2251           *    accumulator maintains full 48-bit precision."
2252           *    ...
2253           *    "When multiplying integer data types, if one of the sources
2254           *    is a DW, the resulting full precision data is stored in
2255           *    the accumulator."
2256           *
2257           * There are also similar notes in earlier PRMs.
2258           *
2259           * The MOV instruction can copy the bits of the source, but it
2260           * does not clear the higher bits of the accumulator. So, because
2261           * we might use the full accumulator in the MUL/MACH macro, we
2262           * shouldn't replace such MULs with MOVs.
2263           */
2264          if ((elk_reg_type_to_size(inst->src[0].type) == 4 ||
2265               elk_reg_type_to_size(inst->src[1].type) == 4) &&
2266              (inst->dst.is_accumulator() ||
2267               inst->writes_accumulator_implicitly(devinfo)))
2268             break;
2269 
2270          /* a * 1.0 = a */
2271          if (inst->src[1].is_one()) {
2272             inst->opcode = ELK_OPCODE_MOV;
2273             inst->sources = 1;
2274             inst->src[1] = reg_undef;
2275             progress = true;
2276             break;
2277          }
2278 
2279          /* a * -1.0 = -a */
2280          if (inst->src[1].is_negative_one()) {
2281             inst->opcode = ELK_OPCODE_MOV;
2282             inst->sources = 1;
2283             inst->src[0].negate = !inst->src[0].negate;
2284             inst->src[1] = reg_undef;
2285             progress = true;
2286             break;
2287          }
2288 
2289          break;
2290       case ELK_OPCODE_ADD:
2291          if (inst->src[1].file != IMM)
2292             continue;
2293 
2294          if (elk_reg_type_is_integer(inst->src[1].type) &&
2295              inst->src[1].is_zero()) {
2296             inst->opcode = ELK_OPCODE_MOV;
2297             inst->sources = 1;
2298             inst->src[1] = reg_undef;
2299             progress = true;
2300             break;
2301          }
2302 
2303          if (inst->src[0].file == IMM) {
2304             assert(inst->src[0].type == ELK_REGISTER_TYPE_F);
2305             inst->opcode = ELK_OPCODE_MOV;
2306             inst->sources = 1;
2307             inst->src[0].f += inst->src[1].f;
2308             inst->src[1] = reg_undef;
2309             progress = true;
2310             break;
2311          }
2312          break;
2313 
2314       case ELK_OPCODE_AND:
2315          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2316             const uint64_t src0 = src_as_uint(inst->src[0]);
2317             const uint64_t src1 = src_as_uint(inst->src[1]);
2318 
2319             inst->opcode = ELK_OPCODE_MOV;
2320             inst->sources = 1;
2321             inst->src[0] = elk_imm_for_type(src0 & src1, inst->dst.type);
2322             inst->src[1] = reg_undef;
2323             progress = true;
2324             break;
2325          }
2326 
2327          break;
2328 
2329       case ELK_OPCODE_OR:
2330          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2331             const uint64_t src0 = src_as_uint(inst->src[0]);
2332             const uint64_t src1 = src_as_uint(inst->src[1]);
2333 
2334             inst->opcode = ELK_OPCODE_MOV;
2335             inst->sources = 1;
2336             inst->src[0] = elk_imm_for_type(src0 | src1, inst->dst.type);
2337             inst->src[1] = reg_undef;
2338             progress = true;
2339             break;
2340          }
2341 
2342          if (inst->src[0].equals(inst->src[1]) ||
2343              inst->src[1].is_zero()) {
2344             /* On Gfx8+, the OR instruction can have a source modifier that
2345              * performs logical not on the operand.  Cases of 'OR r0, ~r1, 0'
2346              * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2347              */
2348             if (inst->src[0].negate) {
2349                inst->opcode = ELK_OPCODE_NOT;
2350                inst->sources = 1;
2351                inst->src[0].negate = false;
2352             } else {
2353                inst->opcode = ELK_OPCODE_MOV;
2354                inst->sources = 1;
2355             }
2356             inst->src[1] = reg_undef;
2357             progress = true;
2358             break;
2359          }
2360          break;
2361       case ELK_OPCODE_CMP:
2362          if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2363               inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2364              inst->src[1].is_zero() &&
2365              (inst->src[0].abs || inst->src[0].negate)) {
2366             inst->src[0].abs = false;
2367             inst->src[0].negate = false;
2368             progress = true;
2369             break;
2370          }
2371          break;
2372       case ELK_OPCODE_SEL:
2373          if (!devinfo->has_64bit_float &&
2374              !devinfo->has_64bit_int &&
2375              (inst->dst.type == ELK_REGISTER_TYPE_DF ||
2376               inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2377               inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2378             assert(inst->dst.type == inst->src[0].type);
2379             assert(!inst->saturate);
2380             assert(!inst->src[0].abs && !inst->src[0].negate);
2381             assert(!inst->src[1].abs && !inst->src[1].negate);
2382             const elk::fs_builder ibld(this, block, inst);
2383 
2384             if (!inst->is_partial_write())
2385                ibld.emit_undef_for_dst(inst);
2386 
2387             set_predicate(inst->predicate,
2388                           ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2389                                    subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
2390                                    subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0)));
2391             set_predicate(inst->predicate,
2392                           ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2393                                    subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
2394                                    subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1)));
2395 
2396             inst->remove(block);
2397             progress = true;
2398          }
2399          if (inst->src[0].equals(inst->src[1])) {
2400             inst->opcode = ELK_OPCODE_MOV;
2401             inst->sources = 1;
2402             inst->src[1] = reg_undef;
2403             inst->predicate = ELK_PREDICATE_NONE;
2404             inst->predicate_inverse = false;
2405             progress = true;
2406          } else if (inst->saturate && inst->src[1].file == IMM) {
2407             switch (inst->conditional_mod) {
2408             case ELK_CONDITIONAL_LE:
2409             case ELK_CONDITIONAL_L:
2410                switch (inst->src[1].type) {
2411                case ELK_REGISTER_TYPE_F:
2412                   if (inst->src[1].f >= 1.0f) {
2413                      inst->opcode = ELK_OPCODE_MOV;
2414                      inst->sources = 1;
2415                      inst->src[1] = reg_undef;
2416                      inst->conditional_mod = ELK_CONDITIONAL_NONE;
2417                      progress = true;
2418                   }
2419                   break;
2420                default:
2421                   break;
2422                }
2423                break;
2424             case ELK_CONDITIONAL_GE:
2425             case ELK_CONDITIONAL_G:
2426                switch (inst->src[1].type) {
2427                case ELK_REGISTER_TYPE_F:
2428                   if (inst->src[1].f <= 0.0f) {
2429                      inst->opcode = ELK_OPCODE_MOV;
2430                      inst->sources = 1;
2431                      inst->src[1] = reg_undef;
2432                      inst->conditional_mod = ELK_CONDITIONAL_NONE;
2433                      progress = true;
2434                   }
2435                   break;
2436                default:
2437                   break;
2438                }
2439             default:
2440                break;
2441             }
2442          }
2443          break;
2444       case ELK_OPCODE_MAD:
2445          if (inst->src[0].type != ELK_REGISTER_TYPE_F ||
2446              inst->src[1].type != ELK_REGISTER_TYPE_F ||
2447              inst->src[2].type != ELK_REGISTER_TYPE_F)
2448             break;
2449          if (inst->src[1].is_one()) {
2450             inst->opcode = ELK_OPCODE_ADD;
2451             inst->sources = 2;
2452             inst->src[1] = inst->src[2];
2453             inst->src[2] = reg_undef;
2454             progress = true;
2455          } else if (inst->src[2].is_one()) {
2456             inst->opcode = ELK_OPCODE_ADD;
2457             inst->sources = 2;
2458             inst->src[2] = reg_undef;
2459             progress = true;
2460          }
2461          break;
2462       case ELK_OPCODE_SHL:
2463          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2464             /* It's not currently possible to generate this, and this constant
2465              * folding does not handle it.
2466              */
2467             assert(!inst->saturate);
2468 
2469             elk_fs_reg result;
2470 
2471             switch (type_sz(inst->src[0].type)) {
2472             case 2:
2473                result = elk_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
2474                break;
2475             case 4:
2476                result = elk_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
2477                break;
2478             case 8:
2479                result = elk_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
2480                break;
2481             default:
2482                /* Just in case a future platform re-enables B or UB types. */
2483                unreachable("Invalid source size.");
2484             }
2485 
2486             inst->opcode = ELK_OPCODE_MOV;
2487             inst->src[0] = retype(result, inst->dst.type);
2488             inst->src[1] = reg_undef;
2489             inst->sources = 1;
2490 
2491             progress = true;
2492          }
2493          break;
2494 
2495       case ELK_SHADER_OPCODE_BROADCAST:
2496          if (is_uniform(inst->src[0])) {
2497             inst->opcode = ELK_OPCODE_MOV;
2498             inst->sources = 1;
2499             inst->force_writemask_all = true;
2500             progress = true;
2501          } else if (inst->src[1].file == IMM) {
2502             inst->opcode = ELK_OPCODE_MOV;
2503             /* It's possible that the selected component will be too large and
2504              * overflow the register.  This can happen if someone does a
2505              * readInvocation() from GLSL or SPIR-V and provides an OOB
2506              * invocationIndex.  If this happens and we somehow manage
2507              * to constant fold it in and get here, then component() may cause
2508              * us to start reading outside of the VGRF which will lead to an
2509              * assert later.  Instead, just let it wrap around if it goes over
2510              * exec_size.
2511              */
2512             const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
2513             inst->src[0] = component(inst->src[0], comp);
2514             inst->sources = 1;
2515             inst->force_writemask_all = true;
2516             progress = true;
2517          }
2518          break;
2519 
2520       case ELK_SHADER_OPCODE_SHUFFLE:
2521          if (is_uniform(inst->src[0])) {
2522             inst->opcode = ELK_OPCODE_MOV;
2523             inst->sources = 1;
2524             progress = true;
2525          } else if (inst->src[1].file == IMM) {
2526             inst->opcode = ELK_OPCODE_MOV;
2527             inst->src[0] = component(inst->src[0],
2528                                      inst->src[1].ud);
2529             inst->sources = 1;
2530             progress = true;
2531          }
2532          break;
2533 
2534       default:
2535 	 break;
2536       }
2537 
2538       /* Ensure that the correct source has the immediate value. 2-source
2539        * instructions must have the immediate in src[1]. On Gfx12 and later,
2540        * some 3-source instructions can have the immediate in src[0] or
2541        * src[2]. It's complicated, so don't mess with 3-source instructions
2542        * here.
2543        */
2544       if (progress && inst->sources == 2 && inst->is_commutative()) {
2545          if (inst->src[0].file == IMM) {
2546             elk_fs_reg tmp = inst->src[1];
2547             inst->src[1] = inst->src[0];
2548             inst->src[0] = tmp;
2549          }
2550       }
2551    }
2552 
2553    if (progress)
2554       invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
2555                           DEPENDENCY_INSTRUCTION_DETAIL);
2556 
2557    return progress;
2558 }
2559 
2560 static unsigned
2561 load_payload_sources_read_for_size(elk_fs_inst *lp, unsigned size_read)
2562 {
2563    assert(lp->opcode == ELK_SHADER_OPCODE_LOAD_PAYLOAD);
2564    assert(size_read >= lp->header_size * REG_SIZE);
2565 
2566    unsigned i;
2567    unsigned size = lp->header_size * REG_SIZE;
2568    for (i = lp->header_size; size < size_read && i < lp->sources; i++)
2569       size += lp->exec_size * type_sz(lp->src[i].type);
2570 
2571    /* Size read must cover exactly a subset of sources. */
2572    /* The size read must end exactly on a source boundary. */
2573    return i;
2574 }
2575 
2576 /**
2577  * Optimize sample messages that have constant zero values for the trailing
2578  * parameters. We can just reduce the message length for these
2579  * instructions instead of reserving a register for it. Trailing parameters
2580  * that aren't sent default to zero anyway. This will cause the dead code
2581  * eliminator to remove the MOV instruction that would otherwise be emitted to
2582  * set up the zero value.
2583  */
2584 bool
2585 elk_fs_visitor::opt_zero_samples()
2586 {
2587    /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
2588    assert(devinfo->ver >= 7);
2589 
2590    bool progress = false;
2591 
2592    foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2593       if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2594           send->sfid != ELK_SFID_SAMPLER)
2595          continue;
2596 
2597       /* Wa_14012688258:
2598        *
2599        * Don't trim zeros at the end of payload for sample operations
2600        * in cube and cube arrays.
2601        */
2602       if (send->keep_payload_trailing_zeros)
2603          continue;
2604 
2605       elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2606 
2607       if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2608          continue;
2609 
2610       /* How much of the payload is actually read by this SEND. */
2611       const unsigned params =
2612          load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2613 
2614       /* We don't want to remove the message header or the first parameter.
2615        * Removing the first parameter is not allowed, see the Haswell PRM
2616        * Removing the first parameter is not allowed; see the Haswell PRM
2617        *
2618        *     "Parameter 0 is required except for the sampleinfo message, which
2619        *      has no parameter 0"
2620        */
2621       const unsigned first_param_idx = lp->header_size;
2622       unsigned zero_size = 0;
2623       for (unsigned i = params - 1; i > first_param_idx; i--) {
2624          if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
2625             break;
2626          zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
2627       }
2628 
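      /* Only whole registers' worth of zeros can be dropped from the message
       * length, so the zeroed tail is rounded down to a GRF multiple here.
       */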
2629       const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE);
2630       if (zero_len > 0) {
2631          send->mlen -= zero_len;
2632          progress = true;
2633       }
2634    }
2635 
2636    if (progress)
2637       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2638 
2639    return progress;
2640 }
2641 
2642 /**
2643  * Remove redundant or useless halts.
2644  *
2645  * For example, we can eliminate halts in the following sequence:
2646  *
2647  * halt        (redundant with the next halt)
2648  * halt        (useless; jumps to the next instruction)
2649  * halt-target
2650  */
2651 bool
2652 elk_fs_visitor::opt_redundant_halt()
2653 {
2654    bool progress = false;
2655 
2656    unsigned halt_count = 0;
2657    elk_fs_inst *halt_target = NULL;
2658    elk_bblock_t *halt_target_block = NULL;
2659    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2660       if (inst->opcode == ELK_OPCODE_HALT)
2661          halt_count++;
2662 
2663       if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
2664          halt_target = inst;
2665          halt_target_block = block;
2666          break;
2667       }
2668    }
2669 
2670    if (!halt_target) {
2671       assert(halt_count == 0);
2672       return false;
2673    }
2674 
2675    /* Delete any HALTs immediately before the halt target. */
2676    for (elk_fs_inst *prev = (elk_fs_inst *) halt_target->prev;
2677         !prev->is_head_sentinel() && prev->opcode == ELK_OPCODE_HALT;
2678         prev = (elk_fs_inst *) halt_target->prev) {
2679       prev->remove(halt_target_block);
2680       halt_count--;
2681       progress = true;
2682    }
2683 
2684    if (halt_count == 0) {
2685       halt_target->remove(halt_target_block);
2686       progress = true;
2687    }
2688 
2689    if (progress)
2690       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2691 
2692    return progress;
2693 }
2694 
2695 /**
2696  * Compute a bitmask with GRF granularity with a bit set for each GRF starting
2697  * from \p r.offset which overlaps the region starting at \p s.offset and
2698  * spanning \p ds bytes.
2699  */
2700 static inline unsigned
2701 mask_relative_to(const elk_fs_reg &r, const elk_fs_reg &s, unsigned ds)
2702 {
2703    const int rel_offset = reg_offset(s) - reg_offset(r);
2704    const int shift = rel_offset / REG_SIZE;
2705    const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
2706    assert(reg_space(r) == reg_space(s) &&
2707           shift >= 0 && shift < int(8 * sizeof(unsigned)));
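   /* E.g. with s starting 40 B past r and ds == 40, shift == 1 and n == 2,
    * giving a mask of 0b110: the second and third GRFs of r's region are
    * covered.
    */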
2708    return ((1 << n) - 1) << shift;
2709 }
2710 
2711 bool
2712 elk_fs_visitor::compute_to_mrf()
2713 {
2714    bool progress = false;
2715    int next_ip = 0;
2716 
2717    /* No MRFs on Gen >= 7. */
2718    if (devinfo->ver >= 7)
2719       return false;
2720 
2721    const fs_live_variables &live = live_analysis.require();
2722 
2723    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2724       int ip = next_ip;
2725       next_ip++;
2726 
2727       if (inst->opcode != ELK_OPCODE_MOV ||
2728 	  inst->is_partial_write() ||
2729 	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
2730 	  inst->dst.type != inst->src[0].type ||
2731 	  inst->src[0].abs || inst->src[0].negate ||
2732           !inst->src[0].is_contiguous() ||
2733           inst->src[0].offset % REG_SIZE != 0)
2734 	 continue;
2735 
2736       /* Can't compute-to-MRF this GRF if someone else was going to
2737        * read it later.
2738        */
2739       if (live.vgrf_end[inst->src[0].nr] > ip)
2740 	 continue;
2741 
2742       /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
2743        * things that computed the value of all GRFs of the source region.  The
2744        * regs_left bitset keeps track of the registers we haven't yet found a
2745        * generating instruction for.
2746        */
2747       unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
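      /* E.g. a two-GRF source region starts with regs_left == 0b11; bits are
       * cleared below as each generating instruction is found.
       */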
2748 
2749       foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2750          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2751                              inst->src[0], inst->size_read(0))) {
2752             /* Found the last instruction that writes the register we want
2753              * to turn into a compute-to-MRF.
2754 	     */
2755 
2756 	    /* If this one instruction didn't populate all the
2757 	     * channels, bail.  We might be able to rewrite everything
2758 	     * that writes that reg, but it would require smarter
2759 	     * tracking.
2760 	     */
2761 	    if (scan_inst->is_partial_write())
2762 	       break;
2763 
2764             /* Handling things not fully contained in the source of the copy
2765              * would need us to understand coalescing out more than one MOV at
2766              * a time.
2767              */
2768             if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
2769                                      inst->src[0], inst->size_read(0)))
2770                break;
2771 
2772 	    /* SEND instructions can't have MRF as a destination. */
2773 	    if (scan_inst->mlen)
2774 	       break;
2775 
2776 	    if (devinfo->ver == 6) {
2777 	       /* gfx6 math instructions must have the destination be
2778 		* GRF, so no compute-to-MRF for them.
2779 		*/
2780 	       if (scan_inst->is_math()) {
2781 		  break;
2782 	       }
2783 	    }
2784 
2785             /* Clear the bits for any registers this instruction overwrites. */
2786             regs_left &= ~mask_relative_to(
2787                inst->src[0], scan_inst->dst, scan_inst->size_written);
2788             if (!regs_left)
2789                break;
2790 	 }
2791 
2792 	 /* We don't handle control flow here.  Most computation of
2793           * values that end up in MRFs happens shortly before the MRF
2794 	  * write anyway.
2795 	  */
2796 	 if (block->start() == scan_inst)
2797 	    break;
2798 
2799 	 /* You can't read from an MRF, so if someone else reads our
2800 	  * MRF's source GRF that we wanted to rewrite, that stops us.
2801 	  */
2802 	 bool interfered = false;
2803 	 for (int i = 0; i < scan_inst->sources; i++) {
2804             if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
2805                                 inst->src[0], inst->size_read(0))) {
2806 	       interfered = true;
2807 	    }
2808 	 }
2809 	 if (interfered)
2810 	    break;
2811 
2812          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2813                              inst->dst, inst->size_written)) {
2814 	    /* If somebody else writes our MRF here, we can't
2815 	     * compute-to-MRF before that.
2816 	     */
2817             break;
2818          }
2819 
2820          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
2821              regions_overlap(elk_fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
2822                              inst->dst, inst->size_written)) {
2823 	    /* Found a SEND instruction, which means that there are
2824 	     * live values in MRFs from base_mrf to base_mrf +
2825 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2826 	     * above it.
2827 	     */
2828             break;
2829          }
2830       }
2831 
2832       if (regs_left)
2833          continue;
2834 
2835       /* Found all generating instructions of our MRF's source value, so it
2836        * should be safe to rewrite them to point to the MRF directly.
2837        */
2838       regs_left = (1 << regs_read(inst, 0)) - 1;
2839 
2840       foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2841          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2842                              inst->src[0], inst->size_read(0))) {
2843             /* Clear the bits for any registers this instruction overwrites. */
2844             regs_left &= ~mask_relative_to(
2845                inst->src[0], scan_inst->dst, scan_inst->size_written);
2846 
2847             const unsigned rel_offset = reg_offset(scan_inst->dst) -
2848                                         reg_offset(inst->src[0]);
2849 
2850             if (inst->dst.nr & ELK_MRF_COMPR4) {
2851                /* Apply the same address transformation done by the hardware
2852                 * for COMPR4 MRF writes.
2853                 */
2854                assert(rel_offset < 2 * REG_SIZE);
2855                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
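               /* I.e. with COMPR4, data one GRF further into the source
                * region lands in MRF m+4 rather than m+1, matching the
                * hardware's COMPR4 addressing.
                */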
2856 
2857                /* Clear the COMPR4 bit if the generating instruction is not
2858                 * compressed.
2859                 */
2860                if (scan_inst->size_written < 2 * REG_SIZE)
2861                   scan_inst->dst.nr &= ~ELK_MRF_COMPR4;
2862 
2863             } else {
2864                /* Calculate the MRF number the result of this instruction is
2865                 * ultimately written to.
2866                 */
2867                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
2868             }
2869 
2870             scan_inst->dst.file = MRF;
2871             scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
2872             scan_inst->saturate |= inst->saturate;
2873             if (!regs_left)
2874                break;
2875          }
2876       }
2877 
2878       assert(!regs_left);
2879       inst->remove(block);
2880       progress = true;
2881    }
2882 
2883    if (progress)
2884       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2885 
2886    return progress;
2887 }
2888 
2889 /**
2890  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2891  * flow.  We could probably do better here with some form of divergence
2892  * analysis.
2893  */
2894 bool
2895 elk_fs_visitor::eliminate_find_live_channel()
2896 {
2897    bool progress = false;
2898    unsigned depth = 0;
2899 
2900    if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
2901       /* The optimization below assumes that channel zero is live on thread
2902        * dispatch, which may not be the case if the fixed function dispatches
2903        * threads sparsely.
2904        */
2905       return false;
2906    }
2907 
2908    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2909       switch (inst->opcode) {
2910       case ELK_OPCODE_IF:
2911       case ELK_OPCODE_DO:
2912          depth++;
2913          break;
2914 
2915       case ELK_OPCODE_ENDIF:
2916       case ELK_OPCODE_WHILE:
2917          depth--;
2918          break;
2919 
2920       case ELK_OPCODE_HALT:
2921          /* This can potentially make control flow non-uniform until the end
2922           * of the program.
2923           */
2924          goto out;
2925 
2926       case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
2927          if (depth == 0) {
2928             inst->opcode = ELK_OPCODE_MOV;
2929             inst->src[0] = elk_imm_ud(0u);
2930             inst->sources = 1;
2931             inst->force_writemask_all = true;
2932             progress = true;
2933          }
2934          break;
2935 
2936       default:
2937          break;
2938       }
2939    }
2940 
2941 out:
2942    if (progress)
2943       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2944 
2945    return progress;
2946 }
2947 
2948 /**
2949  * Emit a replicated-data clear ("repclear") shader, copying the flat clear
2950  * color input to every render target with replicated-data FB writes.
2951  */
2952 void
2953 elk_fs_visitor::emit_repclear_shader()
2954 {
2955    elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
2956    elk_fs_inst *write = NULL;
2957 
2958    assert(uniforms == 0);
2959    assume(key->nr_color_regions > 0);
2960 
2961    elk_fs_reg color_output, header;
2962    if (devinfo->ver >= 7) {
2963       color_output = retype(elk_vec4_grf(127, 0), ELK_REGISTER_TYPE_UD);
2964       header = retype(elk_vec8_grf(125, 0), ELK_REGISTER_TYPE_UD);
2965    } else {
2966       color_output = retype(elk_vec4_reg(MRF, 2, 0), ELK_REGISTER_TYPE_UD);
2967       header = retype(elk_vec8_reg(MRF, 0, 0), ELK_REGISTER_TYPE_UD);
2968    }
2969 
2970    /* We pass the clear color as a flat input.  Copy it to the output. */
2971    elk_fs_reg color_input =
2972       elk_reg(ELK_GENERAL_REGISTER_FILE, 2, 3, 0, 0, ELK_REGISTER_TYPE_UD,
2973               ELK_VERTICAL_STRIDE_8, ELK_WIDTH_2, ELK_HORIZONTAL_STRIDE_4,
2974               ELK_SWIZZLE_XYZW, WRITEMASK_XYZW);
2975 
2976    const fs_builder bld = fs_builder(this).at_end();
2977    bld.exec_all().group(4, 0).MOV(color_output, color_input);
2978 
2979    if (key->nr_color_regions > 1) {
2980       /* Copy g0..g1 as the message header */
2981       bld.exec_all().group(16, 0)
2982          .MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2983    }
2984 
2985    for (int i = 0; i < key->nr_color_regions; ++i) {
2986       if (i > 0)
2987          bld.exec_all().group(1, 0).MOV(component(header, 2), elk_imm_ud(i));
2988 
2989       if (devinfo->ver >= 7) {
2990          write = bld.emit(ELK_SHADER_OPCODE_SEND);
2991          write->resize_sources(2);
2992          write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
2993          write->src[0] = elk_imm_ud(0);
2994          write->src[1] = i == 0 ? color_output : header;
2995          write->check_tdr = true;
2996          write->send_has_side_effects = true;
2997          write->desc = elk_fb_write_desc(devinfo, i,
2998             ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
2999             i == key->nr_color_regions - 1, false);
3000       } else {
3001          write = bld.emit(ELK_FS_OPCODE_REP_FB_WRITE);
3002          write->target = i;
3003          write->base_mrf = i == 0 ? color_output.nr : header.nr;
3004       }
3005 
3006       /* We can use a headerless message for the first render target */
3007       write->header_size = i == 0 ? 0 : 2;
3008       write->mlen = 1 + write->header_size;
3009    }
3010    write->eot = true;
3011    write->last_rt = true;
3012 
3013    calculate_cfg();
3014 
3015    this->first_non_payload_grf = payload().num_regs;
3016 }
3017 
3018 /**
3019  * Walks through basic blocks, looking for repeated MRF writes and
3020  * removing the later ones.
3021  */
3022 bool
3023 elk_fs_visitor::remove_duplicate_mrf_writes()
3024 {
3025    elk_fs_inst *last_mrf_move[ELK_MAX_MRF(devinfo->ver)];
3026    bool progress = false;
3027 
3028    /* Need to update the MRF tracking for compressed instructions. */
3029    if (dispatch_width >= 16)
3030       return false;
3031 
3032    memset(last_mrf_move, 0, sizeof(last_mrf_move));
3033 
3034    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3035       if (inst->is_control_flow()) {
3036 	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3037       }
3038 
3039       if (inst->opcode == ELK_OPCODE_MOV &&
3040 	  inst->dst.file == MRF) {
3041          elk_fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3042 	 if (prev_inst && prev_inst->opcode == ELK_OPCODE_MOV &&
3043              inst->dst.equals(prev_inst->dst) &&
3044              inst->src[0].equals(prev_inst->src[0]) &&
3045              inst->saturate == prev_inst->saturate &&
3046              inst->predicate == prev_inst->predicate &&
3047              inst->conditional_mod == prev_inst->conditional_mod &&
3048              inst->exec_size == prev_inst->exec_size) {
3049 	    inst->remove(block);
3050 	    progress = true;
3051 	    continue;
3052 	 }
3053       }
3054 
3055       /* Clear out the last-write records for MRFs that were overwritten. */
3056       if (inst->dst.file == MRF) {
3057          last_mrf_move[inst->dst.nr] = NULL;
3058       }
3059 
3060       if (inst->mlen > 0 && inst->base_mrf != -1) {
3061 	 /* Found a SEND instruction, which will include two or fewer
3062 	  * implied MRF writes.  We could do better here.
3063 	  */
3064 	 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3065 	    last_mrf_move[inst->base_mrf + i] = NULL;
3066 	 }
3067       }
3068 
3069       /* Clear out any MRF move records whose sources got overwritten. */
3070       for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3071          if (last_mrf_move[i] &&
3072              regions_overlap(inst->dst, inst->size_written,
3073                              last_mrf_move[i]->src[0],
3074                              last_mrf_move[i]->size_read(0))) {
3075             last_mrf_move[i] = NULL;
3076          }
3077       }
3078 
3079       if (inst->opcode == ELK_OPCODE_MOV &&
3080 	  inst->dst.file == MRF &&
3081 	  inst->src[0].file != ARF &&
3082 	  !inst->is_partial_write()) {
3083          last_mrf_move[inst->dst.nr] = inst;
3084       }
3085    }
3086 
3087    if (progress)
3088       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3089 
3090    return progress;
3091 }
3092 
3093 /**
3094  * The rounding mode could be specified on every conversion instruction,
3095  * but on this hardware it is a piece of state: once it has been set,
3096  * subsequent conversions using the same mode don't need to set it again.
3097  *
3098  * This is useful for vector/matrix conversions, as setting the mode once
3099  * is enough for the whole vector/matrix.
3100  */
3101 bool
3102 elk_fs_visitor::remove_extra_rounding_modes()
3103 {
3104    bool progress = false;
3105    unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3106 
3107    elk_rnd_mode base_mode = ELK_RND_MODE_UNSPECIFIED;
3108    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3109         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3110         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3111        execution_mode)
3112       base_mode = ELK_RND_MODE_RTNE;
3113    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3114         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3115         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3116        execution_mode)
3117       base_mode = ELK_RND_MODE_RTZ;
3118 
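   /* For example, if the shader declares RTE for all float widths, base_mode
    * is RTNE and any ELK_SHADER_OPCODE_RND_MODE instruction that sets RTNE
    * again is removed by the loop below.
    */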
3119    foreach_block (block, cfg) {
3120       elk_rnd_mode prev_mode = base_mode;
3121 
3122       foreach_inst_in_block_safe (elk_fs_inst, inst, block) {
3123          if (inst->opcode == ELK_SHADER_OPCODE_RND_MODE) {
3124             assert(inst->src[0].file == ELK_IMMEDIATE_VALUE);
3125             const elk_rnd_mode mode = (elk_rnd_mode) inst->src[0].d;
3126             if (mode == prev_mode) {
3127                inst->remove(block);
3128                progress = true;
3129             } else {
3130                prev_mode = mode;
3131             }
3132          }
3133       }
3134    }
3135 
3136    if (progress)
3137       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3138 
3139    return progress;
3140 }
3141 
3142 static void
3143 clear_deps_for_inst_src(elk_fs_inst *inst, bool *deps, int first_grf, int grf_len)
3144 {
3145    /* Clear the flag for registers that actually got read (as expected). */
3146    for (int i = 0; i < inst->sources; i++) {
3147       int grf;
3148       if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3149          grf = inst->src[i].nr;
3150       } else {
3151          continue;
3152       }
3153 
3154       if (grf >= first_grf &&
3155           grf < first_grf + grf_len) {
3156          deps[grf - first_grf] = false;
3157          if (inst->exec_size == 16)
3158             deps[grf - first_grf + 1] = false;
3159       }
3160    }
3161 }
3162 
3163 /**
3164  * Implements this workaround for the original 965:
3165  *
3166  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3167  *      check for post destination dependencies on this instruction, software
3168  *      must ensure that there is no destination hazard for the case of ‘write
3169  *      followed by a posted write’ shown in the following example.
3170  *
3171  *      1. mov r3 0
3172  *      2. send r3.xy <rest of send instruction>
3173  *      3. mov r2 r3
3174  *
3175  *      Due to no post-destination dependency check on the ‘send’, the above
3176  *      code sequence could have two instructions (1 and 2) in flight at the
3177  *      same time that both consider ‘r3’ as the target of their final writes."
3178  */
3179 void
3180 elk_fs_visitor::insert_gfx4_pre_send_dependency_workarounds(elk_bblock_t *block,
3181                                                         elk_fs_inst *inst)
3182 {
3183    int write_len = regs_written(inst);
3184    int first_write_grf = inst->dst.nr;
3185    bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3186    assert(write_len < (int)sizeof(needs_dep) - 1);
3187 
3188    memset(needs_dep, false, sizeof(needs_dep));
3189    memset(needs_dep, true, write_len);
3190 
3191    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3192 
3193    /* Walk backwards looking for writes to registers we're writing which
3194     * aren't read since being written.  If we hit the start of the program,
3195     * we assume that there are no outstanding dependencies on entry to the
3196     * program.
3197     */
3198    foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3199       /* If we hit control flow, assume that there *are* outstanding
3200        * dependencies, and force their cleanup before our instruction.
3201        */
3202       if (block->start() == scan_inst && block->num != 0) {
3203          for (int i = 0; i < write_len; i++) {
3204             if (needs_dep[i])
3205                DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3206                                first_write_grf + i);
3207          }
3208          return;
3209       }
3210 
3211       /* We insert our reads as late as possible on the assumption that any
3212        * instruction but a MOV that might have left us an outstanding
3213        * dependency has more latency than a MOV.
3214        */
3215       if (scan_inst->dst.file == VGRF) {
3216          for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3217             int reg = scan_inst->dst.nr + i;
3218 
3219             if (reg >= first_write_grf &&
3220                 reg < first_write_grf + write_len &&
3221                 needs_dep[reg - first_write_grf]) {
3222                DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3223                needs_dep[reg - first_write_grf] = false;
3224                if (scan_inst->exec_size == 16)
3225                   needs_dep[reg - first_write_grf + 1] = false;
3226             }
3227          }
3228       }
3229 
3230       /* Clear the flag for registers that actually got read (as expected). */
3231       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3232 
3233       /* Continue the loop only if we haven't resolved all the dependencies */
3234       int i;
3235       for (i = 0; i < write_len; i++) {
3236          if (needs_dep[i])
3237             break;
3238       }
3239       if (i == write_len)
3240          return;
3241    }
3242 }
3243 
3244 /**
3245  * Implements this workaround for the original 965:
3246  *
3247  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3248  *      used as a destination register until after it has been sourced by an
3249  *      instruction with a different destination register."
3250  */
3251 void
3252 elk_fs_visitor::insert_gfx4_post_send_dependency_workarounds(elk_bblock_t *block, elk_fs_inst *inst)
3253 {
3254    int write_len = regs_written(inst);
3255    unsigned first_write_grf = inst->dst.nr;
3256    bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3257    assert(write_len < (int)sizeof(needs_dep) - 1);
3258 
3259    memset(needs_dep, false, sizeof(needs_dep));
3260    memset(needs_dep, true, write_len);
3261    /* Walk forwards looking for writes to registers we're writing which aren't
3262     * read before being written.
3263     */
3264    foreach_inst_in_block_starting_from(elk_fs_inst, scan_inst, inst) {
3265       /* If we hit control flow, force resolve all remaining dependencies. */
3266       if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3267          for (int i = 0; i < write_len; i++) {
3268             if (needs_dep[i])
3269                DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3270                                first_write_grf + i);
3271          }
3272          return;
3273       }
3274 
3275       /* Clear the flag for registers that actually got read (as expected). */
3276       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3277 
3278       /* We insert our reads as late as possible since they're reading the
3279        * result of a SEND, which has massive latency.
3280        */
3281       if (scan_inst->dst.file == VGRF &&
3282           scan_inst->dst.nr >= first_write_grf &&
3283           scan_inst->dst.nr < first_write_grf + write_len &&
3284           needs_dep[scan_inst->dst.nr - first_write_grf]) {
3285          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3286                          scan_inst->dst.nr);
3287          needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3288       }
3289 
3290       /* Continue the loop only if we haven't resolved all the dependencies */
3291       int i;
3292       for (i = 0; i < write_len; i++) {
3293          if (needs_dep[i])
3294             break;
3295       }
3296       if (i == write_len)
3297          return;
3298    }
3299 }
3300 
3301 void
3302 elk_fs_visitor::insert_gfx4_send_dependency_workarounds()
3303 {
3304    if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
3305       return;
3306 
3307    bool progress = false;
3308 
3309    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
3310       if (inst->mlen != 0 && inst->dst.file == VGRF) {
3311          insert_gfx4_pre_send_dependency_workarounds(block, inst);
3312          insert_gfx4_post_send_dependency_workarounds(block, inst);
3313          progress = true;
3314       }
3315    }
3316 
3317    if (progress)
3318       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3319 }
3320 
3321 bool
3322 elk_fs_visitor::lower_load_payload()
3323 {
3324    bool progress = false;
3325 
3326    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3327       if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
3328          continue;
3329 
3330       assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3331       assert(inst->saturate == false);
3332       elk_fs_reg dst = inst->dst;
3333 
3334       /* Get rid of COMPR4.  We'll add it back in if we need it */
3335       if (dst.file == MRF)
3336          dst.nr = dst.nr & ~ELK_MRF_COMPR4;
3337 
3338       const fs_builder ibld(this, block, inst);
3339       const fs_builder ubld = ibld.exec_all();
3340 
3341       for (uint8_t i = 0; i < inst->header_size;) {
3342          /* Number of header GRFs to initialize at once with a single MOV
3343           * instruction.
3344           */
3345          const unsigned n =
3346             (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3347              inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3348             2 : 1;
3349 
3350          if (inst->src[i].file != BAD_FILE)
3351             ubld.group(8 * n, 0).MOV(retype(dst, ELK_REGISTER_TYPE_UD),
3352                                      retype(inst->src[i], ELK_REGISTER_TYPE_UD));
3353 
3354          dst = byte_offset(dst, n * REG_SIZE);
3355          i += n;
3356       }
3357 
3358       if (inst->dst.file == MRF && (inst->dst.nr & ELK_MRF_COMPR4) &&
3359           inst->exec_size > 8) {
3360          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3361           * a straightforward copy.  Instead, the result of the
3362           * LOAD_PAYLOAD is treated as interleaved and the first four
3363           * non-header sources are unpacked as:
3364           *
3365           * m + 0: r0
3366           * m + 1: g0
3367           * m + 2: b0
3368           * m + 3: a0
3369           * m + 4: r1
3370           * m + 5: g1
3371           * m + 6: b1
3372           * m + 7: a1
3373           *
3374           * This is used for gen <= 5 fb writes.
3375           */
3376          assert(inst->exec_size == 16);
3377          assert(inst->header_size + 4 <= inst->sources);
3378          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3379             if (inst->src[i].file != BAD_FILE) {
3380                if (devinfo->has_compr4) {
3381                   elk_fs_reg compr4_dst = retype(dst, inst->src[i].type);
3382                   compr4_dst.nr |= ELK_MRF_COMPR4;
3383                   ibld.MOV(compr4_dst, inst->src[i]);
3384                } else {
3385                   /* Platform doesn't have COMPR4.  We have to fake it */
3386                   elk_fs_reg mov_dst = retype(dst, inst->src[i].type);
3387                   ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
3388                   mov_dst.nr += 4;
3389                   ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
3390                }
3391             }
3392 
3393             dst.nr++;
3394          }
3395 
3396          /* The loop above only ever incremented us through the first set
3397           * of 4 registers.  However, thanks to the magic of COMPR4, we
3398           * actually wrote to the first 8 registers, so we need to take
3399           * that into account now.
3400           */
3401          dst.nr += 4;
3402 
3403          /* The COMPR4 code took care of the first 4 sources.  We'll let
3404           * the regular path handle any remaining sources.  Yes, we are
3405           * modifying the instruction but we're about to delete it so
3406           * this really doesn't hurt anything.
3407           */
3408          inst->header_size += 4;
3409       }
3410 
3411       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3412          dst.type = inst->src[i].type;
3413          if (inst->src[i].file != BAD_FILE) {
3414             ibld.MOV(dst, inst->src[i]);
3415          }
3416          dst = offset(dst, ibld, 1);
3417       }
3418 
3419       inst->remove(block);
3420       progress = true;
3421    }
3422 
3423    if (progress)
3424       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3425 
3426    return progress;
3427 }
3428 
3429 /**
3430  * Factor an unsigned 32-bit integer.
3431  *
3432  * Attempts to factor \c x into two values that are at most 0xFFFF.  If no
3433  * such factorization is possible, either because the value is too large or is
3434  * prime, both \c result_a and \c result_b will be zero.
3435  */
3436 static void
3437 factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
3438 {
3439    /* This is necessary to prevent various opportunities for division by zero
3440     * below.
3441     */
3442    assert(x > 0xffff);
3443 
3444    /* This represents the actual expected constraints on the input.  Namely,
3445     * both the upper and lower words should be > 1.
3446     */
3447    assert(x >= 0x00020002);
3448 
3449    *result_a = 0;
3450    *result_b = 0;
3451 
3452    /* The value is too large to factor with the constraints. */
3453    if (x > (0xffffu * 0xffffu))
3454       return;
3455 
3456    /* A non-prime number will have the form p*q*d where p is some prime
3457     * number, q > 1, and 1 <= d <= q.  To meet the constraints of this
3458     * function, (p*d) < 0x10000.  This implies d <= floor(0xffff / p).
3459     * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)).  Finally,
3460     * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
3461     *
3462     * The observation is that finding the largest possible value of p reduces the
3463     * possible range of d.  After selecting p, all values of d in this range
3464     * are tested until a factorization is found.  The size of the range of
3465     * possible values of d sets an upper bound on the run time of the
3466     * function.
3467     */
3468    static const uint16_t primes[256] = {
3469          2,    3,    5,    7,   11,   13,   17,   19,
3470         23,   29,   31,   37,   41,   43,   47,   53,
3471         59,   61,   67,   71,   73,   79,   83,   89,
3472         97,  101,  103,  107,  109,  113,  127,  131,  /*  32 */
3473        137,  139,  149,  151,  157,  163,  167,  173,
3474        179,  181,  191,  193,  197,  199,  211,  223,
3475        227,  229,  233,  239,  241,  251,  257,  263,
3476        269,  271,  277,  281,  283,  293,  307,  311,  /*  64 */
3477        313,  317,  331,  337,  347,  349,  353,  359,
3478        367,  373,  379,  383,  389,  397,  401,  409,
3479        419,  421,  431,  433,  439,  443,  449,  457,
3480        461,  463,  467,  479,  487,  491,  499,  503,  /*  96 */
3481        509,  521,  523,  541,  547,  557,  563,  569,
3482        571,  577,  587,  593,  599,  601,  607,  613,
3483        617,  619,  631,  641,  643,  647,  653,  659,
3484        661,  673,  677,  683,  691,  701,  709,  719,   /* 128 */
3485        727,  733,  739,  743,  751,  757,  761,  769,
3486        773,  787,  797,  809,  811,  821,  823,  827,
3487        829,  839,  853,  857,  859,  863,  877,  881,
3488        883,  887,  907,  911,  919,  929,  937,  941,  /* 160 */
3489        947,  953,  967,  971,  977,  983,  991,  997,
3490       1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
3491       1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
3492       1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,  /* 192 */
3493       1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
3494       1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
3495       1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
3496       1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,  /* 224 */
3497       1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
3498       1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
3499       1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
3500       1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,  /* 256 */
3501    };
3502 
3503    unsigned p;
3504    unsigned x_div_p;
3505 
3506    for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
3507       p = primes[i];
3508       x_div_p = x / p;
3509 
3510       if ((x_div_p * p) == x)
3511          break;
3512    }
3513 
3514    /* A prime factor was not found. */
3515    if (x_div_p * p != x)
3516       return;
3517 
3518    /* Terminate early if d=1 is a solution. */
3519    if (x_div_p < 0x10000) {
3520       *result_a = x_div_p;
3521       *result_b = p;
3522       return;
3523    }
3524 
3525    /* Pick the maximum possible value for 'd'.  It's important that the loop
3526     * below execute while d <= max_d because max_d is a valid value.  Having
3527     * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
3528     * incorrectly reported as not being factorable.  The problem would occur
3529     * with any value that is the product of two primes in the table and one
3530     * prime not in the table.
3531     */
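   /* For example, x = 0x063b0c83 = 1367 * 1627 * 47: the largest prime in
    * the table dividing x is p = 1367, so x_div_p = 76469 and
    * max_d = 0xffff / 1367 = 47.  The scan below starts at d = 2 and only
    * succeeds at d = max_d = 47, yielding result_a = 1627 and
    * result_b = 1367 * 47 = 64249, both of which fit in 16 bits.
    */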
3532    const unsigned max_d = 0xffff / p;
3533 
3534    /* Pick an initial value of 'd' that (combined with rejecting too large
3535     * values above) guarantees that 'q' will always be small enough.
3536     * DIV_ROUND_UP is used to prevent 'd' from being zero.
3537     */
3538    for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
3539       unsigned q = x_div_p / d;
3540 
3541       if ((q * d) == x_div_p) {
3542          assert(p * d * q == x);
3543          assert((p * d) < 0x10000);
3544 
3545          *result_a = q;
3546          *result_b = p * d;
3547          break;
3548       }
3549 
3550       /* Since every value of 'd' is tried, as soon as 'd' is larger
3551        * than 'q', we're just re-testing combinations that have
3552        * already been tested.
3553        */
3554       if (d > q)
3555          break;
3556    }
3557 }
3558 
3559 void
3560 elk_fs_visitor::lower_mul_dword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3561 {
3562    const fs_builder ibld(this, block, inst);
3563 
3564    /* It is correct to use inst->src[1].d in both end of the comparison.
3565     * Using .ud in the UINT16_MAX comparison would cause any negative value to
3566     * fail the check.
3567     */
3568    if (inst->src[1].file == IMM &&
3569        (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
3570       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3571        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3572        * src1 are used.
3573        *
3574        * If multiplying by an immediate value that fits in 16-bits, do a
3575        * single MUL instruction with that value in the proper location.
3576        */
3577       const bool ud = (inst->src[1].d >= 0);
3578       if (devinfo->ver < 7) {
3579          elk_fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
3580          ibld.MOV(imm, inst->src[1]);
3581          ibld.MUL(inst->dst, imm, inst->src[0]);
3582       } else {
3583          ibld.MUL(inst->dst, inst->src[0],
3584                   ud ? elk_imm_uw(inst->src[1].ud)
3585                      : elk_imm_w(inst->src[1].d));
3586       }
3587    } else {
3588       /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
3589        * do 32-bit integer multiplication in one instruction, but instead
3590        * must do a sequence (which actually calculates a 64-bit result):
3591        *
3592        *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3593        *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3594        *    mov(8)  g2<1>D     acc0<8,8,1>D
3595        *
3596        * But on Gen > 6, the ability to use the second accumulator register
3597        * (acc1) for non-float data types was removed, preventing a simple
3598        * implementation in SIMD16. A 16-channel result can be calculated by
3599        * executing the three instructions twice in SIMD8, once with quarter
3600        * control of 1Q for the first eight channels and again with 2Q for
3601        * the second eight channels.
3602        *
3603        * Which accumulator register is implicitly accessed (by AccWrEnable
3604        * for instance) is determined by the quarter control. Unfortunately
3605        * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3606        * implicit accumulator access by an instruction with 2Q will access
3607        * acc1 regardless of whether the data type is usable in acc1.
3608        *
3609        * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3610        * integer data types.
3611        *
3612        * Since we only want the low 32-bits of the result, we can do two
3613        * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3614        * adjust the high result and add them (like the mach is doing):
3615        *
3616        *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3617        *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3618        *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3619        *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3620        *
3621        * We avoid the shl instruction by realizing that we only want to add
3622        * the low 16-bits of the "high" result to the high 16-bits of the
3623        * "low" result and using proper regioning on the add:
3624        *
3625        *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3626        *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3627        *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3628        *
3629        * Since it does not use the (single) accumulator register, we can
3630        * schedule multi-component multiplications much better.
3631        */
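      /* Put differently: splitting one operand as (high << 16) + low, the low
       * 32 bits of the product are (other * low + ((other * high) << 16))
       * mod 2^32, so only the low 16 bits of other * high can affect the
       * result.  That is what the strided UW ADD emitted below implements.
       */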
3632 
3633       bool needs_mov = false;
3634       elk_fs_reg orig_dst = inst->dst;
3635 
3636       /* Get a new VGRF for the "low" 32x16-bit multiplication result if
3637        * reusing the original destination is impossible due to hardware
3638        * restrictions, source/destination overlap, or it being the null
3639        * register.
3640        */
3641       elk_fs_reg low = inst->dst;
3642       if (orig_dst.is_null() || orig_dst.file == MRF ||
3643           regions_overlap(inst->dst, inst->size_written,
3644                           inst->src[0], inst->size_read(0)) ||
3645           regions_overlap(inst->dst, inst->size_written,
3646                           inst->src[1], inst->size_read(1)) ||
3647           inst->dst.stride >= 4) {
3648          needs_mov = true;
3649          low = elk_fs_reg(VGRF, alloc.allocate(regs_written(inst)),
3650                       inst->dst.type);
3651       }
3652 
3653       /* Get a new VGRF but keep the same stride as inst->dst */
3654       elk_fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
3655       high.stride = inst->dst.stride;
3656       high.offset = inst->dst.offset % REG_SIZE;
3657 
3658       bool do_addition = true;
3659       if (devinfo->ver >= 7) {
3660          if (inst->src[1].abs)
3661             lower_src_modifiers(this, block, inst, 1);
3662 
3663          if (inst->src[1].file == IMM) {
3664             unsigned a;
3665             unsigned b;
3666 
3667             /* If the immediate value can be factored into two values, A and
3668              * B, that each fit in 16-bits, the multiplication result can
3669              * instead be calculated as (src0 * (A * B)) = ((src0 * A) * B).
3670              * This saves an operation (the addition) and a temporary register
3671              * (high).
3672              *
3673              * Skip the optimization if either the high word or the low word
3674              * is 0 or 1.  In these conditions, at least one of the
3675              * multiplications generated by the straightforward method will be
3676              * eliminated anyway.
3677              */
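            /* For example, an immediate of 300000 factors as 60000 * 5, so
             * the product can be computed with two 16-bit-immediate MULs and
             * no ADD or temporary register.
             */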
3678             if (inst->src[1].ud > 0x0001ffff &&
3679                 (inst->src[1].ud & 0xffff) > 1) {
3680                factor_uint32(inst->src[1].ud, &a, &b);
3681 
3682                if (a != 0) {
3683                   ibld.MUL(low, inst->src[0], elk_imm_uw(a));
3684                   ibld.MUL(low, low, elk_imm_uw(b));
3685                   do_addition = false;
3686                }
3687             }
3688 
3689             if (do_addition) {
3690                ibld.MUL(low, inst->src[0],
3691                         elk_imm_uw(inst->src[1].ud & 0xffff));
3692                ibld.MUL(high, inst->src[0],
3693                         elk_imm_uw(inst->src[1].ud >> 16));
3694             }
3695          } else {
3696             ibld.MUL(low, inst->src[0],
3697                      subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3698             ibld.MUL(high, inst->src[0],
3699                      subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 1));
3700          }
3701       } else {
3702          if (inst->src[0].abs)
3703             lower_src_modifiers(this, block, inst, 0);
3704 
3705          ibld.MUL(low, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 0),
3706                   inst->src[1]);
3707          ibld.MUL(high, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 1),
3708                   inst->src[1]);
3709       }
3710 
3711       if (do_addition) {
3712          ibld.ADD(subscript(low, ELK_REGISTER_TYPE_UW, 1),
3713                   subscript(low, ELK_REGISTER_TYPE_UW, 1),
3714                   subscript(high, ELK_REGISTER_TYPE_UW, 0));
3715       }
3716 
3717       if (needs_mov || inst->conditional_mod)
3718          set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
3719    }
3720 }
3721 
3722 void
3723 elk_fs_visitor::lower_mul_qword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3724 {
3725    const fs_builder ibld(this, block, inst);
3726 
3727    /* Considering two 64-bit integers ab and cd where each letter        ab
3728     * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
3729     * only need to provide the YZ part of the result.               -------
3730     *                                                                    BD
3731     *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
3732     *  about the lower 32 bits (since they are part of the upper     +  BC
3733     *  32 bits of our result). AC is not needed since it starts      + AC
3734     *  on the 65th bit of the result.                               -------
3735     *                                                                  WXYZ
3736     */
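   /* Algebraically, with ab = (a << 32) | b and cd = (c << 32) | d, the low
    * 64 bits of the product are b*d + ((a*d + b*c) << 32), which is exactly
    * BD + ((AD + BC) << 32) as computed below.
    */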
3737    unsigned int q_regs = regs_written(inst);
3738    unsigned int d_regs = (q_regs + 1) / 2;
3739 
3740    elk_fs_reg bd(VGRF, alloc.allocate(q_regs), ELK_REGISTER_TYPE_UQ);
3741    elk_fs_reg ad(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3742    elk_fs_reg bc(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3743 
3744    /* Here we need the full 64 bit result for 32b * 32b. */
3745    if (devinfo->has_integer_dword_mul) {
3746       ibld.MUL(bd, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3747                subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3748    } else {
3749       elk_fs_reg bd_high(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3750       elk_fs_reg bd_low(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3751       const unsigned acc_width = reg_unit(devinfo) * 8;
3752       elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), ELK_REGISTER_TYPE_UD),
3753                              inst->group % acc_width);
3754 
3755       elk_fs_inst *mul = ibld.MUL(acc,
3756                             subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3757                             subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3758       mul->writes_accumulator = true;
3759 
3760       ibld.MACH(bd_high, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3761                 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3762       ibld.MOV(bd_low, acc);
3763 
3764       ibld.UNDEF(bd);
3765       ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 0), bd_low);
3766       ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 1), bd_high);
3767    }
3768 
3769    ibld.MUL(ad, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
3770             subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3771    ibld.MUL(bc, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3772             subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1));
3773 
3774    ibld.ADD(ad, ad, bc);
3775    ibld.ADD(subscript(bd, ELK_REGISTER_TYPE_UD, 1),
3776             subscript(bd, ELK_REGISTER_TYPE_UD, 1), ad);
3777 
3778    if (devinfo->has_64bit_int) {
3779       ibld.MOV(inst->dst, bd);
3780    } else {
3781       if (!inst->is_partial_write())
3782          ibld.emit_undef_for_dst(inst);
3783       ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
3784                subscript(bd, ELK_REGISTER_TYPE_UD, 0));
3785       ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
3786                subscript(bd, ELK_REGISTER_TYPE_UD, 1));
3787    }
3788 }
3789 
3790 void
3791 elk_fs_visitor::lower_mulh_inst(elk_fs_inst *inst, elk_bblock_t *block)
3792 {
3793    const fs_builder ibld(this, block, inst);
3794 
3795    /* According to the BDW+ BSpec page for the "Multiply Accumulate
3796     * High" instruction:
3797     *
3798     *  "An added preliminary mov is required for source modification on
3799     *   src1:
3800     *      mov (8) r3.0<1>:d -r3<8;8,1>:d
3801     *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
3802     *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
3803     */
3804    if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
3805       lower_src_modifiers(this, block, inst, 1);
3806 
3807    /* Should have been lowered to 8-wide. */
3808    assert(inst->exec_size <= get_lowered_simd_width(this, inst));
3809    const unsigned acc_width = reg_unit(devinfo) * 8;
3810    const elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), inst->dst.type),
3811                                 inst->group % acc_width);
3812    elk_fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3813    elk_fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3814 
3815    if (devinfo->ver >= 8) {
3816       /* Until Gfx8, integer multiplies read 32-bits from one source, and
3817        * 16-bits from the other, relying on the MACH instruction to
3818        * generate the high bits of the result.
3819        *
3820        * On Gfx8, the multiply instruction does a full 32x32-bit
3821        * multiply, but in order to do a 64-bit multiply we can simulate
3822        * the previous behavior and then use a MACH instruction.
3823        */
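      /* Retyping src1 to UW with doubled stride makes the MUL read only the
       * low 16 bits of each 32-bit channel, recreating the pre-Gfx8 32x16
       * multiply that the following MACH expects.
       */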
3824       assert(mul->src[1].type == ELK_REGISTER_TYPE_D ||
3825              mul->src[1].type == ELK_REGISTER_TYPE_UD);
3826       mul->src[1].type = ELK_REGISTER_TYPE_UW;
3827       mul->src[1].stride *= 2;
3828 
3829       if (mul->src[1].file == IMM) {
3830          mul->src[1] = elk_imm_uw(mul->src[1].ud);
3831       }
3832    } else if (devinfo->verx10 == 70 &&
3833               inst->group > 0) {
3834       /* Among other things the quarter control bits influence which
3835        * accumulator register is used by the hardware for instructions
3836        * that access the accumulator implicitly (e.g. MACH).  A
3837        * second-half instruction would normally map to acc1, which
3838        * doesn't exist on Gfx7 and up (the hardware does emulate it for
3839        * floating-point instructions *only* by taking advantage of the
3840        * extra precision of acc0 not normally used for floating point
3841        * arithmetic).
3842        *
3843        * HSW and up are careful enough not to try to access an
3844        * accumulator register that doesn't exist, but on earlier Gfx7
3845        * hardware we need to make sure that the quarter control bits are
3846        * zero to avoid non-deterministic behaviour and emit an extra MOV
3847        * to get the result masked correctly according to the current
3848        * channel enables.
3849        */
3850       mach->group = 0;
3851       mach->force_writemask_all = true;
3852       mach->dst = ibld.vgrf(inst->dst.type);
3853       ibld.MOV(inst->dst, mach->dst);
3854    }
3855 }
3856 
3857 bool
3858 elk_fs_visitor::lower_integer_multiplication()
3859 {
3860    bool progress = false;
3861 
3862    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3863       if (inst->opcode == ELK_OPCODE_MUL) {
3864          /* If the instruction is already in a form that does not need lowering,
3865           * skip it.
3866           */
3867          if (devinfo->ver >= 7) {
3868             if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
3869                continue;
3870          } else {
3871             if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
3872                continue;
3873          }
3874 
3875          if ((inst->dst.type == ELK_REGISTER_TYPE_Q ||
3876               inst->dst.type == ELK_REGISTER_TYPE_UQ) &&
3877              (inst->src[0].type == ELK_REGISTER_TYPE_Q ||
3878               inst->src[0].type == ELK_REGISTER_TYPE_UQ) &&
3879              (inst->src[1].type == ELK_REGISTER_TYPE_Q ||
3880               inst->src[1].type == ELK_REGISTER_TYPE_UQ)) {
3881             lower_mul_qword_inst(inst, block);
3882             inst->remove(block);
3883             progress = true;
3884          } else if (!inst->dst.is_accumulator() &&
3885                     (inst->dst.type == ELK_REGISTER_TYPE_D ||
3886                      inst->dst.type == ELK_REGISTER_TYPE_UD) &&
3887                     !devinfo->has_integer_dword_mul) {
3888             lower_mul_dword_inst(inst, block);
3889             inst->remove(block);
3890             progress = true;
3891          }
3892       } else if (inst->opcode == ELK_SHADER_OPCODE_MULH) {
3893          lower_mulh_inst(inst, block);
3894          inst->remove(block);
3895          progress = true;
3896       }
3897 
3898    }
3899 
3900    if (progress)
3901       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3902 
3903    return progress;
3904 }
3905 
3906 bool
3907 elk_fs_visitor::lower_minmax()
3908 {
3909    assert(devinfo->ver < 6);
3910 
3911    bool progress = false;
3912 
3913    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3914       const fs_builder ibld(this, block, inst);
3915 
3916       if (inst->opcode == ELK_OPCODE_SEL &&
3917           inst->predicate == ELK_PREDICATE_NONE) {
3918          /* If src1 is an immediate value that is not NaN, then it can't be
3919           * NaN.  In that case, emit CMP because it is much better for cmod
3920           * propagation.  Likewise if src1 is not float.  Gfx4 and Gfx5 don't
3921           * support HF or DF, so it is not necessary to check for those.
3922           */
3923          if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
3924              (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
3925             ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
3926                      inst->conditional_mod);
3927          } else {
3928             ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
3929                       inst->conditional_mod);
3930          }
3931          inst->predicate = ELK_PREDICATE_NORMAL;
3932          inst->conditional_mod = ELK_CONDITIONAL_NONE;
3933 
3934          progress = true;
3935       }
3936    }
3937 
3938    if (progress)
3939       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3940 
3941    return progress;
3942 }
3943 
3944 bool
3945 elk_fs_visitor::lower_sub_sat()
3946 {
3947    bool progress = false;
3948 
3949    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3950       const fs_builder ibld(this, block, inst);
3951 
3952       if (inst->opcode == ELK_SHADER_OPCODE_USUB_SAT ||
3953           inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
3954          /* The fundamental problem is the hardware performs source negation
3955           * at the bit width of the source.  If the source is 0x80000000D, the
3956           * negation is 0x80000000D.  As a result, subtractSaturate(0,
3957           * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
3958           * are at least three ways to resolve this:
3959           *
3960           * 1. Use the accumulator for the negated source.  The accumulator is
3961           *    33 bits, so our source 0x80000000 is sign-extended to
3962           *    0x1800000000.  The negation of which is 0x080000000.  This
3963           *    doesn't help for 64-bit integers (which are already bigger than
3964           *    33 bits).  There are also only 8 accumulators, so SIMD16 or
3965           *    SIMD32 instructions would have to be split into multiple SIMD8
3966           *    instructions.
3967           *
3968           * 2. Use slightly different math.  For any n-bit value x, negating
3969           *    (x >> 1) cannot overflow.  We can use this fact to only do
3970           *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
3971           *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
3972           *
3973           * 3. For unsigned sources, it is sufficient to replace the
3974           *    subtractSaturate with (a > b) ? a - b : 0.
3975           *
3976           * It may also be possible to use the SUBB instruction.  This
3977           * implicitly writes the accumulator, so it could only be used in the
3978           * same situations as #1 above.  It is further limited by only
3979           * allowing UD sources.
3980           */
3981          if (inst->exec_size == 8 && inst->src[0].type != ELK_REGISTER_TYPE_Q &&
3982              inst->src[0].type != ELK_REGISTER_TYPE_UQ) {
3983             elk_fs_reg acc(ARF, ELK_ARF_ACCUMULATOR, inst->src[1].type);
3984 
3985             ibld.MOV(acc, inst->src[1]);
3986             elk_fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
3987             add->saturate = true;
3988             add->src[0].negate = true;
3989          } else if (inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
3990             /* tmp = src1 >> 1;
3991              * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
3992              */
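            /* For example, with src0 = 0 and src1 = 0x80000000: tmp is
             * 0xc0000000, src1 - tmp is 0xc0000000, and the sequence becomes
             * add.sat(add.sat(0, 0x40000000), 0x40000000), which saturates to
             * 0x7fffffff as required.
             */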
3993             elk_fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
3994             elk_fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
3995             elk_fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
3996             elk_fs_inst *add;
3997 
3998             ibld.SHR(tmp1, inst->src[1], elk_imm_d(1));
3999 
4000             add = ibld.ADD(tmp2, inst->src[1], tmp1);
4001             add->src[1].negate = true;
4002 
4003             add = ibld.ADD(tmp3, inst->src[0], tmp1);
4004             add->src[1].negate = true;
4005             add->saturate = true;
4006 
4007             add = ibld.ADD(inst->dst, tmp3, tmp2);
4008             add->src[1].negate = true;
4009             add->saturate = true;
4010          } else {
4011             /* a > b ? a - b : 0 */
4012             ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4013                      ELK_CONDITIONAL_G);
4014 
4015             elk_fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4016             add->src[1].negate = !add->src[1].negate;
4017 
4018             ibld.SEL(inst->dst, inst->dst, elk_imm_ud(0))
4019                ->predicate = ELK_PREDICATE_NORMAL;
4020          }
4021 
4022          inst->remove(block);
4023          progress = true;
4024       }
4025    }
4026 
4027    if (progress)
4028       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4029 
4030    return progress;
4031 }
4032 
4033 /**
4034  * Get the mask of SIMD channels enabled during dispatch and not yet disabled
4035  * by discard.  Due to the layout of the sample mask in the fragment shader
4036  * thread payload, \p bld is required to have a dispatch_width() not greater
4037  * than 16 for fragment shaders.
4038  */
4039 elk_fs_reg
4040 elk_sample_mask_reg(const fs_builder &bld)
4041 {
4042    const elk_fs_visitor &s = *bld.shader;
4043 
4044    if (s.stage != MESA_SHADER_FRAGMENT) {
4045       return elk_imm_ud(0xffffffff);
4046    } else if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4047       assert(bld.dispatch_width() <= 16);
4048       return elk_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
4049    } else {
4050       assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4051       return retype(elk_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4052                     ELK_REGISTER_TYPE_UW);
4053    }
4054 }
4055 
4056 uint32_t
4057 elk_fb_write_msg_control(const elk_fs_inst *inst,
4058                          const struct elk_wm_prog_data *prog_data)
4059 {
4060    uint32_t mctl;
4061 
4062    if (inst->opcode == ELK_FS_OPCODE_REP_FB_WRITE) {
4063       assert(inst->group == 0 && inst->exec_size == 16);
4064       mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4065    } else if (prog_data->dual_src_blend) {
4066       assert(inst->exec_size == 8);
4067 
4068       if (inst->group % 16 == 0)
4069          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4070       else if (inst->group % 16 == 8)
4071          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4072       else
4073          unreachable("Invalid dual-source FB write instruction group");
4074    } else {
4075       assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4076 
4077       if (inst->exec_size == 16)
4078          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4079       else if (inst->exec_size == 8)
4080          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4081       else
4082          unreachable("Invalid FB write execution size");
4083    }
4084 
4085    return mctl;
4086 }
4087 
4088 /**
4089  * Predicate the specified instruction on the sample mask.
4090  */
4091 void
4092 elk_emit_predicate_on_sample_mask(const fs_builder &bld, elk_fs_inst *inst)
4093 {
4094    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
4095           bld.group() == inst->group &&
4096           bld.dispatch_width() == inst->exec_size);
4097 
4098    const elk_fs_visitor &s = *bld.shader;
4099    const elk_fs_reg sample_mask = elk_sample_mask_reg(bld);
4100    const unsigned subreg = sample_mask_flag_subreg(s);
4101 
4102    if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4103       assert(sample_mask.file == ARF &&
4104              sample_mask.nr == elk_flag_subreg(subreg).nr &&
4105              sample_mask.subnr == elk_flag_subreg(
4106                 subreg + inst->group / 16).subnr);
4107    } else {
4108       bld.group(1, 0).exec_all()
4109          .MOV(elk_flag_subreg(subreg + inst->group / 16), sample_mask);
4110    }
4111 
4112    if (inst->predicate) {
4113       assert(inst->predicate == ELK_PREDICATE_NORMAL);
4114       assert(!inst->predicate_inverse);
4115       assert(inst->flag_subreg == 0);
4116       /* Combine the sample mask with the existing predicate by using a
4117        * vertical predication mode.
4118        */
4119       inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
4120    } else {
4121       inst->flag_subreg = subreg;
4122       inst->predicate = ELK_PREDICATE_NORMAL;
4123       inst->predicate_inverse = false;
4124    }
4125 }
4126 
4127 static bool
4128 is_mixed_float_with_fp32_dst(const elk_fs_inst *inst)
4129 {
4130    /* This opcode sometimes uses :W type on the source even if the operand is
4131     * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
4132     */
4133    if (inst->opcode == ELK_OPCODE_F16TO32)
4134       return true;
4135 
4136    if (inst->dst.type != ELK_REGISTER_TYPE_F)
4137       return false;
4138 
4139    for (int i = 0; i < inst->sources; i++) {
4140       if (inst->src[i].type == ELK_REGISTER_TYPE_HF)
4141          return true;
4142    }
4143 
4144    return false;
4145 }
4146 
4147 static bool
4148 is_mixed_float_with_packed_fp16_dst(const elk_fs_inst *inst)
4149 {
4150    /* This opcode sometimes uses :W type on the destination even if the
4151     * destination is a :HF, because in gfx7 there is no support for :HF, and
4152     * thus it uses :W.
4153     */
4154    if (inst->opcode == ELK_OPCODE_F32TO16 &&
4155        inst->dst.stride == 1)
4156       return true;
4157 
4158    if (inst->dst.type != ELK_REGISTER_TYPE_HF ||
4159        inst->dst.stride != 1)
4160       return false;
4161 
4162    for (int i = 0; i < inst->sources; i++) {
4163       if (inst->src[i].type == ELK_REGISTER_TYPE_F)
4164          return true;
4165    }
4166 
4167    return false;
4168 }
4169 
4170 /**
4171  * Get the closest allowed SIMD width for instruction \p inst accounting for
4172  * some common regioning and execution control restrictions that apply to FPU
4173  * instructions.  These restrictions don't necessarily have any relevance to
4174  * instructions not executed by the FPU pipeline like extended math, control
4175  * flow or send message instructions.
4176  *
4177  * For virtual opcodes it's really up to the instruction -- In some cases
4178  * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
4179  * instructions) it may simplify virtual instruction lowering if we can
4180  * enforce FPU-like regioning restrictions already on the virtual instruction,
4181  * in other cases (e.g. virtual send-like instructions) this may be
4182  * excessively restrictive.
4183  */
4184 static unsigned
4185 get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
4186                            const elk_fs_inst *inst)
4187 {
4188    const struct elk_compiler *compiler = shader->compiler;
4189    const struct intel_device_info *devinfo = compiler->devinfo;
4190 
4191    /* Maximum execution size representable in the instruction controls. */
4192    unsigned max_width = MIN2(32, inst->exec_size);
4193 
4194    /* According to the PRMs:
4195     *  "A. In Direct Addressing mode, a source cannot span more than 2
4196     *      adjacent GRF registers.
4197     *   B. A destination cannot span more than 2 adjacent GRF registers."
4198     *
4199     * Look for the source or destination with the largest register region
4200     * which is the one that is going to limit the overall execution size of
4201     * the instruction due to this rule.
4202     */
4203    unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4204 
4205    for (unsigned i = 0; i < inst->sources; i++)
4206       reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
4207 
4208    /* Calculate the maximum execution size of the instruction based on the
4209     * factor by which it goes over the hardware limit of 2 GRFs.
4210     */
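   /* For instance, a SIMD16 instruction with a 64-bit-typed destination
    * writes four GRFs, so on hardware with a two-GRF limit it is restricted
    * to SIMD8 by the calculation below.
    */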
4211    const unsigned max_reg_count = 2 * reg_unit(devinfo);
4212    if (reg_count > max_reg_count)
4213       max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
4214 
4215    /* According to the IVB PRMs:
4216     *  "When destination spans two registers, the source MUST span two
4217     *   registers. The exception to the above rule:
4218     *
4219     *    - When source is scalar, the source registers are not incremented.
4220     *    - When source is packed integer Word and destination is packed
4221     *      integer DWord, the source register is not incremented but the
4222     *      source sub register is incremented."
4223     *
4224     * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
4225     * restrictions.  The code below intentionally doesn't check whether the
4226     * destination type is integer because empirically the hardware doesn't
4227     * seem to care what the actual type is as long as it's dword-aligned.
4228     *
4229     * HSW PRMs also add a note to the second exception:
4230     *  "When lower 8 channels are disabled, the sub register of source1
4231     *   operand is not incremented. If the lower 8 channels are expected
4232     *   to be disabled, say by predication, the instruction must be split
4233     *   into pair of simd8 operations."
4234     *
4235     * We can't reliably know if the channels won't be disabled due to,
4236     * for example, IMASK. So, play it safe and disallow packed-word exception
4237     * for src1.
4238     */
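   /* For instance, a SIMD16 instruction with a packed dword destination (two
    * GRFs) and a packed word source in src1 (one GRF) doesn't qualify for
    * either exception and gets limited to SIMD8 here.
    */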
4239    if (devinfo->ver < 8) {
4240       for (unsigned i = 0; i < inst->sources; i++) {
4241          /* IVB implements DF scalars as <0;2,1> regions. */
4242          const bool is_scalar_exception = is_uniform(inst->src[i]) &&
4243             (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
4244          const bool is_packed_word_exception = i != 1 &&
4245             type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
4246             type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
4247 
4248          /* We check size_read(i) against size_written instead of REG_SIZE
4249           * because we want to properly handle SIMD32.  In SIMD32, you can end
4250           * up with writes to 4 registers and a source that reads 2 registers
4251           * and we may still need to lower all the way to SIMD8 in that case.
4252           */
4253          if (inst->size_written > REG_SIZE &&
4254              inst->size_read(i) != 0 &&
4255              inst->size_read(i) < inst->size_written &&
4256              !is_scalar_exception && !is_packed_word_exception) {
4257             const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4258             max_width = MIN2(max_width, inst->exec_size / reg_count);
4259          }
4260       }
4261    }
4262 
4263    if (devinfo->ver < 6) {
4264       /* From the G45 PRM, Volume 4 Page 361:
4265        *
4266        *    "Operand Alignment Rule: With the exceptions listed below, a
4267        *     source/destination operand in general should be aligned to even
4268        *     256-bit physical register with a region size equal to two 256-bit
4269        *     physical registers."
4270        *
4271        * Normally we enforce this by allocating virtual registers to the
4272        * even-aligned class.  But we need to handle payload registers.
4273        */
4274       for (unsigned i = 0; i < inst->sources; i++) {
4275          if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
4276              inst->size_read(i) > REG_SIZE) {
4277             max_width = MIN2(max_width, 8);
4278          }
4279       }
4280    }
4281 
4282    /* From the IVB PRMs:
4283     *  "When an instruction is SIMD32, the low 16 bits of the execution mask
4284     *   are applied for both halves of the SIMD32 instruction. If different
4285     *   execution mask channels are required, split the instruction into two
4286     *   SIMD16 instructions."
4287     *
4288     * There is similar text in the HSW PRMs.  Gfx4-6 don't even implement
4289     * 32-wide control flow support in hardware and will behave similarly.
4290     */
4291    if (devinfo->ver < 8 && !inst->force_writemask_all)
4292       max_width = MIN2(max_width, 16);
4293 
4294    /* From the IVB PRMs (applies to HSW too):
4295     *  "Instructions with condition modifiers must not use SIMD32."
4296     *
4297     * From the BDW PRMs (applies to later hardware too):
4298     *  "Ternary instruction with condition modifiers must not use SIMD32."
4299     */
4300    if (inst->conditional_mod && (devinfo->ver < 8 ||
4301                                  inst->elk_is_3src(compiler)))
4302       max_width = MIN2(max_width, 16);
4303 
4304    /* From the IVB PRMs (applies to other devices that don't have the
4305     * intel_device_info::supports_simd16_3src flag set):
4306     *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
4307     *   SIMD8 is not allowed for DF operations."
4308     */
4309    if (inst->elk_is_3src(compiler) && !devinfo->supports_simd16_3src)
4310       max_width = MIN2(max_width, inst->exec_size / reg_count);
4311 
4312    /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
4313     * the 8-bit quarter of the execution mask signals specified in the
4314     * instruction control fields) for the second compressed half of any
4315     * single-precision instruction (for double-precision instructions
4316     * it's hardwired to use NibCtrl+1, at least on HSW), which means that
4317     * the EU will apply the wrong execution controls for the second
4318     * sequential GRF write if the number of channels per GRF is not exactly
4319     * eight in single-precision mode (or four in double-float mode).
4320     *
4321     * In this situation we calculate the maximum size of the split
4322     * instructions so they only ever write to a single register.
4323     */
4324    if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
4325        !inst->force_writemask_all) {
4326       const unsigned channels_per_grf = inst->exec_size /
4327          DIV_ROUND_UP(inst->size_written, REG_SIZE);
4328       const unsigned exec_type_size = get_exec_type_size(inst);
4329       assert(exec_type_size);
4330 
4331       /* The hardware shifts exactly 8 channels per compressed half of the
4332        * instruction in single-precision mode and exactly 4 in double-precision.
4333        */
4334       if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
4335          max_width = MIN2(max_width, channels_per_grf);
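      /* Illustrative example: on Gfx7 a SIMD16 instruction with a dword
       * destination of stride 2 spans 4 GRFs, so channels_per_grf is
       * 16 / 4 = 4 rather than the expected 8, and max_width is clamped to 4.
       */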
4336 
4337       /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
4338        * because HW applies the same channel enable signals to both halves of
4339        * the compressed instruction which will be just wrong under
4340        * non-uniform control flow.
4341        */
4342       if (devinfo->verx10 == 70 &&
4343           (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
4344          max_width = MIN2(max_width, 4);
4345    }
4346 
4347    /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4348     * Float Operations:
4349     *
4350     *    "No SIMD16 in mixed mode when destination is f32. Instruction
4351     *     execution size must be no more than 8."
4352     *
4353     * FIXME: the simulator doesn't seem to complain if we don't do this, and
4354     * empirical testing with existing CTS tests shows that they pass just fine
4355     * without implementing it.  However, since our interpretation of the PRM
4356     * is that conversion MOVs between HF and F are still mixed-float
4357     * instructions (and therefore subject to this restriction), we decided to
4358     * split them to be safe.  It might be useful to investigate further and
4359     * lift the restriction if we can ensure that doing so is safe, since these
4360     * conversions are common when half-float types are involved: many
4361     * instructions do not support HF types, so conversions from/to F are
4362     * required.
4363     */
4364    if (is_mixed_float_with_fp32_dst(inst))
4365       max_width = MIN2(max_width, 8);
4366 
4367    /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4368     * Float Operations:
4369     *
4370     *    "No SIMD16 in mixed mode when destination is packed f16 for both
4371     *     Align1 and Align16."
4372     */
4373    if (is_mixed_float_with_packed_fp16_dst(inst))
4374       max_width = MIN2(max_width, 8);
4375 
4376    /* Only power-of-two execution sizes are representable in the instruction
4377     * control fields.
4378     */
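   /* Note (illustrative): util_logbase2() rounds down, so a computed
    * max_width of e.g. 12 is reported as 8.
    */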
4379    return 1 << util_logbase2(max_width);
4380 }
4381 
4382 /**
4383  * Get the maximum allowed SIMD width for instruction \p inst accounting for
4384  * various payload size restrictions that apply to sampler message
4385  * instructions.
4386  *
4387  * This is only intended to provide a maximum theoretical bound for the
4388  * execution size of the message based on the number of argument components
4389  * alone, which in most cases will determine whether the SIMD8 or SIMD16
4390  * variant of the message can be used, though some messages may have
4391  * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
4392  * the message length to determine the exact SIMD width and argument count,
4393  * which makes a number of sampler message combinations impossible to
4394  * represent).
4395  *
4396  * Note: On platforms with monolithic SIMD16 the possible SIMD widths double,
4397  * changing from (SIMD8, SIMD16) to (SIMD16, SIMD32).
4398  */
4399 static unsigned
4400 get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
4401                                const elk_fs_inst *inst)
4402 {
4403    /* If we have a min_lod parameter on anything other than a simple sample
4404     * message, it will push it over 5 arguments and we have to fall back to
4405     * SIMD8.
4406     */
4407    if (inst->opcode != ELK_SHADER_OPCODE_TEX &&
4408        inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
4409       return 8;
4410 
4411    /* Calculate the number of coordinate components that have to be present
4412     * assuming that additional arguments follow the texel coordinates in the
4413     * message payload.  On IVB+ there is no need for padding; on ILK-SNB we
4414     * need to pad to four or three components depending on the message; and
4415     * pre-ILK we need to pad to at most three components.
4416     */
4417    const unsigned req_coord_components =
4418       (devinfo->ver >= 7 ||
4419        !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
4420       (devinfo->ver >= 5 && inst->opcode != ELK_SHADER_OPCODE_TXF_LOGICAL &&
4421                             inst->opcode != ELK_SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
4422       3;
4423 
4424    /* Calculate the total number of argument components that need to be passed
4425     * to the sampler unit.
4426     */
4427    const unsigned num_payload_components =
4428       MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
4429            req_coord_components) +
4430       inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
4431       inst->components_read(TEX_LOGICAL_SRC_LOD) +
4432       inst->components_read(TEX_LOGICAL_SRC_LOD2) +
4433       inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
4434       (inst->opcode == ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
4435        inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
4436       inst->components_read(TEX_LOGICAL_SRC_MCS);
4437 
4438    const unsigned simd_limit = reg_unit(devinfo) *
4439       (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
4440 
4441    /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
4442     * maximum message size supported by the sampler, regardless of whether a
4443     * header is provided or not.
4444     */
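   /* Illustrative example, assuming the usual MAX_SAMPLER_MESSAGE_SIZE of 11:
    * a sample with four coordinate components, a shadow comparator and an LOD
    * needs 6 payload components, which exceeds 11 / 2 = 5, so simd_limit is
    * 8 * reg_unit(devinfo).
    */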
4445    return MIN2(inst->exec_size, simd_limit);
4446 }
4447 
4448 /**
4449  * Get the closest native SIMD width supported by the hardware for instruction
4450  * \p inst.  The instruction will be left untouched by
4451  * elk_fs_visitor::lower_simd_width() if the returned value is equal to the
4452  * original execution size.
4453  */
4454 static unsigned
4455 get_lowered_simd_width(const elk_fs_visitor *shader, const elk_fs_inst *inst)
4456 {
4457    const struct elk_compiler *compiler = shader->compiler;
4458    const struct intel_device_info *devinfo = compiler->devinfo;
4459 
4460    switch (inst->opcode) {
4461    case ELK_OPCODE_MOV:
4462    case ELK_OPCODE_SEL:
4463    case ELK_OPCODE_NOT:
4464    case ELK_OPCODE_AND:
4465    case ELK_OPCODE_OR:
4466    case ELK_OPCODE_XOR:
4467    case ELK_OPCODE_SHR:
4468    case ELK_OPCODE_SHL:
4469    case ELK_OPCODE_ASR:
4470    case ELK_OPCODE_CMPN:
4471    case ELK_OPCODE_CSEL:
4472    case ELK_OPCODE_F32TO16:
4473    case ELK_OPCODE_F16TO32:
4474    case ELK_OPCODE_BFREV:
4475    case ELK_OPCODE_BFE:
4476    case ELK_OPCODE_ADD:
4477    case ELK_OPCODE_MUL:
4478    case ELK_OPCODE_AVG:
4479    case ELK_OPCODE_FRC:
4480    case ELK_OPCODE_RNDU:
4481    case ELK_OPCODE_RNDD:
4482    case ELK_OPCODE_RNDE:
4483    case ELK_OPCODE_RNDZ:
4484    case ELK_OPCODE_LZD:
4485    case ELK_OPCODE_FBH:
4486    case ELK_OPCODE_FBL:
4487    case ELK_OPCODE_CBIT:
4488    case ELK_OPCODE_SAD2:
4489    case ELK_OPCODE_MAD:
4490    case ELK_OPCODE_LRP:
4491    case ELK_FS_OPCODE_PACK:
4492    case ELK_SHADER_OPCODE_SEL_EXEC:
4493    case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
4494    case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
4495       return get_fpu_lowered_simd_width(shader, inst);
4496 
4497    case ELK_OPCODE_CMP: {
4498       /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
4499        * when the destination is a GRF the dependency-clear bit on the flag
4500        * register is cleared early.
4501        *
4502        * Suggested workarounds are to disable coissuing CMP instructions
4503        * or to split CMP(16) instructions into two CMP(8) instructions.
4504        *
4505        * We choose to split into CMP(8) instructions since disabling
4506        * coissuing would affect CMP instructions not otherwise affected by
4507        * the errata.
4508        */
4509       const unsigned max_width = (devinfo->verx10 == 70 &&
4510                                   !inst->dst.is_null() ? 8 : ~0);
4511       return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst));
4512    }
4513    case ELK_OPCODE_BFI1:
4514    case ELK_OPCODE_BFI2:
4515       /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
4516        * should
4517        *  "Force BFI instructions to be executed always in SIMD8."
4518        */
4519       return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
4520                   get_fpu_lowered_simd_width(shader, inst));
4521 
4522    case ELK_OPCODE_IF:
4523       assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
4524       return inst->exec_size;
4525 
4526    case ELK_SHADER_OPCODE_RCP:
4527    case ELK_SHADER_OPCODE_RSQ:
4528    case ELK_SHADER_OPCODE_SQRT:
4529    case ELK_SHADER_OPCODE_EXP2:
4530    case ELK_SHADER_OPCODE_LOG2:
4531    case ELK_SHADER_OPCODE_SIN:
4532    case ELK_SHADER_OPCODE_COS: {
4533       /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
4534        * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
4535        */
4536       if (devinfo->ver == 6 || devinfo->verx10 == 40)
4537          return MIN2(8, inst->exec_size);
4538       if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4539          return MIN2(8, inst->exec_size);
4540       return MIN2(16, inst->exec_size);
4541    }
4542 
4543    case ELK_SHADER_OPCODE_POW: {
4544       /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
4545        * to SIMD8 with half-float
4546        * to SIMD8 with half-float.
4547       if (devinfo->ver < 7)
4548          return MIN2(8, inst->exec_size);
4549       if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4550          return MIN2(8, inst->exec_size);
4551       return MIN2(16, inst->exec_size);
4552    }
4553 
4554    case ELK_SHADER_OPCODE_USUB_SAT:
4555    case ELK_SHADER_OPCODE_ISUB_SAT:
4556       return get_fpu_lowered_simd_width(shader, inst);
4557 
4558    case ELK_SHADER_OPCODE_INT_QUOTIENT:
4559    case ELK_SHADER_OPCODE_INT_REMAINDER:
4560       /* Integer division is limited to SIMD8 on all generations. */
4561       return MIN2(8, inst->exec_size);
4562 
4563    case ELK_FS_OPCODE_LINTERP:
4564    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
4565    case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
4566    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
4567    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
4568    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
4569       return MIN2(16, inst->exec_size);
4570 
4571    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4572       /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
4573        * message used to implement varying pull constant loads, so expand it
4574        * to SIMD16.  An alternative with longer message payload length but
4575        * shorter return payload would be to use the SIMD8 sampler message that
4576        * takes (header, u, v, r) as parameters instead of (header, u).
4577        */
4578       return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
4579 
4580    case ELK_FS_OPCODE_DDX_COARSE:
4581    case ELK_FS_OPCODE_DDX_FINE:
4582    case ELK_FS_OPCODE_DDY_COARSE:
4583    case ELK_FS_OPCODE_DDY_FINE:
4584       /* The implementation of this virtual opcode may require emitting
4585        * compressed Align16 instructions, which are severely limited on some
4586        * generations.
4587        *
4588        * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
4589        * Region Restrictions):
4590        *
4591        *  "In Align16 access mode, SIMD16 is not allowed for DW operations
4592        *   and SIMD8 is not allowed for DF operations."
4593        *
4594        * In this context, "DW operations" means "operations acting on 32-bit
4595        * values", so it includes operations on floats.
4596        *
4597        * Gfx4 has a similar restriction.  From the i965 PRM, section 11.5.3
4598        * (Instruction Compression -> Rules and Restrictions):
4599        *
4600        *  "A compressed instruction must be in Align1 access mode. Align16
4601        *   mode instructions cannot be compressed."
4602        *
4603        * Similar text exists in the g45 PRM.
4604        *
4605        * Empirically, compressed align16 instructions using odd register
4606        * numbers don't appear to work on Sandybridge either.
4607        */
4608       return (devinfo->ver == 4 || devinfo->ver == 6 ||
4609               (devinfo->verx10 == 70) ?
4610               MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
4611 
4612    case ELK_SHADER_OPCODE_MULH:
4613       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4614        * is 8-wide on Gfx7+.
4615        */
4616       return (devinfo->ver >= 7 ? 8 :
4617               get_fpu_lowered_simd_width(shader, inst));
4618 
4619    case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
4620       /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
4621        * here.
4622        */
4623       assert(devinfo->ver != 6 ||
4624              inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4625              inst->exec_size == 8);
4626       /* Dual-source FB writes are unsupported in SIMD16 mode. */
4627       return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4628               8 : MIN2(16, inst->exec_size));
4629 
4630    case ELK_SHADER_OPCODE_TEX_LOGICAL:
4631    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
4632    case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
4633    case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
4634    case ELK_SHADER_OPCODE_LOD_LOGICAL:
4635    case ELK_SHADER_OPCODE_TG4_LOGICAL:
4636    case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
4637    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4638    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4639       return get_sampler_lowered_simd_width(devinfo, inst);
4640 
4641    /* On Gfx12 parameters are fixed to 16-bit values and therefore they all
4642     * always fit regardless of the execution size.
4643     */
4644    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
4645       return MIN2(16, inst->exec_size);
4646 
4647    case ELK_SHADER_OPCODE_TXD_LOGICAL:
4648       /* TXD is unsupported in SIMD16 mode prior to Xe2. SIMD32 is still
4649        * unsupported on Xe2.
4650        */
4651       return 8;
4652 
4653    case ELK_SHADER_OPCODE_TXL_LOGICAL:
4654    case ELK_FS_OPCODE_TXB_LOGICAL:
4655       /* Only one execution size is representable pre-ILK depending on whether
4656        * the shadow reference argument is present.
4657        */
4658       if (devinfo->ver == 4)
4659          return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
4660       else
4661          return get_sampler_lowered_simd_width(devinfo, inst);
4662 
4663    case ELK_SHADER_OPCODE_TXF_LOGICAL:
4664    case ELK_SHADER_OPCODE_TXS_LOGICAL:
4665       /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4666        * messages.  Use SIMD16 instead.
4667        */
4668       if (devinfo->ver == 4)
4669          return 16;
4670       else
4671          return get_sampler_lowered_simd_width(devinfo, inst);
4672 
4673    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4674    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4675    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4676       return 8;
4677 
4678    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4679    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4680    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4681    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
4682    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
4683    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
4684    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
4685       return MIN2(16, inst->exec_size);
4686 
4687    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
4688    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
4689    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
4690    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
4691       return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
4692 
4693    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
4694    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
4695    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
4696       assert(inst->exec_size <= 16);
4697       return inst->exec_size;
4698 
4699    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
4700       return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
4701 
4702    case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
4703    case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
4704       return MIN2(8, inst->exec_size);
4705 
4706    case ELK_SHADER_OPCODE_QUAD_SWIZZLE: {
4707       const unsigned swiz = inst->src[1].ud;
4708       return (is_uniform(inst->src[0]) ?
4709                  get_fpu_lowered_simd_width(shader, inst) :
4710               type_sz(inst->src[0].type) == 4 ? 8 :
4711               swiz == ELK_SWIZZLE_XYXY || swiz == ELK_SWIZZLE_ZWZW ? 4 :
4712               get_fpu_lowered_simd_width(shader, inst));
4713    }
4714    case ELK_SHADER_OPCODE_MOV_INDIRECT: {
4715       /* From IVB and HSW PRMs:
4716        *
4717        * "2. When the destination requires two registers and the sources are
4718        *  indirect, the sources must use 1x1 regioning mode."
4719        *
4720        * In case of DF instructions in HSW/IVB, the exec_size is limited by
4721        * the EU decompression logic not handling VxH indirect addressing
4722        * correctly.
4723        */
4724       const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
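      /* Illustrative example: on Gfx8+ (max_size == 64) a packed dword
       * destination allows 64 / 4 = 16 channels, while a DF destination
       * allows only 64 / 8 = 8.
       */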
4725       /* Prior to Broadwell, we only have 8 address subregisters. */
4726       return MIN3(devinfo->ver >= 8 ? 16 : 8,
4727                   max_size / (inst->dst.stride * type_sz(inst->dst.type)),
4728                   inst->exec_size);
4729    }
4730 
4731    case ELK_SHADER_OPCODE_LOAD_PAYLOAD: {
4732       const unsigned reg_count =
4733          DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
4734 
4735       if (reg_count > 2) {
4736          /* Only LOAD_PAYLOAD instructions with per-channel destination region
4737           * can be easily lowered (which excludes headers and heterogeneous
4738           * types).
4739           */
4740          assert(!inst->header_size);
4741          for (unsigned i = 0; i < inst->sources; i++)
4742             assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
4743                    inst->src[i].file == BAD_FILE);
4744 
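         /* Illustrative example (32-byte GRFs): a SIMD16 LOAD_PAYLOAD with
          * 64-bit components has reg_count = 4, so it is lowered to
          * 16 / DIV_ROUND_UP(4, 2) = SIMD8 and each per-channel component of
          * the lowered copies fits in 2 GRFs.
          */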
4745          return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4746       } else {
4747          return inst->exec_size;
4748       }
4749    }
4750    default:
4751       return inst->exec_size;
4752    }
4753 }
4754 
4755 /**
4756  * Return true if splitting out the group of channels of instruction \p inst
4757  * given by lbld.group() requires allocating a temporary for the i-th source
4758  * of the lowered instruction.
4759  */
4760 static inline bool
4761 needs_src_copy(const fs_builder &lbld, const elk_fs_inst *inst, unsigned i)
4762 {
4763    return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
4764             (inst->components_read(i) == 1 &&
4765              lbld.dispatch_width() <= inst->exec_size)) ||
4766           (inst->flags_written(lbld.shader->devinfo) &
4767            flag_mask(inst->src[i], type_sz(inst->src[i].type)));
4768 }
4769 
4770 /**
4771  * Extract the data that would be consumed by the channel group given by
4772  * lbld.group() from the i-th source region of instruction \p inst and return
4773  * it as result in packed form.
4774  */
4775 static elk_fs_reg
4776 emit_unzip(const fs_builder &lbld, elk_fs_inst *inst, unsigned i)
4777 {
4778    assert(lbld.group() >= inst->group);
4779 
4780    /* Specified channel group from the source region. */
4781    const elk_fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
4782 
4783    if (needs_src_copy(lbld, inst, i)) {
4784       /* Builder of the right width to perform the copy avoiding uninitialized
4785        * data if the lowered execution size is greater than the original
4786        * execution size of the instruction.
4787        */
4788       const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
4789                                               inst->exec_size), 0);
4790       const elk_fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
4791 
4792       for (unsigned k = 0; k < inst->components_read(i); ++k)
4793          cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
4794 
4795       return tmp;
4796 
4797    } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
4798       /* The source is invariant for all dispatch_width-wide groups of the
4799        * original region.
4800        */
4801       return inst->src[i];
4802 
4803    } else {
4804       /* We can just point the lowered instruction at the right channel group
4805        * from the original region.
4806        */
4807       return src;
4808    }
4809 }
4810 
4811 /**
4812  * Return true if splitting out the group of channels of instruction \p inst
4813  * given by lbld.group() requires allocating a temporary for the destination
4814  * of the lowered instruction and copying the data back to the original
4815  * destination region.
4816  */
4817 static inline bool
4818 needs_dst_copy(const fs_builder &lbld, const elk_fs_inst *inst)
4819 {
4820    if (inst->dst.is_null())
4821       return false;
4822 
4823    /* If the instruction writes more than one component we'll have to shuffle
4824     * the results of multiple lowered instructions in order to make sure that
4825     * they end up arranged correctly in the original destination region.
4826     */
4827    if (inst->size_written > inst->dst.component_size(inst->exec_size))
4828       return true;
4829 
4830    /* If the lowered execution size is larger than the original the result of
4831     * the instruction won't fit in the original destination, so we'll have to
4832     * allocate a temporary in any case.
4833     */
4834    if (lbld.dispatch_width() > inst->exec_size)
4835       return true;
4836 
4837    for (unsigned i = 0; i < inst->sources; i++) {
4838       /* If we already made a copy of the source for other reasons there won't
4839        * be any overlap with the destination.
4840        */
4841       if (needs_src_copy(lbld, inst, i))
4842          continue;
4843 
4844       /* In order to keep the logic simple we emit a copy whenever the
4845        * destination region doesn't exactly match an overlapping source.  Such
4846        * a mismatch may mean that the source and destination aren't aligned
4847        * group by group, which could cause one of the lowered instructions to
4848        * overwrite data read from the same source by other lowered instructions.
4849        */
4850       if (regions_overlap(inst->dst, inst->size_written,
4851                           inst->src[i], inst->size_read(i)) &&
4852           !inst->dst.equals(inst->src[i]))
4853         return true;
4854    }
4855 
4856    return false;
4857 }
4858 
4859 /**
4860  * Insert data from a packed temporary into the channel group given by
4861  * lbld.group() of the destination region of instruction \p inst and return
4862  * the temporary as result.  Any copy instructions that are required for
4863  * unzipping the previous value (in the case of partial writes) will be
4864  * inserted using \p lbld_before and any copy instructions required for
4865  * zipping up the destination of \p inst will be inserted using \p lbld_after.
4866  */
4867 static elk_fs_reg
4868 emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
4869          elk_fs_inst *inst)
4870 {
4871    assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
4872    assert(lbld_before.group() == lbld_after.group());
4873    assert(lbld_after.group() >= inst->group);
4874 
4875    const struct intel_device_info *devinfo = lbld_before.shader->devinfo;
4876 
4877    /* Specified channel group from the destination region. */
4878    const elk_fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
4879 
4880    if (!needs_dst_copy(lbld_after, inst)) {
4881       /* No need to allocate a temporary for the lowered instruction, just
4882        * take the right group of channels from the original region.
4883        */
4884       return dst;
4885    }
4886 
4887    /* Deal with the residency data part later */
4888    const unsigned residency_size = inst->has_sampler_residency() ?
4889       (reg_unit(devinfo) * REG_SIZE) : 0;
4890    const unsigned dst_size = (inst->size_written - residency_size) /
4891       inst->dst.component_size(inst->exec_size);
4892 
4893    const elk_fs_reg tmp = lbld_after.vgrf(inst->dst.type,
4894                                       dst_size + inst->has_sampler_residency());
4895 
4896    if (inst->predicate) {
4897       /* Handle predication by copying the original contents of the
4898        * destination into the temporary before emitting the lowered
4899        * instruction.
4900        */
4901       const fs_builder gbld_before =
4902          lbld_before.group(MIN2(lbld_before.dispatch_width(),
4903                                 inst->exec_size), 0);
4904       for (unsigned k = 0; k < dst_size; ++k) {
4905          gbld_before.MOV(offset(tmp, lbld_before, k),
4906                          offset(dst, inst->exec_size, k));
4907       }
4908    }
4909 
4910    const fs_builder gbld_after =
4911       lbld_after.group(MIN2(lbld_after.dispatch_width(),
4912                             inst->exec_size), 0);
4913    for (unsigned k = 0; k < dst_size; ++k) {
4914       /* Use a builder of the right width to perform the copy avoiding
4915        * uninitialized data if the lowered execution size is greater than the
4916        * original execution size of the instruction.
4917        */
4918       gbld_after.MOV(offset(dst, inst->exec_size, k),
4919                      offset(tmp, lbld_after, k));
4920    }
4921 
4922    if (inst->has_sampler_residency()) {
4923       /* Sampler messages with residency need special attention.  The first
4924        * lane of the last component holds the Pixel Null Mask (bits 0:15) and
4925        * some upper bits we need to discard (bits 16:31).  We have to build a
4926        * single 32-bit value for the SIMD32 message out of two SIMD16 16-bit
4927        * values.
4928        */
4929       const fs_builder rbld = gbld_after.exec_all().group(1, 0);
4930       elk_fs_reg local_res_reg = component(
4931          retype(offset(tmp, lbld_before, dst_size),
4932                 ELK_REGISTER_TYPE_UW), 0);
4933       elk_fs_reg final_res_reg =
4934          retype(byte_offset(inst->dst,
4935                             inst->size_written - residency_size +
4936                             gbld_after.group() / 8),
4937                 ELK_REGISTER_TYPE_UW);
4938       rbld.MOV(final_res_reg, local_res_reg);
4939    }
4940 
4941    return tmp;
4942 }
4943 
4944 bool
4945 elk_fs_visitor::lower_simd_width()
4946 {
4947    bool progress = false;
4948 
4949    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4950       const unsigned lower_width = get_lowered_simd_width(this, inst);
4951 
4952       if (lower_width != inst->exec_size) {
4953          /* Builder matching the original instruction.  We may also need to
4954           * emit an instruction of width larger than the original, so set the
4955           * execution size of the builder to the larger of the two for now to
4956           * make sure that both cases can be handled.
4957           */
4958          const unsigned max_width = MAX2(inst->exec_size, lower_width);
4959 
4960          const fs_builder bld =
4961             fs_builder(this, MAX2(max_width, dispatch_width)).at_end();
4962          const fs_builder ibld = bld.at(block, inst)
4963                                     .exec_all(inst->force_writemask_all)
4964                                     .group(max_width, inst->group / max_width);
4965 
4966          /* Split the copies in chunks of the execution width of either the
4967           * original or the lowered instruction, whichever is lower.
4968           */
4969          const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
4970          const unsigned residency_size = inst->has_sampler_residency() ?
4971             (reg_unit(devinfo) * REG_SIZE) : 0;
4972          const unsigned dst_size =
4973             (inst->size_written - residency_size) /
4974             inst->dst.component_size(inst->exec_size);
4975 
4976          assert(!inst->writes_accumulator && !inst->mlen);
4977 
4978          /* Inserting the zip, unzip, and duplicated instructions in all of
4979           * the right spots is somewhat tricky.  All of the unzip and any
4980           * instructions from the zip which unzip the destination prior to
4981           * writing need to happen before all of the per-group instructions
4982           * and the zip instructions need to happen after.  In order to sort
4983           * this all out, we insert the unzip instructions before \p inst,
4984           * insert the per-group instructions after \p inst (i.e. before
4985           * inst->next), and insert the zip instructions before the
4986           * instruction after \p inst.  Since we are inserting instructions
4987           * after \p inst, inst->next is a moving target and we need to save
4988           * it off here so that we insert the zip instructions in the right
4989           * place.
4990           *
4991           * Since we're inserting split instructions after after_inst, the
4992           * instructions will end up in the reverse order that we insert them.
4993           * However, certain render target writes require that the low group
4994           * instructions come before the high group.  From the Ivy Bridge PRM
4995           * Vol. 4, Pt. 1, Section 3.9.11:
4996           *
4997           *    "If multiple SIMD8 Dual Source messages are delivered by the
4998           *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
4999           *    issued before the SIMD8_DUALSRC_HI message with the same Slot
5000           *    Group Select setting."
5001           *
5002           * And, from Section 3.9.11.1 of the same PRM:
5003           *
5004           *    "When SIMD32 or SIMD16 PS threads send render target writes
5005           *    with multiple SIMD8 and SIMD16 messages, the following must
5006           *    hold:
5007           *
5008           *    All the slots (as described above) must have a corresponding
5009           *    render target write irrespective of the slot's validity. A slot
5010           *    is considered valid when at least one sample is enabled. For
5011           *    example, a SIMD16 PS thread must send two SIMD8 render target
5012           *    writes to cover all the slots.
5013           *
5014           *    PS thread must send SIMD render target write messages with
5015           *    increasing slot numbers. For example, SIMD16 thread has
5016           *    Slot[15:0] and if two SIMD8 render target writes are used, the
5017           *    first SIMD8 render target write must send Slot[7:0] and the
5018           *    next one must send Slot[15:8]."
5019           *
5020           * In order to make low group instructions come before high group
5021           * instructions (this is required for some render target writes), we
5022           * split from the highest group to lowest.
5023           */
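         /* Illustrative example: lowering a SIMD16 instruction to SIMD8 gives
          * n = 2 copies; the high (i == 1) group is emitted first and the low
          * (i == 0) group last, so after insertion the low-group half precedes
          * the high-group half in program order, as required above.
          */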
5024          exec_node *const after_inst = inst->next;
5025          for (int i = n - 1; i >= 0; i--) {
5026             /* Emit a copy of the original instruction with the lowered width.
5027              * If the EOT flag was set throw it away except for the last
5028              * instruction to avoid killing the thread prematurely.
5029              */
5030             elk_fs_inst split_inst = *inst;
5031             split_inst.exec_size = lower_width;
5032             split_inst.eot = inst->eot && i == int(n - 1);
5033 
5034             /* Select the correct channel enables for the i-th group, then
5035              * transform the sources and destination and emit the lowered
5036              * instruction.
5037              */
5038             const fs_builder lbld = ibld.group(lower_width, i);
5039 
5040             for (unsigned j = 0; j < inst->sources; j++)
5041                split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
5042 
5043             split_inst.dst = emit_zip(lbld.at(block, inst),
5044                                       lbld.at(block, after_inst), inst);
5045             split_inst.size_written =
5046                split_inst.dst.component_size(lower_width) * dst_size +
5047                residency_size;
5048 
5049             lbld.at(block, inst->next).emit(split_inst);
5050          }
5051 
5052          inst->remove(block);
5053          progress = true;
5054       }
5055    }
5056 
5057    if (progress)
5058       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5059 
5060    return progress;
5061 }
5062 
5063 /**
5064  * Transform barycentric vectors into the interleaved form expected by the PLN
5065  * instruction and returned by the Gfx7+ PI shared function.
5066  *
5067  * For channels 0-15 in SIMD16 mode they are expected to be laid out as
5068  * follows in the register file:
5069  *
5070  *    rN+0: X[0-7]
5071  *    rN+1: Y[0-7]
5072  *    rN+2: X[8-15]
5073  *    rN+3: Y[8-15]
5074  *
5075  * There is no need to handle SIMD32 here -- this is expected to be run after
5076  * SIMD lowering, since SIMD lowering relies on vectors having the standard
5077  * component layout.
5078  */
5079 bool
5080 elk_fs_visitor::lower_barycentrics()
5081 {
5082    const bool has_interleaved_layout = devinfo->has_pln ||
5083                                        devinfo->ver >= 7;
5084    bool progress = false;
5085 
5086    if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
5087       return false;
5088 
5089    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5090       if (inst->exec_size < 16)
5091          continue;
5092 
5093       const fs_builder ibld(this, block, inst);
5094       const fs_builder ubld = ibld.exec_all().group(8, 0);
5095 
5096       switch (inst->opcode) {
5097       case ELK_FS_OPCODE_LINTERP: {
5098          assert(inst->exec_size == 16);
5099          const elk_fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
5100          elk_fs_reg srcs[4];
5101 
5102          for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
5103             srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
5104                                    8 * (i / 2));
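         /* srcs[] now holds X[0-7], Y[0-7], X[8-15] and Y[8-15], i.e. the
          * interleaved layout described in the comment above this function
          * (illustrative note).
          */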
5105 
5106          ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
5107 
5108          inst->src[0] = tmp;
5109          progress = true;
5110          break;
5111       }
5112       case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
5113       case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
5114       case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
5115          assert(inst->exec_size == 16);
5116          const elk_fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
5117 
5118          for (unsigned i = 0; i < 2; i++) {
5119             for (unsigned g = 0; g < inst->exec_size / 8; g++) {
5120                elk_fs_inst *mov = ibld.at(block, inst->next).group(8, g)
5121                                   .MOV(horiz_offset(offset(inst->dst, ibld, i),
5122                                                     8 * g),
5123                                        offset(tmp, ubld, 2 * g + i));
5124                mov->predicate = inst->predicate;
5125                mov->predicate_inverse = inst->predicate_inverse;
5126                mov->flag_subreg = inst->flag_subreg;
5127             }
5128          }
5129 
5130          inst->dst = tmp;
5131          progress = true;
5132          break;
5133       }
5134       default:
5135          break;
5136       }
5137    }
5138 
5139    if (progress)
5140       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5141 
5142    return progress;
5143 }
5144 
5145 bool
5146 elk_fs_visitor::lower_find_live_channel()
5147 {
5148    bool progress = false;
5149 
5150    if (devinfo->ver < 8)
5151       return false;
5152 
5153    bool packed_dispatch =
5154       elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data);
5155    bool vmask =
5156       stage == MESA_SHADER_FRAGMENT &&
5157       elk_wm_prog_data(stage_prog_data)->uses_vmask;
5158 
5159    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5160       if (inst->opcode != ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL &&
5161           inst->opcode != ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
5162          continue;
5163 
5164       bool first = inst->opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL;
5165 
5166       /* Getting the first active channel index is easy on Gfx8: Just find
5167        * the first bit set in the execution mask.  The register exists on
5168        * HSW already but it reads back as all ones when the current
5169        * instruction has execution masking disabled, so it's kind of
5170        * useless there.
5171        */
5172       elk_fs_reg exec_mask(retype(elk_mask_reg(0), ELK_REGISTER_TYPE_UD));
5173 
5174       const fs_builder ibld(this, block, inst);
5175       if (!inst->is_partial_write())
5176          ibld.emit_undef_for_dst(inst);
5177 
5178       const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);
5179 
5180       /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
5181        * so combine the execution and dispatch masks to obtain the true mask.
5182        *
5183        * If we're looking for the first live channel, and we have packed
5184        * dispatch, we can skip this step, as we know all dispatched channels
5185        * will appear at the front of the mask.
5186        */
5187       if (!(first && packed_dispatch)) {
5188          elk_fs_reg mask = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5189          ubld.UNDEF(mask);
5190          ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, mask, elk_imm_ud(vmask ? 3 : 2));
5191 
5192          /* Quarter control has the effect of magically shifting the value of
5193           * ce0 so you'll get the first/last active channel relative to the
5194           * specified quarter control as result.
5195        * specified quarter control as the result.
5196          if (inst->group > 0)
5197             ubld.SHR(mask, mask, elk_imm_ud(ALIGN(inst->group, 8)));
5198 
5199          ubld.AND(mask, exec_mask, mask);
5200          exec_mask = mask;
5201       }
5202 
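      /* Illustrative example: for an execution mask of 0b0110, FBL yields 1
       * (the first live channel) and 31 - LZD(0b0110) = 31 - 29 = 2 yields
       * the last live channel.
       */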
5203       if (first) {
5204          ubld.FBL(inst->dst, exec_mask);
5205       } else {
5206          elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 1);
5207          ubld.UNDEF(tmp);
5208          ubld.LZD(tmp, exec_mask);
5209          ubld.ADD(inst->dst, negate(tmp), elk_imm_uw(31));
5210       }
5211 
5212       inst->remove(block);
5213       progress = true;
5214    }
5215 
5216    if (progress)
5217       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5218 
5219    return progress;
5220 }
5221 
5222 void
5223 elk_fs_visitor::dump_instructions_to_file(FILE *file) const
5224 {
5225    if (cfg) {
5226       const register_pressure &rp = regpressure_analysis.require();
5227       unsigned ip = 0, max_pressure = 0;
5228       unsigned cf_count = 0;
5229       foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5230          if (inst->is_control_flow_end())
5231             cf_count -= 1;
5232 
5233          max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5234          fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
5235          for (unsigned i = 0; i < cf_count; i++)
5236             fprintf(file, "  ");
5237          dump_instruction(inst, file);
5238          ip++;
5239 
5240          if (inst->is_control_flow_begin())
5241             cf_count += 1;
5242       }
5243       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
5244    } else {
5245       int ip = 0;
5246       foreach_in_list(elk_backend_instruction, inst, &instructions) {
5247          fprintf(file, "%4d: ", ip++);
5248          dump_instruction(inst, file);
5249       }
5250    }
5251 }
5252 
5253 void
5254 elk_fs_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
5255 {
5256    const elk_fs_inst *inst = (const elk_fs_inst *)be_inst;
5257 
5258    if (inst->predicate) {
5259       fprintf(file, "(%cf%d.%d) ",
5260               inst->predicate_inverse ? '-' : '+',
5261               inst->flag_subreg / 2,
5262               inst->flag_subreg % 2);
5263    }
5264 
5265    fprintf(file, "%s", elk_instruction_name(&compiler->isa, inst->opcode));
5266    if (inst->saturate)
5267       fprintf(file, ".sat");
5268    if (inst->conditional_mod) {
5269       fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
5270       if (!inst->predicate &&
5271           (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
5272                                 inst->opcode != ELK_OPCODE_CSEL &&
5273                                 inst->opcode != ELK_OPCODE_IF &&
5274                                 inst->opcode != ELK_OPCODE_WHILE))) {
5275          fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
5276                  inst->flag_subreg % 2);
5277       }
5278    }
5279    fprintf(file, "(%d) ", inst->exec_size);
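   /* Illustrative example of the prefix printed so far: a predicated,
    * saturating SIMD8 ADD comes out as "(+f0.0) add.sat(8) ".
    */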
5280 
5281    if (inst->mlen) {
5282       fprintf(file, "(mlen: %d) ", inst->mlen);
5283    }
5284 
5285    if (inst->eot) {
5286       fprintf(file, "(EOT) ");
5287    }
5288 
5289    switch (inst->dst.file) {
5290    case VGRF:
5291       fprintf(file, "vgrf%d", inst->dst.nr);
5292       break;
5293    case FIXED_GRF:
5294       fprintf(file, "g%d", inst->dst.nr);
5295       break;
5296    case MRF:
5297       fprintf(file, "m%d", inst->dst.nr);
5298       break;
5299    case BAD_FILE:
5300       fprintf(file, "(null)");
5301       break;
5302    case UNIFORM:
5303       fprintf(file, "***u%d***", inst->dst.nr);
5304       break;
5305    case ATTR:
5306       fprintf(file, "***attr%d***", inst->dst.nr);
5307       break;
5308    case ARF:
5309       switch (inst->dst.nr) {
5310       case ELK_ARF_NULL:
5311          fprintf(file, "null");
5312          break;
5313       case ELK_ARF_ADDRESS:
5314          fprintf(file, "a0.%d", inst->dst.subnr);
5315          break;
5316       case ELK_ARF_ACCUMULATOR:
5317          fprintf(file, "acc%d", inst->dst.subnr);
5318          break;
5319       case ELK_ARF_FLAG:
5320          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5321          break;
5322       default:
5323          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5324          break;
5325       }
5326       break;
5327    case IMM:
5328       unreachable("not reached");
5329    }
5330 
5331    if (inst->dst.offset ||
5332        (inst->dst.file == VGRF &&
5333         alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
5334       const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
5335       fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
5336               inst->dst.offset % reg_size);
5337    }
5338 
5339    if (inst->dst.stride != 1)
5340       fprintf(file, "<%u>", inst->dst.stride);
5341    fprintf(file, ":%s, ", elk_reg_type_to_letters(inst->dst.type));
5342 
5343    for (int i = 0; i < inst->sources; i++) {
5344       if (inst->src[i].negate)
5345          fprintf(file, "-");
5346       if (inst->src[i].abs)
5347          fprintf(file, "|");
5348       switch (inst->src[i].file) {
5349       case VGRF:
5350          fprintf(file, "vgrf%d", inst->src[i].nr);
5351          break;
5352       case FIXED_GRF:
5353          fprintf(file, "g%d", inst->src[i].nr);
5354          break;
5355       case MRF:
5356          fprintf(file, "***m%d***", inst->src[i].nr);
5357          break;
5358       case ATTR:
5359          fprintf(file, "attr%d", inst->src[i].nr);
5360          break;
5361       case UNIFORM:
5362          fprintf(file, "u%d", inst->src[i].nr);
5363          break;
5364       case BAD_FILE:
5365          fprintf(file, "(null)");
5366          break;
5367       case IMM:
5368          switch (inst->src[i].type) {
5369          case ELK_REGISTER_TYPE_HF:
5370             fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
5371             break;
5372          case ELK_REGISTER_TYPE_F:
5373             fprintf(file, "%-gf", inst->src[i].f);
5374             break;
5375          case ELK_REGISTER_TYPE_DF:
5376             fprintf(file, "%fdf", inst->src[i].df);
5377             break;
5378          case ELK_REGISTER_TYPE_W:
5379          case ELK_REGISTER_TYPE_D:
5380             fprintf(file, "%dd", inst->src[i].d);
5381             break;
5382          case ELK_REGISTER_TYPE_UW:
5383          case ELK_REGISTER_TYPE_UD:
5384             fprintf(file, "%uu", inst->src[i].ud);
5385             break;
5386          case ELK_REGISTER_TYPE_Q:
5387             fprintf(file, "%" PRId64 "q", inst->src[i].d64);
5388             break;
5389          case ELK_REGISTER_TYPE_UQ:
5390             fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
5391             break;
5392          case ELK_REGISTER_TYPE_VF:
5393             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
5394                     elk_vf_to_float((inst->src[i].ud >>  0) & 0xff),
5395                     elk_vf_to_float((inst->src[i].ud >>  8) & 0xff),
5396                     elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
5397                     elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
5398             break;
5399          case ELK_REGISTER_TYPE_V:
5400          case ELK_REGISTER_TYPE_UV:
5401             fprintf(file, "%08x%s", inst->src[i].ud,
5402                     inst->src[i].type == ELK_REGISTER_TYPE_V ? "V" : "UV");
5403             break;
5404          default:
5405             fprintf(file, "???");
5406             break;
5407          }
5408          break;
5409       case ARF:
5410          switch (inst->src[i].nr) {
5411          case ELK_ARF_NULL:
5412             fprintf(file, "null");
5413             break;
5414          case ELK_ARF_ADDRESS:
5415             fprintf(file, "a0.%d", inst->src[i].subnr);
5416             break;
5417          case ELK_ARF_ACCUMULATOR:
5418             fprintf(file, "acc%d", inst->src[i].subnr);
5419             break;
5420          case ELK_ARF_FLAG:
5421             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5422             break;
5423          default:
5424             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5425             break;
5426          }
5427          break;
5428       }
5429 
5430       if (inst->src[i].offset ||
5431           (inst->src[i].file == VGRF &&
5432            alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
5433          const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
5434          fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
5435                  inst->src[i].offset % reg_size);
5436       }
5437 
5438       if (inst->src[i].abs)
5439          fprintf(file, "|");
5440 
5441       if (inst->src[i].file != IMM) {
5442          unsigned stride;
5443          if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
5444             unsigned hstride = inst->src[i].hstride;
5445             stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
5446          } else {
5447             stride = inst->src[i].stride;
5448          }
5449          if (stride != 1)
5450             fprintf(file, "<%u>", stride);
5451 
5452          fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
5453       }
5454 
5455       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
5456          fprintf(file, ", ");
5457    }
5458 
5459    fprintf(file, " ");
5460 
5461    if (inst->force_writemask_all)
5462       fprintf(file, "NoMask ");
5463 
5464    if (inst->exec_size != dispatch_width)
5465       fprintf(file, "group%d ", inst->group);
5466 
5467    fprintf(file, "\n");
5468 }
5469 
5470 elk::register_pressure::register_pressure(const elk_fs_visitor *v)
5471 {
5472    const fs_live_variables &live = v->live_analysis.require();
5473    const unsigned num_instructions = v->cfg->num_blocks ?
5474       v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
5475 
5476    regs_live_at_ip = new unsigned[num_instructions]();
5477 
5478    for (unsigned reg = 0; reg < v->alloc.count; reg++) {
5479       for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
5480          regs_live_at_ip[ip] += v->alloc.sizes[reg];
5481    }
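   /* Illustrative example: a 2-GRF VGRF that is live from ip 3 through ip 7
    * adds 2 to regs_live_at_ip[3] through regs_live_at_ip[7].
    */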
5482 
5483    const unsigned payload_count = v->first_non_payload_grf;
5484 
5485    int *payload_last_use_ip = new int[payload_count];
5486    v->calculate_payload_ranges(payload_count, payload_last_use_ip);
5487 
5488    for (unsigned reg = 0; reg < payload_count; reg++) {
5489       for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
5490          ++regs_live_at_ip[ip];
5491    }
5492 
5493    delete[] payload_last_use_ip;
5494 }
5495 
5496 elk::register_pressure::~register_pressure()
5497 {
5498    delete[] regs_live_at_ip;
5499 }
5500 
5501 void
5502 elk_fs_visitor::invalidate_analysis(elk::analysis_dependency_class c)
5503 {
5504    elk_backend_shader::invalidate_analysis(c);
5505    live_analysis.invalidate(c);
5506    regpressure_analysis.invalidate(c);
5507 }
5508 
5509 void
5510 elk_fs_visitor::debug_optimizer(const nir_shader *nir,
5511                             const char *pass_name,
5512                             int iteration, int pass_num) const
5513 {
5514    if (!elk_should_print_shader(nir, DEBUG_OPTIMIZER))
5515       return;
5516 
5517    char *filename;
5518    int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
5519                       debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
5520                       _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
5521                       iteration, pass_num, pass_name);
5522    if (ret == -1)
5523       return;
5524    dump_instructions(filename);
5525    free(filename);
5526 }
5527 
5528 void
5529 elk_fs_visitor::optimize()
5530 {
5531    debug_optimizer(nir, "start", 0, 0);
5532 
5533    /* Start by validating the shader we currently have. */
5534    validate();
5535 
5536    bool progress = false;
5537    int iteration = 0;
5538    int pass_num = 0;
5539 
5540 #define OPT(pass, args...) ({                                           \
5541       pass_num++;                                                       \
5542       bool this_progress = pass(args);                                  \
5543                                                                         \
5544       if (this_progress)                                                \
5545          debug_optimizer(nir, #pass, iteration, pass_num);              \
5546                                                                         \
5547       validate();                                                       \
5548                                                                         \
5549       progress = progress || this_progress;                             \
5550       this_progress;                                                    \
5551    })
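   /* Roughly (illustrative expansion, not generated output), a call such as
    * OPT(opt_cse) becomes:
    *
    *    pass_num++;
    *    bool this_progress = opt_cse();
    *    if (this_progress)
    *       debug_optimizer(nir, "opt_cse", iteration, pass_num);
    *    validate();
    *    progress = progress || this_progress;
    *
    * and evaluates to this_progress, so it can also be used as a condition.
    */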
5552 
5553    assign_constant_locations();
5554    OPT(lower_constant_loads);
5555 
5556    validate();
5557 
5558    OPT(split_virtual_grfs);
5559 
5560    /* Before anything else, eliminate dead code.  The results of some NIR
5561     * instructions may effectively be calculated twice: once when the
5562     * instruction is encountered, and again when the user of that result is
5563     * encountered.  Wipe those away before algebraic optimizations and
5564     * especially copy propagation can mix things up.
5565     */
5566    OPT(dead_code_eliminate);
5567 
5568    OPT(remove_extra_rounding_modes);
5569 
5570    do {
5571       progress = false;
5572       pass_num = 0;
5573       iteration++;
5574 
5575       OPT(remove_duplicate_mrf_writes);
5576 
5577       OPT(opt_algebraic);
5578       OPT(opt_cse);
5579       OPT(opt_copy_propagation);
5580       OPT(elk_opt_predicated_break, this);
5581       OPT(opt_cmod_propagation);
5582       OPT(dead_code_eliminate);
5583       OPT(opt_peephole_sel);
5584       OPT(elk_dead_control_flow_eliminate, this);
5585       OPT(opt_saturate_propagation);
5586       OPT(register_coalesce);
5587       OPT(compute_to_mrf);
5588       OPT(eliminate_find_live_channel);
5589 
5590       OPT(compact_virtual_grfs);
5591    } while (progress);
5592 
5593    progress = false;
5594    pass_num = 0;
5595 
5596    if (OPT(lower_pack)) {
5597       OPT(register_coalesce);
5598       OPT(dead_code_eliminate);
5599    }
5600 
5601    OPT(lower_simd_width);
5602    OPT(lower_barycentrics);
5603    OPT(lower_logical_sends);
5604 
5605    /* After logical SEND lowering. */
5606 
5607    if (OPT(opt_copy_propagation))
5608       OPT(opt_algebraic);
5609 
5610    /* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
5611     * Do this before splitting SENDs.
5612     */
5613    if (devinfo->ver >= 7) {
5614       if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
5615          OPT(opt_algebraic);
5616    }
5617 
5618    if (progress) {
5619       if (OPT(opt_copy_propagation))
5620          OPT(opt_algebraic);
5621 
5622       /* Run after logical send lowering to give it a chance to CSE the
5623        * LOAD_PAYLOAD instructions created to construct the payloads of
5624        * e.g. texturing messages in cases where it wasn't possible to CSE the
5625        * whole logical instruction.
5626        */
5627       OPT(opt_cse);
5628       OPT(register_coalesce);
5629       OPT(compute_to_mrf);
5630       OPT(dead_code_eliminate);
5631       OPT(remove_duplicate_mrf_writes);
5632       OPT(opt_peephole_sel);
5633    }
5634 
5635    OPT(opt_redundant_halt);
5636 
5637    if (OPT(lower_load_payload)) {
5638       OPT(split_virtual_grfs);
5639 
5640       /* Lower 64-bit MOVs generated by payload lowering. */
5641       if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
5642          OPT(opt_algebraic);
5643 
5644       OPT(register_coalesce);
5645       OPT(lower_simd_width);
5646       OPT(compute_to_mrf);
5647       OPT(dead_code_eliminate);
5648    }
5649 
5650    OPT(opt_combine_constants);
5651    if (OPT(lower_integer_multiplication)) {
5652       /* If lower_integer_multiplication made progress, it may have produced
5653        * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
5654        * one more time to clean those up if they exist.
5655        */
5656       OPT(lower_integer_multiplication);
5657    }
5658    OPT(lower_sub_sat);
5659 
5660    if (devinfo->ver <= 5 && OPT(lower_minmax)) {
5661       OPT(opt_cmod_propagation);
5662       OPT(opt_cse);
5663       if (OPT(opt_copy_propagation))
5664          OPT(opt_algebraic);
5665       OPT(dead_code_eliminate);
5666    }
5667 
5668    progress = false;
5669    OPT(lower_regioning);
5670    if (progress) {
5671       if (OPT(opt_copy_propagation))
5672          OPT(opt_algebraic);
5673       OPT(dead_code_eliminate);
5674       OPT(lower_simd_width);
5675    }
5676 
5677    OPT(lower_uniform_pull_constant_loads);
5678 
5679    OPT(lower_find_live_channel);
5680 
5681    validate();
5682 }
5683 
5684 /**
5685  * Three-source instructions must have a GRF/MRF destination register.
5686  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
5687  */
5688 void
5689 elk_fs_visitor::fixup_3src_null_dest()
5690 {
5691    bool progress = false;
5692 
5693    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
5694       if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
5695          inst->dst = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5696                             inst->dst.type);
5697          progress = true;
5698       }
5699    }
5700 
5701    if (progress)
5702       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
5703                           DEPENDENCY_VARIABLES);
5704 }
5705 
5706 uint32_t
5707 elk_fs_visitor::compute_max_register_pressure()
5708 {
5709    const register_pressure &rp = regpressure_analysis.require();
5710    uint32_t ip = 0, max_pressure = 0;
5711    foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5712       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5713       ip++;
5714    }
5715    return max_pressure;
5716 }
5717 
5718 static elk_fs_inst **
5719 save_instruction_order(const struct elk_cfg_t *cfg)
5720 {
5721    /* Before we schedule anything, stash off the instruction order as an array
5722     * of elk_fs_inst *.  This way, we can reset it between scheduling passes to
5723     * prevent dependencies between the different scheduling modes.
5724     */
5725    int num_insts = cfg->last_block()->end_ip + 1;
5726    elk_fs_inst **inst_arr = new elk_fs_inst * [num_insts];
5727 
5728    int ip = 0;
5729    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
5730       assert(ip >= block->start_ip && ip <= block->end_ip);
5731       inst_arr[ip++] = inst;
5732    }
5733    assert(ip == num_insts);
5734 
5735    return inst_arr;
5736 }
5737 
5738 static void
5739 restore_instruction_order(struct elk_cfg_t *cfg, elk_fs_inst **inst_arr)
5740 {
5741    ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
5742 
5743    int ip = 0;
5744    foreach_block (block, cfg) {
5745       block->instructions.make_empty();
5746 
5747       assert(ip == block->start_ip);
5748       for (; ip <= block->end_ip; ip++)
5749          block->instructions.push_tail(inst_arr[ip]);
5750    }
5751    assert(ip == num_insts);
5752 }
5753 
5754 void
5755 elk_fs_visitor::allocate_registers(bool allow_spilling)
5756 {
5757    bool allocated;
5758 
5759    static const enum instruction_scheduler_mode pre_modes[] = {
5760       SCHEDULE_PRE,
5761       SCHEDULE_PRE_NON_LIFO,
5762       SCHEDULE_NONE,
5763       SCHEDULE_PRE_LIFO,
5764    };
5765 
5766    static const char *scheduler_mode_name[] = {
5767       [SCHEDULE_PRE] = "top-down",
5768       [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
5769       [SCHEDULE_PRE_LIFO] = "lifo",
5770       [SCHEDULE_POST] = "post",
5771       [SCHEDULE_NONE] = "none",
5772    };
5773 
5774    uint32_t best_register_pressure = UINT32_MAX;
5775    enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
5776 
5777    compact_virtual_grfs();
5778 
5779    if (needs_register_pressure)
5780       shader_stats.max_register_pressure = compute_max_register_pressure();
5781 
5782    debug_optimizer(nir, "pre_register_allocate", 90, 90);
5783 
5784    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
5785 
5786    /* Before we schedule anything, stash off the instruction order as an array
5787     * of elk_fs_inst *.  This way, we can reset it between scheduling passes to
5788     * prevent dependencies between the different scheduling modes.
5789     */
5790    elk_fs_inst **orig_order = save_instruction_order(cfg);
5791    elk_fs_inst **best_pressure_order = NULL;
5792 
5793    void *scheduler_ctx = ralloc_context(NULL);
5794    elk_fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
5795 
5796    /* Try each scheduling heuristic to see if it can successfully register
5797     * allocate without spilling.  They should be ordered by decreasing
5798     * performance but increasing likelihood of allocating.
5799     */
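   /* Concretely (an illustrative summary of the loop below): each mode that
    * fails to allocate has its maximum register pressure measured, and the
    * lowest-pressure schedule is kept; if no mode allocates cleanly, that
    * order is restored afterwards and allocation is retried with spilling
    * allowed.
    */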
5800    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5801       enum instruction_scheduler_mode sched_mode = pre_modes[i];
5802 
5803       schedule_instructions_pre_ra(sched, sched_mode);
5804       this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
5805 
5806       debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
5807 
5808       if (0) {
5809          assign_regs_trivial();
5810          allocated = true;
5811          break;
5812       }
5813 
5814       /* We should only spill registers on the last scheduling. */
5815       assert(!spilled_any_registers);
5816 
5817       allocated = assign_regs(false, spill_all);
5818       if (allocated)
5819          break;
5820 
5821       /* Save the maximum register pressure */
5822       uint32_t this_pressure = compute_max_register_pressure();
5823 
5824       if (0) {
5825          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
5826                  scheduler_mode_name[sched_mode], this_pressure);
5827       }
5828 
5829       if (this_pressure < best_register_pressure) {
5830          best_register_pressure = this_pressure;
5831          best_sched = sched_mode;
5832          delete[] best_pressure_order;
5833          best_pressure_order = save_instruction_order(cfg);
5834       }
5835 
5836       /* Reset back to the original order before trying the next mode */
5837       restore_instruction_order(cfg, orig_order);
5838       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
5839    }
5840 
5841    ralloc_free(scheduler_ctx);
5842 
5843    if (!allocated) {
5844       if (0) {
5845          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
5846                  scheduler_mode_name[best_sched]);
5847       }
5848       restore_instruction_order(cfg, best_pressure_order);
5849       shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
5850 
5851       allocated = assign_regs(allow_spilling, spill_all);
5852    }
5853 
5854    delete[] orig_order;
5855    delete[] best_pressure_order;
5856 
5857    if (!allocated) {
5858       fail("Failure to register allocate.  Reduce number of "
5859            "live scalar values to avoid this.");
5860    } else if (spilled_any_registers) {
5861       elk_shader_perf_log(compiler, log_data,
5862                           "%s shader triggered register spilling.  "
5863                           "Try reducing the number of live scalar "
5864                           "values to improve performance.\n",
5865                           _mesa_shader_stage_to_string(stage));
5866    }
5867 
5868    /* This must come after all optimization and register allocation, since
5869     * it inserts dead code that happens to have side effects, and it does
5870     * so based on the actual physical registers in use.
5871     */
5872    insert_gfx4_send_dependency_workarounds();
5873 
5874    if (failed)
5875       return;
5876 
5877    opt_bank_conflicts();
5878 
5879    schedule_instructions_post_ra();
5880 
5881    if (last_scratch > 0) {
5882       ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
5883 
5884       /* Take the max of any previously compiled variant of the shader. In the
5885        * case of bindless shaders with return parts, this will also take the
5886        * max of all parts.
5887        */
5888       prog_data->total_scratch = MAX2(elk_get_scratch_size(last_scratch),
5889                                       prog_data->total_scratch);
5890 
5891       if (gl_shader_stage_is_compute(stage)) {
5892          if (devinfo->platform == INTEL_PLATFORM_HSW) {
5893             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5894              * field documentation, Haswell supports a minimum of 2kB of
5895              * scratch space for compute shaders, unlike every other stage
5896              * and platform.
5897              */
5898             prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
5899          } else if (devinfo->ver <= 7) {
5900             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5901              * field documentation, platforms prior to Haswell measure scratch
5902              * size linearly with a range of [1kB, 12kB] and 1kB granularity.
5903              */
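            /* Illustrative numbers for this path: last_scratch = 1536 bytes
             * rounds up to ALIGN(1536, 1024) = 2048 bytes per thread, and
             * the limit asserted below becomes 12kB instead of 2MB.
             */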
5904             prog_data->total_scratch = ALIGN(last_scratch, 1024);
5905             max_scratch_size = 12 * 1024;
5906          }
5907       }
5908 
5909       /* We currently only support up to 2MB of scratch space.  If we
5910        * need to support more eventually, the documentation suggests
5911        * that we could allocate a larger buffer, and partition it out
5912        * ourselves.  We'd just have to undo the hardware's address
5913        * calculation by subtracting (FFTID * Per Thread Scratch Space)
5914        * and then adding FFTID * (Larger Per Thread Scratch Space).
5915        *
5916        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
5917        * Thread Group Tracking > Local Memory/Scratch Space.
5918        */
5919       assert(prog_data->total_scratch < max_scratch_size);
5920    }
5921 }
5922 
5923 bool
5924 elk_fs_visitor::run_vs()
5925 {
5926    assert(stage == MESA_SHADER_VERTEX);
5927 
5928    payload_ = new elk_vs_thread_payload(*this);
5929 
5930    nir_to_elk(this);
5931 
5932    if (failed)
5933       return false;
5934 
5935    emit_urb_writes();
5936 
5937    calculate_cfg();
5938 
5939    optimize();
5940 
5941    assign_curb_setup();
5942    assign_vs_urb_setup();
5943 
5944    fixup_3src_null_dest();
5945 
5946    allocate_registers(true /* allow_spilling */);
5947 
5948    return !failed;
5949 }
5950 
5951 void
5952 elk_fs_visitor::set_tcs_invocation_id()
5953 {
5954    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(prog_data);
5955    struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
5956    const fs_builder bld = fs_builder(this).at_end();
5957 
5958    const unsigned instance_id_mask = INTEL_MASK(23, 17);
5959    const unsigned instance_id_shift = 17;
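   /* In other words (illustrative): bits 23:17 of g0.2 carry the instance or
    * thread number.  Shifting the masked value right by 17 recovers it
    * directly, while shifting by 17 - 3, as in the SINGLE_PATCH path below,
    * yields instance * 8 in one step.
    */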
5960 
5961    elk_fs_reg t = bld.vgrf(ELK_REGISTER_TYPE_UD);
5962    bld.AND(t, elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD)),
5963            elk_imm_ud(instance_id_mask));
5964 
5965    invocation_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
5966 
5967    if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
5968       /* gl_InvocationID is just the thread number */
5969       bld.SHR(invocation_id, t, elk_imm_ud(instance_id_shift));
5970       return;
5971    }
5972 
5973    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
5974 
5975    elk_fs_reg channels_uw = bld.vgrf(ELK_REGISTER_TYPE_UW);
5976    elk_fs_reg channels_ud = bld.vgrf(ELK_REGISTER_TYPE_UD);
5977    bld.MOV(channels_uw, elk_fs_reg(elk_imm_uv(0x76543210)));
5978    bld.MOV(channels_ud, channels_uw);
5979 
5980    if (tcs_prog_data->instances == 1) {
5981       invocation_id = channels_ud;
5982    } else {
5983       elk_fs_reg instance_times_8 = bld.vgrf(ELK_REGISTER_TYPE_UD);
5984       bld.SHR(instance_times_8, t, elk_imm_ud(instance_id_shift - 3));
5985       bld.ADD(invocation_id, instance_times_8, channels_ud);
5986    }
5987 }
5988 
5989 void
5990 elk_fs_visitor::emit_tcs_thread_end()
5991 {
5992    /* Try to tag the last URB write with EOT instead of emitting a whole
5993     * separate write just to finish the thread.  There isn't guaranteed to
5994     * be one, so this may not succeed.
5995     */
5996    if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
5997       return;
5998 
5999    const fs_builder bld = fs_builder(this).at_end();
6000 
6001    /* Emit a URB write to end the thread.  On Broadwell, we use this to write
6002     * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
6003     * algorithm to set it optimally).  On other platforms, we simply write
6004     * zero to a reserved/MBZ patch header DWord which has no consequence.
6005     */
6006    elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
6007    srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
6008    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = elk_imm_ud(WRITEMASK_X << 16);
6009    srcs[URB_LOGICAL_SRC_DATA] = elk_imm_ud(0);
6010    srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
6011    elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL,
6012                             reg_undef, srcs, ARRAY_SIZE(srcs));
6013    inst->eot = true;
6014 }
6015 
6016 bool
6017 elk_fs_visitor::run_tcs()
6018 {
6019    assert(stage == MESA_SHADER_TESS_CTRL);
6020 
6021    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
6022    const fs_builder bld = fs_builder(this).at_end();
6023 
6024    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
6025           vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
6026 
6027    payload_ = new elk_tcs_thread_payload(*this);
6028 
6029    /* Initialize gl_InvocationID */
6030    set_tcs_invocation_id();
6031 
6032    const bool fix_dispatch_mask =
6033       vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
6034       (nir->info.tess.tcs_vertices_out % 8) != 0;
6035 
6036    /* Fix the dispatch mask */
6037    if (fix_dispatch_mask) {
6038       bld.CMP(bld.null_reg_ud(), invocation_id,
6039               elk_imm_ud(nir->info.tess.tcs_vertices_out), ELK_CONDITIONAL_L);
6040       bld.IF(ELK_PREDICATE_NORMAL);
6041    }
6042 
6043    nir_to_elk(this);
6044 
6045    if (fix_dispatch_mask) {
6046       bld.emit(ELK_OPCODE_ENDIF);
6047    }
6048 
6049    emit_tcs_thread_end();
6050 
6051    if (failed)
6052       return false;
6053 
6054    calculate_cfg();
6055 
6056    optimize();
6057 
6058    assign_curb_setup();
6059    assign_tcs_urb_setup();
6060 
6061    fixup_3src_null_dest();
6062 
6063    allocate_registers(true /* allow_spilling */);
6064 
6065    return !failed;
6066 }
6067 
6068 bool
6069 elk_fs_visitor::run_tes()
6070 {
6071    assert(stage == MESA_SHADER_TESS_EVAL);
6072 
6073    payload_ = new elk_tes_thread_payload(*this);
6074 
6075    nir_to_elk(this);
6076 
6077    if (failed)
6078       return false;
6079 
6080    emit_urb_writes();
6081 
6082    calculate_cfg();
6083 
6084    optimize();
6085 
6086    assign_curb_setup();
6087    assign_tes_urb_setup();
6088 
6089    fixup_3src_null_dest();
6090 
6091    allocate_registers(true /* allow_spilling */);
6092 
6093    return !failed;
6094 }
6095 
6096 bool
6097 elk_fs_visitor::run_gs()
6098 {
6099    assert(stage == MESA_SHADER_GEOMETRY);
6100 
6101    payload_ = new elk_gs_thread_payload(*this);
6102 
6103    this->final_gs_vertex_count = vgrf(glsl_uint_type());
6104 
6105    if (gs_compile->control_data_header_size_bits > 0) {
6106       /* Create a VGRF to store accumulated control data bits. */
6107       this->control_data_bits = vgrf(glsl_uint_type());
6108 
6109       /* If we're outputting more than 32 control data bits, then EmitVertex()
6110        * will set control_data_bits to 0 after emitting the first vertex.
6111        * Otherwise, we need to initialize it to 0 here.
6112        */
6113       if (gs_compile->control_data_header_size_bits <= 32) {
6114          const fs_builder bld = fs_builder(this).at_end();
6115          const fs_builder abld = bld.annotate("initialize control data bits");
6116          abld.MOV(this->control_data_bits, elk_imm_ud(0u));
6117       }
6118    }
6119 
6120    nir_to_elk(this);
6121 
6122    emit_gs_thread_end();
6123 
6124    if (failed)
6125       return false;
6126 
6127    calculate_cfg();
6128 
6129    optimize();
6130 
6131    assign_curb_setup();
6132    assign_gs_urb_setup();
6133 
6134    fixup_3src_null_dest();
6135 
6136    allocate_registers(true /* allow_spilling */);
6137 
6138    return !failed;
6139 }
6140 
6141 bool
6142 elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
6143 {
6144    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(this->prog_data);
6145    elk_wm_prog_key *wm_key = (elk_wm_prog_key *) this->key;
6146    const fs_builder bld = fs_builder(this).at_end();
6147 
6148    assert(stage == MESA_SHADER_FRAGMENT);
6149 
6150    payload_ = new elk_fs_thread_payload(*this, source_depth_to_render_target,
6151                                     runtime_check_aads_emit);
6152 
6153    if (do_rep_send) {
6154       assert(dispatch_width == 16);
6155       emit_repclear_shader();
6156    } else {
6157       if (nir->info.inputs_read > 0 ||
6158           BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
6159           (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
6160          if (devinfo->ver < 6)
6161             emit_interpolation_setup_gfx4();
6162          else
6163             emit_interpolation_setup_gfx6();
6164       }
6165 
6166       /* We handle discards by keeping track of the still-live pixels in f0.1.
6167        * Initialize it with the dispatched pixels.
6168        */
6169       if (wm_prog_data->uses_kill) {
6170          const unsigned lower_width = MIN2(dispatch_width, 16);
6171          for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
6172             /* According to the "PS Thread Payload for Normal
6173              * Dispatch" pages on the BSpec, the dispatch mask is
6174              * stored in R1.7/R2.7 on gfx6+.
6175              */
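            /* E.g. (illustrative) in SIMD32 dispatch, lower_width is 16 and
             * the two iterations of this loop read R1.7 and R2.7
             * respectively.
             */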
6176             const elk_fs_reg dispatch_mask =
6177                devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) :
6178                elk_vec1_grf(0, 0);
6179             bld.exec_all().group(1, 0)
6180                .MOV(elk_sample_mask_reg(bld.group(lower_width, i)),
6181                     retype(dispatch_mask, ELK_REGISTER_TYPE_UW));
6182          }
6183       }
6184 
6185       if (nir->info.writes_memory)
6186          wm_prog_data->has_side_effects = true;
6187 
6188       nir_to_elk(this);
6189 
6190       if (failed)
6191          return false;
6192 
6193       if (wm_key->emit_alpha_test)
6194          emit_alpha_test();
6195 
6196       emit_fb_writes();
6197 
6198       calculate_cfg();
6199 
6200       optimize();
6201 
6202       assign_curb_setup();
6203 
6204       assign_urb_setup();
6205 
6206       fixup_3src_null_dest();
6207 
6208       allocate_registers(allow_spilling);
6209    }
6210 
6211    return !failed;
6212 }
6213 
6214 bool
6215 elk_fs_visitor::run_cs(bool allow_spilling)
6216 {
6217    assert(gl_shader_stage_is_compute(stage));
6218    assert(devinfo->ver >= 7);
6219    const fs_builder bld = fs_builder(this).at_end();
6220 
6221    payload_ = new elk_cs_thread_payload(*this);
6222 
6223    if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
6224       /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
6225       const fs_builder abld = bld.exec_all().group(1, 0);
6226       abld.MOV(retype(elk_sr0_reg(1), ELK_REGISTER_TYPE_UW),
6227                suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW), 1));
6228    }
6229 
6230    nir_to_elk(this);
6231 
6232    if (failed)
6233       return false;
6234 
6235    emit_cs_terminate();
6236 
6237    calculate_cfg();
6238 
6239    optimize();
6240 
6241    assign_curb_setup();
6242 
6243    fixup_3src_null_dest();
6244 
6245    allocate_registers(allow_spilling);
6246 
6247    return !failed;
6248 }
6249 
6250 static bool
6251 is_used_in_not_interp_frag_coord(nir_def *def)
6252 {
6253    nir_foreach_use_including_if(src, def) {
6254       if (nir_src_is_if(src))
6255          return true;
6256 
6257       if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
6258          return true;
6259 
6260       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
6261       if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
6262          return true;
6263    }
6264 
6265    return false;
6266 }
6267 
6268 /**
6269  * Return a bitfield where bit n is set if barycentric interpolation mode n
6270  * (see enum elk_barycentric_mode) is needed by the fragment shader.
6271  *
6272  * We examine the load_barycentric intrinsics rather than looking at input
6273  * variables so that we catch interpolateAtCentroid() messages too, which
6274  * also need the ELK_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
6275  */
6276 static unsigned
6277 elk_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
6278                                      const nir_shader *shader)
6279 {
6280    unsigned barycentric_interp_modes = 0;
6281 
6282    nir_foreach_function_impl(impl, shader) {
6283       nir_foreach_block(block, impl) {
6284          nir_foreach_instr(instr, block) {
6285             if (instr->type != nir_instr_type_intrinsic)
6286                continue;
6287 
6288             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6289             switch (intrin->intrinsic) {
6290             case nir_intrinsic_load_barycentric_pixel:
6291             case nir_intrinsic_load_barycentric_centroid:
6292             case nir_intrinsic_load_barycentric_sample:
6293             case nir_intrinsic_load_barycentric_at_sample:
6294             case nir_intrinsic_load_barycentric_at_offset:
6295                break;
6296             default:
6297                continue;
6298             }
6299 
6300             /* Ignore WPOS; it doesn't require interpolation. */
6301             if (!is_used_in_not_interp_frag_coord(&intrin->def))
6302                continue;
6303 
6304             nir_intrinsic_op bary_op = intrin->intrinsic;
6305             enum elk_barycentric_mode bary =
6306                elk_barycentric_mode(intrin);
6307 
6308             barycentric_interp_modes |= 1 << bary;
6309 
6310             if (devinfo->needs_unlit_centroid_workaround &&
6311                 bary_op == nir_intrinsic_load_barycentric_centroid)
6312                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
6313          }
6314       }
6315    }
6316 
6317    return barycentric_interp_modes;
6318 }
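/* Illustrative outcome: a shader whose only interpolated inputs use plain
 * perspective pixel interpolation ends up with just the
 * ELK_BARYCENTRIC_PERSPECTIVE_PIXEL bit set, while on parts needing the
 * unlit centroid workaround each centroid mode additionally sets the
 * corresponding pixel mode via centroid_to_pixel().
 */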
6319 
6320 static void
6321 elk_compute_flat_inputs(struct elk_wm_prog_data *prog_data,
6322                         const nir_shader *shader)
6323 {
6324    prog_data->flat_inputs = 0;
6325 
6326    nir_foreach_shader_in_variable(var, shader) {
6327       /* flat shading */
6328       if (var->data.interpolation != INTERP_MODE_FLAT)
6329          continue;
6330 
6331       if (var->data.per_primitive)
6332          continue;
6333 
6334       unsigned slots = glsl_count_attribute_slots(var->type, false);
6335       for (unsigned s = 0; s < slots; s++) {
6336          int input_index = prog_data->urb_setup[var->data.location + s];
6337 
6338          if (input_index >= 0)
6339             prog_data->flat_inputs |= 1 << input_index;
6340       }
6341    }
6342 }
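/* E.g. (illustrative): a flat-qualified ivec4 input occupying a single
 * varying slot sets exactly one bit in flat_inputs, at the urb_setup index
 * assigned to its location; per-primitive and non-flat inputs contribute
 * nothing.
 */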
6343 
6344 static uint8_t
6345 computed_depth_mode(const nir_shader *shader)
6346 {
6347    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
6348       switch (shader->info.fs.depth_layout) {
6349       case FRAG_DEPTH_LAYOUT_NONE:
6350       case FRAG_DEPTH_LAYOUT_ANY:
6351          return ELK_PSCDEPTH_ON;
6352       case FRAG_DEPTH_LAYOUT_GREATER:
6353          return ELK_PSCDEPTH_ON_GE;
6354       case FRAG_DEPTH_LAYOUT_LESS:
6355          return ELK_PSCDEPTH_ON_LE;
6356       case FRAG_DEPTH_LAYOUT_UNCHANGED:
6357          /* We initially set this to OFF, but having the shader write the
6358           * depth means we allocate register space in the SEND message. The
6359           * difference between the SEND register count and the OFF state
6360           * programming makes the HW hang.
6361           *
6362           * Removing the depth writes also leads to test failures. So use
6363           * LesserThanOrEqual, which fits writing the same value
6364           * (unchanged/equal).
6365           *
6366           */
6367          return ELK_PSCDEPTH_ON_LE;
6368       }
6369    }
6370    return ELK_PSCDEPTH_OFF;
6371 }
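/* E.g. (illustrative): a fragment shader declaring
 * layout(depth_greater) out float gl_FragDepth reports
 * FRAG_DEPTH_LAYOUT_GREATER and yields ELK_PSCDEPTH_ON_GE, while a shader
 * that never writes gl_FragDepth gets ELK_PSCDEPTH_OFF.
 */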
6372 
6373 /**
6374  * Move load_interpolated_input with simple (payload-based) barycentric modes
6375  * to the top of the program so we don't emit multiple PLNs for the same input.
6376  *
6377  * This works around CSE not being able to handle non-dominating cases
6378  * such as:
6379  *
6380  *    if (...) {
6381  *       interpolate input
6382  *    } else {
6383  *       interpolate the same exact input
6384  *    }
6385  *
6386  * This should be replaced by global value numbering someday.
6387  */
6388 bool
6389 elk_nir_move_interpolation_to_top(nir_shader *nir)
6390 {
6391    bool progress = false;
6392 
6393    nir_foreach_function_impl(impl, nir) {
6394       nir_block *top = nir_start_block(impl);
6395       nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
6396       bool impl_progress = false;
6397 
6398       for (nir_block *block = nir_block_cf_tree_next(top);
6399            block != NULL;
6400            block = nir_block_cf_tree_next(block)) {
6401 
6402          nir_foreach_instr_safe(instr, block) {
6403             if (instr->type != nir_instr_type_intrinsic)
6404                continue;
6405 
6406             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6407             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
6408                continue;
6409             nir_intrinsic_instr *bary_intrinsic =
6410                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
6411             nir_intrinsic_op op = bary_intrinsic->intrinsic;
6412 
6413             /* Leave interpolateAtSample/Offset() where they are. */
6414             if (op == nir_intrinsic_load_barycentric_at_sample ||
6415                 op == nir_intrinsic_load_barycentric_at_offset)
6416                continue;
6417 
6418             nir_instr *move[3] = {
6419                &bary_intrinsic->instr,
6420                intrin->src[1].ssa->parent_instr,
6421                instr
6422             };
6423 
6424             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
6425                if (move[i]->block != top) {
6426                   nir_instr_move(cursor, move[i]);
6427                   impl_progress = true;
6428                }
6429             }
6430          }
6431       }
6432 
6433       progress = progress || impl_progress;
6434 
6435       nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
6436                                                 : nir_metadata_all);
6437    }
6438 
6439    return progress;
6440 }
6441 
6442 static void
6443 elk_nir_populate_wm_prog_data(nir_shader *shader,
6444                               const struct intel_device_info *devinfo,
6445                               const struct elk_wm_prog_key *key,
6446                               struct elk_wm_prog_data *prog_data)
6447 {
6448    /* key->emit_alpha_test means we simulate alpha testing via discards,
6449     * so the shader definitely kills pixels.
6450     */
6451    prog_data->uses_kill = shader->info.fs.uses_discard ||
6452                           key->emit_alpha_test;
6453    prog_data->uses_omask = !key->ignore_sample_mask_out &&
6454       (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
6455    prog_data->color_outputs_written = key->color_outputs_valid;
6456    prog_data->computed_depth_mode = computed_depth_mode(shader);
6457    prog_data->computed_stencil =
6458       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
6459 
6460    prog_data->sample_shading =
6461       shader->info.fs.uses_sample_shading ||
6462       shader->info.outputs_read;
6463 
6464    assert(key->multisample_fbo != ELK_NEVER ||
6465           key->persample_interp == ELK_NEVER);
6466 
6467    prog_data->persample_dispatch = key->persample_interp;
6468    if (prog_data->sample_shading)
6469       prog_data->persample_dispatch = ELK_ALWAYS;
6470 
6471    /* We can only use per-sample dispatch if we have a multisample FBO */
6472    prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
6473                                         key->multisample_fbo);
6474 
6475    /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
6476     * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
6477     * to definitively tell whether alpha_to_coverage is on or off.
6478     */
6479    prog_data->alpha_to_coverage = key->alpha_to_coverage;
6480    assert(prog_data->alpha_to_coverage != ELK_SOMETIMES ||
6481           prog_data->persample_dispatch == ELK_SOMETIMES);
6482 
6483    if (devinfo->ver >= 6) {
6484       prog_data->uses_sample_mask =
6485          BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
6486 
6487       /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
6488        *
6489        *    "MSDISPMODE_PERSAMPLE is required in order to select
6490        *    POSOFFSET_SAMPLE"
6491        *
6492        * So we can only really get sample positions if we are doing real
6493        * per-sample dispatch.  If we need gl_SamplePosition and we don't have
6494        * persample dispatch, we hard-code it to 0.5.
6495        */
6496       prog_data->uses_pos_offset =
6497          prog_data->persample_dispatch != ELK_NEVER &&
6498          (BITSET_TEST(shader->info.system_values_read,
6499                       SYSTEM_VALUE_SAMPLE_POS) ||
6500           BITSET_TEST(shader->info.system_values_read,
6501                       SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
6502    }
6503 
6504    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
6505    prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
6506    prog_data->inner_coverage = shader->info.fs.inner_coverage;
6507 
6508    prog_data->barycentric_interp_modes =
6509       elk_compute_barycentric_interp_modes(devinfo, shader);
6510 
6511    /* From the BDW PRM documentation for 3DSTATE_WM:
6512     *
6513     *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
6514     *     Sample or Non- perspective Sample barycentric coordinates."
6515     *
6516     * So clean up any potentially set sample barycentric mode when not doing
6517     * per-sample dispatch.
6518     */
6519    if (prog_data->persample_dispatch == ELK_NEVER) {
6520       prog_data->barycentric_interp_modes &=
6521          ~BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
6522    }
6523 
6524    prog_data->uses_nonperspective_interp_modes |=
6525       (prog_data->barycentric_interp_modes &
6526       ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
6527 
6528    /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
6529     * Message Descriptor :
6530     *
6531     *    "Message Type. Specifies the type of message being sent when
6532     *     pixel-rate evaluation is requested :
6533     *
6534     *     Format = U2
6535     *       0: Per Message Offset (eval_snapped with immediate offset)
6536     *       1: Sample Position Offset (eval_sindex)
6537     *       2: Centroid Position Offset (eval_centroid)
6538     *       3: Per Slot Offset (eval_snapped with register offset)
6539     *
6540     *     Message Type. Specifies the type of message being sent when
6541     *     coarse-rate evaluation is requested :
6542     *
6543     *     Format = U2
6544     *       0: Coarse to Pixel Mapping Message (internal message)
6545     *       1: Reserved
6546     *       2: Coarse Centroid Position (eval_centroid)
6547     *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
6548     *
6549     * The Sample Position Offset is marked as reserved for coarse rate
6550     * evaluation and leads to hangs if we try to use it. So disable coarse
6551     * pixel shading if we have any intrinsic that will result in a pixel
6552     * interpolater message at sample.
6553     */
6554    intel_nir_pulls_at_sample(shader);
6555 
6556    /* We choose to always enable VMask prior to XeHP, as it would cause
6557     * us to lose out on the eliminate_find_live_channel() optimization.
6558     */
6559    prog_data->uses_vmask = true;
6560 
6561    prog_data->uses_src_w =
6562       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6563    prog_data->uses_src_depth =
6564       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6565 
6566    calculate_urb_setup(devinfo, key, prog_data, shader);
6567    elk_compute_flat_inputs(prog_data, shader);
6568 }
6569 
6570 /**
6571  * Pre-gfx6, the register file of the EUs was shared between threads,
6572  * and each thread used some subset allocated on a 16-register block
6573  * granularity.  The unit states wanted these block counts.
6574  */
6575 static inline int
6576 elk_register_blocks(int reg_count)
6577 {
6578    return ALIGN(reg_count, 16) / 16 - 1;
6579 }
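/* Illustrative arithmetic: elk_register_blocks(24) = ALIGN(24, 16) / 16 - 1
 * = 32 / 16 - 1 = 1, i.e. two 16-register blocks expressed in the minus-one
 * encoding the unit state expects.
 */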
6580 
6581 const unsigned *
6582 elk_compile_fs(const struct elk_compiler *compiler,
6583                struct elk_compile_fs_params *params)
6584 {
6585    struct nir_shader *nir = params->base.nir;
6586    const struct elk_wm_prog_key *key = params->key;
6587    struct elk_wm_prog_data *prog_data = params->prog_data;
6588    bool allow_spilling = params->allow_spilling;
6589    const bool debug_enabled =
6590       elk_should_print_shader(nir, params->base.debug_flag ?
6591                                    params->base.debug_flag : DEBUG_WM);
6592 
6593    prog_data->base.stage = MESA_SHADER_FRAGMENT;
6594    prog_data->base.total_scratch = 0;
6595 
6596    const struct intel_device_info *devinfo = compiler->devinfo;
6597    const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
6598 
6599    elk_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
6600    elk_nir_lower_fs_inputs(nir, devinfo, key);
6601    elk_nir_lower_fs_outputs(nir);
6602 
6603    if (devinfo->ver < 6)
6604       elk_setup_vue_interpolation(params->vue_map, nir, prog_data);
6605 
6606    /* From the SKL PRM, Volume 7, "Alpha Coverage":
6607     *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
6608     *   hardware, regardless of the state setting for this feature."
6609     */
6610    if (devinfo->ver > 6 && key->alpha_to_coverage != ELK_NEVER) {
6611       /* Run constant fold optimization in order to get the correct source
6612        * offset needed to identify the render target 0 store instruction in
6613        * the emit_alpha_to_coverage pass.
6614        */
6615       NIR_PASS(_, nir, nir_opt_constant_folding);
6616       NIR_PASS(_, nir, elk_nir_lower_alpha_to_coverage, key, prog_data);
6617    }
6618 
6619    NIR_PASS(_, nir, elk_nir_move_interpolation_to_top);
6620    elk_postprocess_nir(nir, compiler, debug_enabled,
6621                        key->base.robust_flags);
6622 
6623    elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
6624 
6625    std::unique_ptr<elk_fs_visitor> v8, v16, v32, vmulti;
6626    elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
6627    float throughput = 0;
6628    bool has_spilled = false;
6629 
6630    v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6631                                      prog_data, nir, 8,
6632                                      params->base.stats != NULL,
6633                                      debug_enabled);
6634    if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
6635       params->base.error_str = ralloc_strdup(params->base.mem_ctx,
6636                                              v8->fail_msg);
6637       return NULL;
6638    } else if (INTEL_SIMD(FS, 8)) {
6639       simd8_cfg = v8->cfg;
6640 
6641       assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
6642       prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
6643 
6644       prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
6645       const performance &perf = v8->performance_analysis.require();
6646       throughput = MAX2(throughput, perf.throughput);
6647       has_spilled = v8->spilled_any_registers;
6648       allow_spilling = false;
6649    }
6650 
6651    /* Limit dispatch width to simd8 with dual source blending on gfx8.
6652     * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
6653     */
6654    if (devinfo->ver == 8 && prog_data->dual_src_blend &&
6655        INTEL_SIMD(FS, 8)) {
6656       assert(!params->use_rep_send);
6657       v8->limit_dispatch_width(8, "gfx8 workaround: "
6658                                "using SIMD8 when dual src blending.\n");
6659    }
6660 
6661    if (!has_spilled &&
6662        (!v8 || v8->max_dispatch_width >= 16) &&
6663        (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
6664       /* Try a SIMD16 compile */
6665       v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6666                                          prog_data, nir, 16,
6667                                          params->base.stats != NULL,
6668                                          debug_enabled);
6669       if (v8)
6670          v16->import_uniforms(v8.get());
6671       if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
6672          elk_shader_perf_log(compiler, params->base.log_data,
6673                              "SIMD16 shader failed to compile: %s\n",
6674                              v16->fail_msg);
6675       } else {
6676          simd16_cfg = v16->cfg;
6677 
6678          assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
6679          prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
6680 
6681          prog_data->reg_blocks_16 = elk_register_blocks(v16->grf_used);
6682          const performance &perf = v16->performance_analysis.require();
6683          throughput = MAX2(throughput, perf.throughput);
6684          has_spilled = v16->spilled_any_registers;
6685          allow_spilling = false;
6686       }
6687    }
6688 
6689    const bool simd16_failed = v16 && !simd16_cfg;
6690 
6691    /* Currently, the compiler only supports SIMD32 on SNB+ */
6692    if (!has_spilled &&
6693        (!v8 || v8->max_dispatch_width >= 32) &&
6694        (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
6695        devinfo->ver >= 6 && !simd16_failed &&
6696        INTEL_SIMD(FS, 32)) {
6697       /* Try a SIMD32 compile */
6698       v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6699                                          prog_data, nir, 32,
6700                                          params->base.stats != NULL,
6701                                          debug_enabled);
6702       if (v8)
6703          v32->import_uniforms(v8.get());
6704       else if (v16)
6705          v32->import_uniforms(v16.get());
6706 
6707       if (!v32->run_fs(allow_spilling, false)) {
6708          elk_shader_perf_log(compiler, params->base.log_data,
6709                              "SIMD32 shader failed to compile: %s\n",
6710                              v32->fail_msg);
6711       } else {
6712          const performance &perf = v32->performance_analysis.require();
6713 
6714          if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
6715             elk_shader_perf_log(compiler, params->base.log_data,
6716                                 "SIMD32 shader inefficient\n");
6717          } else {
6718             simd32_cfg = v32->cfg;
6719 
6720             assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
6721             prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
6722 
6723             prog_data->reg_blocks_32 = elk_register_blocks(v32->grf_used);
6724             throughput = MAX2(throughput, perf.throughput);
6725          }
6726       }
6727    }
6728 
6729    /* When the caller requests a repclear shader, they want SIMD16-only */
6730    if (params->use_rep_send)
6731       simd8_cfg = NULL;
6732 
6733    /* Prior to Iron Lake, the PS had a single shader offset with a jump table
6734     * at the top to select the shader.  We've never implemented that.
6735     * Instead, we just give them exactly one shader and we pick the widest one
6736     * available.
6737     */
6738    if (compiler->devinfo->ver < 5) {
6739       if (simd32_cfg || simd16_cfg)
6740          simd8_cfg = NULL;
6741       if (simd32_cfg)
6742          simd16_cfg = NULL;
6743    }
6744 
6745    /* If computed depth is enabled, SNB only allows SIMD8. */
6746    if (compiler->devinfo->ver == 6 &&
6747        prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF)
6748       assert(simd16_cfg == NULL && simd32_cfg == NULL);
6749 
6750    if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
6751       /* Iron Lake and earlier only have one Dispatch GRF start field.  Make
6752        * the data available in the base prog data struct for convenience.
6753        */
6754       if (simd16_cfg) {
6755          prog_data->base.dispatch_grf_start_reg =
6756             prog_data->dispatch_grf_start_reg_16;
6757       } else if (simd32_cfg) {
6758          prog_data->base.dispatch_grf_start_reg =
6759             prog_data->dispatch_grf_start_reg_32;
6760       }
6761    }
6762 
6763    elk_fs_generator g(compiler, &params->base, &prog_data->base,
6764                   v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
6765 
6766    if (unlikely(debug_enabled)) {
6767       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
6768                                      "%s fragment shader %s",
6769                                      nir->info.label ?
6770                                         nir->info.label : "unnamed",
6771                                      nir->info.name));
6772    }
6773 
6774    struct elk_compile_stats *stats = params->base.stats;
6775    uint32_t max_dispatch_width = 0;
6776 
6777    if (simd8_cfg) {
6778       prog_data->dispatch_8 = true;
6779       g.generate_code(simd8_cfg, 8, v8->shader_stats,
6780                       v8->performance_analysis.require(), stats);
6781       stats = stats ? stats + 1 : NULL;
6782       max_dispatch_width = 8;
6783    }
6784 
6785    if (simd16_cfg) {
6786       prog_data->dispatch_16 = true;
6787       prog_data->prog_offset_16 = g.generate_code(
6788          simd16_cfg, 16, v16->shader_stats,
6789          v16->performance_analysis.require(), stats);
6790       stats = stats ? stats + 1 : NULL;
6791       max_dispatch_width = 16;
6792    }
6793 
6794    if (simd32_cfg) {
6795       prog_data->dispatch_32 = true;
6796       prog_data->prog_offset_32 = g.generate_code(
6797          simd32_cfg, 32, v32->shader_stats,
6798          v32->performance_analysis.require(), stats);
6799       stats = stats ? stats + 1 : NULL;
6800       max_dispatch_width = 32;
6801    }
6802 
6803    for (struct elk_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
6804       s->max_dispatch_width = max_dispatch_width;
6805 
6806    g.add_const_data(nir->constant_data, nir->constant_data_size);
6807    return g.get_assembly();
6808 }
6809 
6810 unsigned
6811 elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
6812                              unsigned threads)
6813 {
6814    assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
6815    assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
6816    return cs_prog_data->push.per_thread.size * threads +
6817           cs_prog_data->push.cross_thread.size;
6818 }
6819 
6820 static void
6821 fill_push_const_block_info(struct elk_push_const_block *block, unsigned dwords)
6822 {
6823    block->dwords = dwords;
6824    block->regs = DIV_ROUND_UP(dwords, 8);
6825    block->size = block->regs * 32;
6826 }
6827 
6828 static void
6829 cs_fill_push_const_info(const struct intel_device_info *devinfo,
6830                         struct elk_cs_prog_data *cs_prog_data)
6831 {
6832    const struct elk_stage_prog_data *prog_data = &cs_prog_data->base;
6833    int subgroup_id_index = elk_get_subgroup_id_param_index(devinfo, prog_data);
6834    bool cross_thread_supported = devinfo->verx10 >= 75;
6835 
6836    /* The thread ID should be stored in the last param dword */
6837    assert(subgroup_id_index == -1 ||
6838           subgroup_id_index == (int)prog_data->nr_params - 1);
6839 
6840    unsigned cross_thread_dwords, per_thread_dwords;
6841    if (!cross_thread_supported) {
6842       cross_thread_dwords = 0u;
6843       per_thread_dwords = prog_data->nr_params;
6844    } else if (subgroup_id_index >= 0) {
6845       /* Fill all but the last register with cross-thread payload */
6846       cross_thread_dwords = 8 * (subgroup_id_index / 8);
6847       per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
6848       assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
6849    } else {
6850       /* Fill all data using cross-thread payload */
6851       cross_thread_dwords = prog_data->nr_params;
6852       per_thread_dwords = 0u;
6853    }
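   /* Worked example (illustrative): nr_params = 10 with the subgroup ID in
    * the last slot (subgroup_id_index = 9) on a cross-thread capable part
    * gives cross_thread_dwords = 8 * (9 / 8) = 8 and per_thread_dwords = 2,
    * i.e. one 32-byte cross-thread register plus one per-thread register per
    * thread.
    */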
6854 
6855    fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
6856    fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
6857 
6858    assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
6859           cs_prog_data->push.per_thread.size == 0);
6860    assert(cs_prog_data->push.cross_thread.dwords +
6861           cs_prog_data->push.per_thread.dwords ==
6862              prog_data->nr_params);
6863 }
6864 
6865 static bool
6866 filter_simd(const nir_instr *instr, const void * /* options */)
6867 {
6868    if (instr->type != nir_instr_type_intrinsic)
6869       return false;
6870 
6871    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
6872    case nir_intrinsic_load_simd_width_intel:
6873    case nir_intrinsic_load_subgroup_id:
6874       return true;
6875 
6876    default:
6877       return false;
6878    }
6879 }
6880 
6881 static nir_def *
6882 lower_simd(nir_builder *b, nir_instr *instr, void *options)
6883 {
6884    uintptr_t simd_width = (uintptr_t)options;
6885 
6886    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
6887    case nir_intrinsic_load_simd_width_intel:
6888       return nir_imm_int(b, simd_width);
6889 
6890    case nir_intrinsic_load_subgroup_id:
6891       /* If the whole workgroup fits in one thread, we can lower subgroup_id
6892        * to a constant zero.
6893        */
6894       if (!b->shader->info.workgroup_size_variable) {
6895          unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
6896                                          b->shader->info.workgroup_size[1] *
6897                                          b->shader->info.workgroup_size[2];
6898          if (local_workgroup_size <= simd_width)
6899             return nir_imm_int(b, 0);
6900       }
6901       return NULL;
6902 
6903    default:
6904       return NULL;
6905    }
6906 }
6907 
6908 bool
6909 elk_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
6910 {
6911    return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
6912                                  (void *)(uintptr_t)dispatch_width);
6913 }
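/* For example (illustrative): elk_nir_lower_simd(shader, 16) rewrites
 * load_simd_width_intel to the constant 16 and, for a fixed 8x2x1 workgroup
 * (16 invocations <= 16), folds load_subgroup_id to 0; with a variable
 * workgroup size the subgroup ID is left untouched.
 */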
6914 
6915 const unsigned *
6916 elk_compile_cs(const struct elk_compiler *compiler,
6917                struct elk_compile_cs_params *params)
6918 {
6919    const nir_shader *nir = params->base.nir;
6920    const struct elk_cs_prog_key *key = params->key;
6921    struct elk_cs_prog_data *prog_data = params->prog_data;
6922 
6923    const bool debug_enabled =
6924       elk_should_print_shader(nir, params->base.debug_flag ?
6925                                    params->base.debug_flag : DEBUG_CS);
6926 
6927    prog_data->base.stage = MESA_SHADER_COMPUTE;
6928    prog_data->base.total_shared = nir->info.shared_size;
6929    prog_data->base.total_scratch = 0;
6930 
6931    if (!nir->info.workgroup_size_variable) {
6932       prog_data->local_size[0] = nir->info.workgroup_size[0];
6933       prog_data->local_size[1] = nir->info.workgroup_size[1];
6934       prog_data->local_size[2] = nir->info.workgroup_size[2];
6935    }
6936 
6937    elk_simd_selection_state simd_state{
6938       .devinfo = compiler->devinfo,
6939       .prog_data = prog_data,
6940       .required_width = elk_required_dispatch_width(&nir->info),
6941    };
6942 
6943    std::unique_ptr<elk_fs_visitor> v[3];
6944 
6945    for (unsigned simd = 0; simd < 3; simd++) {
6946       if (!elk_simd_should_compile(simd_state, simd))
6947          continue;
6948 
6949       const unsigned dispatch_width = 8u << simd;
6950 
6951       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
6952       elk_nir_apply_key(shader, compiler, &key->base,
6953                         dispatch_width);
6954 
6955       NIR_PASS(_, shader, elk_nir_lower_simd, dispatch_width);
6956 
6957       /* Clean up after the local index and ID calculations. */
6958       NIR_PASS(_, shader, nir_opt_constant_folding);
6959       NIR_PASS(_, shader, nir_opt_dce);
6960 
6961       elk_postprocess_nir(shader, compiler, debug_enabled,
6962                           key->base.robust_flags);
6963 
6964       v[simd] = std::make_unique<elk_fs_visitor>(compiler, &params->base,
6965                                              &key->base,
6966                                              &prog_data->base,
6967                                              shader, dispatch_width,
6968                                              params->base.stats != NULL,
6969                                              debug_enabled);
6970 
6971       const int first = elk_simd_first_compiled(simd_state);
6972       if (first >= 0)
6973          v[simd]->import_uniforms(v[first].get());
6974 
6975       const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
6976 
6977       if (v[simd]->run_cs(allow_spilling)) {
6978          cs_fill_push_const_info(compiler->devinfo, prog_data);
6979 
6980          elk_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
6981       } else {
6982          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
6983          if (simd > 0) {
6984             elk_shader_perf_log(compiler, params->base.log_data,
6985                                 "SIMD%u shader failed to compile: %s\n",
6986                                 dispatch_width, v[simd]->fail_msg);
6987          }
6988       }
6989    }
6990 
6991    const int selected_simd = elk_simd_select(simd_state);
6992    if (selected_simd < 0) {
6993       params->base.error_str =
6994          ralloc_asprintf(params->base.mem_ctx,
6995                          "Can't compile shader: "
6996                          "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
6997                          simd_state.error[0], simd_state.error[1],
6998                          simd_state.error[2]);
6999       return NULL;
7000    }
7001 
7002    assert(selected_simd < 3);
7003    elk_fs_visitor *selected = v[selected_simd].get();
7004 
7005    if (!nir->info.workgroup_size_variable)
7006       prog_data->prog_mask = 1 << selected_simd;
7007 
7008    elk_fs_generator g(compiler, &params->base, &prog_data->base,
7009                   selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
7010    if (unlikely(debug_enabled)) {
7011       char *name = ralloc_asprintf(params->base.mem_ctx,
7012                                    "%s compute shader %s",
7013                                    nir->info.label ?
7014                                    nir->info.label : "unnamed",
7015                                    nir->info.name);
7016       g.enable_debug(name);
7017    }
7018 
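   /* Emit native code for every SIMD variant that was kept, recording each
    * variant's offset into the program and filling in one stats entry per
    * variant.
    */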
7019    uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
7020 
7021    struct elk_compile_stats *stats = params->base.stats;
7022    for (unsigned simd = 0; simd < 3; simd++) {
7023       if (prog_data->prog_mask & (1u << simd)) {
7024          assert(v[simd]);
7025          prog_data->prog_offset[simd] =
7026             g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
7027                             v[simd]->performance_analysis.require(), stats);
7028          if (stats)
7029             stats->max_dispatch_width = max_dispatch_width;
7030          stats = stats ? stats + 1 : NULL;
7031          max_dispatch_width = 8u << simd;
7032       }
7033    }
7034 
7035    g.add_const_data(nir->constant_data, nir->constant_data_size);
7036 
7037    return g.get_assembly();
7038 }
7039 
7040 struct intel_cs_dispatch_info
7041 elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
7042                          const struct elk_cs_prog_data *prog_data,
7043                          const unsigned *override_local_size)
7044 {
7045    struct intel_cs_dispatch_info info = {};
7046 
7047    const unsigned *sizes =
7048       override_local_size ? override_local_size :
7049                             prog_data->local_size;
7050 
7051    const int simd = elk_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
7052    assert(simd >= 0 && simd < 3);
7053 
7054    info.group_size = sizes[0] * sizes[1] * sizes[2];
7055    info.simd_size = 8u << simd;
7056    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
7057 
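   /* right_mask is the execution mask for the last, possibly partial, thread
    * of the workgroup.  For example, a group size of 20 at SIMD8 leaves a
    * remainder of 4 and a mask of 0xf; when the group size is a multiple of
    * the SIMD width the mask covers all channels.
    */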
7058    const uint32_t remainder = info.group_size & (info.simd_size - 1);
7059    if (remainder > 0)
7060       info.right_mask = ~0u >> (32 - remainder);
7061    else
7062       info.right_mask = ~0u >> (32 - info.simd_size);
7063 
7064    return info;
7065 }
7066 
7067 uint64_t
7068 elk_bsr(const struct intel_device_info *devinfo,
7069         uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
7070 {
7071    assert(offset % 64 == 0);
7072    assert(simd_size == 8 || simd_size == 16);
7073    assert(local_arg_offset % 8 == 0);
7074 
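   /* The offset is 64-byte aligned, so its low bits are free to carry the
    * SIMD8 flag (bit 4) and the local argument offset in 8-byte units
    * (bits 2:0).
    */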
7075    return offset |
7076           SET_BITS(simd_size == 8, 4, 4) |
7077           SET_BITS(local_arg_offset / 8, 2, 0);
7078 }
7079 
7080 /**
7081  * Test the dispatch mask packing assumptions of
7082  * elk_stage_has_packed_dispatch().  Call this from e.g. the top of
7083  * elk_fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
7084  * executed with an unexpected dispatch mask.
7085  */
7086 static UNUSED void
7087 elk_fs_test_dispatch_packing(const fs_builder &bld)
7088 {
7089    const elk_fs_visitor *shader = static_cast<const elk_fs_visitor *>(bld.shader);
7090    const gl_shader_stage stage = shader->stage;
7091    const bool uses_vmask =
7092       stage == MESA_SHADER_FRAGMENT &&
7093       elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;
7094 
7095    if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
7096                                      shader->stage_prog_data)) {
7097       const fs_builder ubld = bld.exec_all().group(1, 0);
7098       const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);
7099       const elk_fs_reg mask = uses_vmask ? elk_vmask_reg() : elk_dmask_reg();
7100 
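      /* tmp = (mask + 1) & mask, which is zero exactly when the mask has the
       * packed form 2^n-1, i.e. all enabled channels are contiguous from
       * bit 0.
       */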
7101       ubld.ADD(tmp, mask, elk_imm_ud(1));
7102       ubld.AND(tmp, mask, tmp);
7103 
7104       /* This will loop forever if the dispatch mask doesn't have the expected
7105        * form '2^n-1', in which case tmp will be non-zero.
7106        */
7107       bld.emit(ELK_OPCODE_DO);
7108       bld.CMP(bld.null_reg_ud(), tmp, elk_imm_ud(0), ELK_CONDITIONAL_NZ);
7109       set_predicate(ELK_PREDICATE_NORMAL, bld.emit(ELK_OPCODE_WHILE));
7110    }
7111 }
7112 
7113 unsigned
7114 elk_fs_visitor::workgroup_size() const
7115 {
7116    assert(gl_shader_stage_uses_workgroup(stage));
7117    const struct elk_cs_prog_data *cs = elk_cs_prog_data(prog_data);
7118    return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
7119 }
7120 
7121 bool elk_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
7122 {
7123    return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
7124 }
7125 
7126 namespace elk {
7127    elk_fs_reg
7128    fetch_payload_reg(const elk::fs_builder &bld, uint8_t regs[2],
7129                      elk_reg_type type, unsigned n)
7130    {
7131       if (!regs[0])
7132          return elk_fs_reg();
7133 
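      /* For SIMD32 the payload is split across two register ranges, one per
       * SIMD16 half, so gather the n components from both halves into a
       * temporary with LOAD_PAYLOAD; narrower dispatch widths can use
       * regs[0] in place.
       */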
7134       if (bld.dispatch_width() > 16) {
7135          const elk_fs_reg tmp = bld.vgrf(type, n);
7136          const elk::fs_builder hbld = bld.exec_all().group(16, 0);
7137          const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
7138          elk_fs_reg *const components = new elk_fs_reg[m * n];
7139 
7140          for (unsigned c = 0; c < n; c++) {
7141             for (unsigned g = 0; g < m; g++)
7142                components[c * m + g] =
7143                   offset(retype(elk_vec8_grf(regs[g], 0), type), hbld, c);
7144          }
7145 
7146          hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
7147 
7148          delete[] components;
7149          return tmp;
7150 
7151       } else {
7152          return elk_fs_reg(retype(elk_vec8_grf(regs[0], 0), type));
7153       }
7154    }
7155 
7156    elk_fs_reg
7157    fetch_barycentric_reg(const elk::fs_builder &bld, uint8_t regs[2])
7158    {
7159       if (!regs[0])
7160          return elk_fs_reg();
7161 
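      /* The payload interleaves the two barycentric coordinates per SIMD8
       * group (X0-7, Y0-7, X8-15, Y8-15, ... within each register range), as
       * implied by the offsets used below; regroup them into the IR layout
       * where component 0 holds X and component 1 holds Y for all channels.
       */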
7162       const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
7163       const elk::fs_builder hbld = bld.exec_all().group(8, 0);
7164       const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
7165       elk_fs_reg *const components = new elk_fs_reg[2 * m];
7166 
7167       for (unsigned c = 0; c < 2; c++) {
7168          for (unsigned g = 0; g < m; g++)
7169             components[c * m + g] = offset(elk_vec8_grf(regs[g / 2], 0),
7170                                            hbld, c + 2 * (g % 2));
7171       }
7172 
7173       hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
7174 
7175       delete[] components;
7176       return tmp;
7177    }
7178 
7179    void
7180    check_dynamic_msaa_flag(const fs_builder &bld,
7181                            const struct elk_wm_prog_data *wm_prog_data,
7182                            enum intel_msaa_flags flag)
7183    {
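      /* AND the dynamic MSAA flags with the requested flag, writing only the
       * flag register via the NZ conditional mod so a following predicated
       * instruction can test whether the flag is set.
       */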
7184       elk_fs_inst *inst = bld.AND(bld.null_reg_ud(),
7185                               dynamic_msaa_flags(wm_prog_data),
7186                               elk_imm_ud(flag));
7187       inst->conditional_mod = ELK_CONDITIONAL_NZ;
7188    }
7189 }
7190