xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/elk/elk_fs_nir.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_fs.h"
25 #include "elk_fs_builder.h"
26 #include "elk_nir.h"
27 #include "elk_nir_private.h"
28 #include "elk_eu.h"
29 #include "nir.h"
30 #include "nir_intrinsics.h"
31 #include "nir_search_helpers.h"
32 #include "util/u_math.h"
33 #include "util/bitscan.h"
34 
35 #include <vector>
36 
37 using namespace elk;
38 
39 struct elk_fs_bind_info {
40    bool valid;
41    bool bindless;
42    unsigned block;
43    unsigned set;
44    unsigned binding;
45 };
46 
47 struct nir_to_elk_state {
48    elk_fs_visitor &s;
49    const nir_shader *nir;
50    const intel_device_info *devinfo;
51    void *mem_ctx;
52 
53    /* Points to the end of the program.  Annotated with the current NIR
54     * instruction when applicable.
55     */
56    fs_builder bld;
57 
58    elk_fs_reg *ssa_values;
59    elk_fs_inst **resource_insts;
60    struct elk_fs_bind_info *ssa_bind_infos;
61    elk_fs_reg *resource_values;
62    elk_fs_reg *system_values;
63 };
64 
65 static elk_fs_reg get_nir_src(nir_to_elk_state &ntb, const nir_src &src);
66 static elk_fs_reg get_nir_def(nir_to_elk_state &ntb, const nir_def &def);
67 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
68 
69 static void fs_nir_emit_intrinsic(nir_to_elk_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
70 static elk_fs_reg emit_samplepos_setup(nir_to_elk_state &ntb);
71 static elk_fs_reg emit_sampleid_setup(nir_to_elk_state &ntb);
72 static elk_fs_reg emit_samplemaskin_setup(nir_to_elk_state &ntb);
73 
74 static void fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl);
75 static void fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list);
76 static void fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt);
77 static void fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop);
78 static void fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block);
79 static void fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr);
80 
81 static void fs_nir_emit_surface_atomic(nir_to_elk_state &ntb,
82                                        const fs_builder &bld,
83                                        nir_intrinsic_instr *instr,
84                                        elk_fs_reg surface,
85                                        bool bindless);
86 static void fs_nir_emit_global_atomic(nir_to_elk_state &ntb,
87                                       const fs_builder &bld,
88                                       nir_intrinsic_instr *instr);
89 
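/* Allocate contiguous VGRF storage for each output slot written by the
 * shader.  Tessellation control and fragment shader outputs are set up
 * elsewhere, so those stages return early.
 */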
90 static void
91 fs_nir_setup_outputs(nir_to_elk_state &ntb)
92 {
93    elk_fs_visitor &s = ntb.s;
94 
95    if (s.stage == MESA_SHADER_TESS_CTRL ||
96        s.stage == MESA_SHADER_FRAGMENT)
97       return;
98 
99    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
100 
101    /* Calculate the size of output registers in a separate pass, before
102     * allocating them.  With ARB_enhanced_layouts, multiple output variables
103     * may occupy the same slot, but have different type sizes.
104     */
105    nir_foreach_shader_out_variable(var, s.nir) {
106       const int loc = var->data.driver_location;
107       const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
108       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
109    }
110 
111    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
112       if (vec4s[loc] == 0) {
113          loc++;
114          continue;
115       }
116 
117       unsigned reg_size = vec4s[loc];
118 
119       /* Check if there are any ranges that start within this range and extend
120        * past it. If so, include them in this allocation.
121        */
122       for (unsigned i = 1; i < reg_size; i++) {
123          assert(i + loc < ARRAY_SIZE(vec4s));
124          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
125       }
126 
127       elk_fs_reg reg = ntb.bld.vgrf(ELK_REGISTER_TYPE_F, 4 * reg_size);
128       for (unsigned i = 0; i < reg_size; i++) {
129          assert(loc + i < ARRAY_SIZE(s.outputs));
130          s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
131       }
132 
133       loc += reg_size;
134    }
135 }
136 
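/* Record the number of push-constant uniforms for this shader.  Compute
 * stages additionally append the subgroup ID builtin after the regular
 * NIR uniforms.
 */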
137 static void
138 fs_nir_setup_uniforms(elk_fs_visitor &s)
139 {
140    /* Only the first compile gets to set up uniforms. */
141    if (s.push_constant_loc)
142       return;
143 
144    s.uniforms = s.nir->num_uniforms / 4;
145 
146    if (gl_shader_stage_is_compute(s.stage)) {
147       /* Add uniforms for builtins after regular NIR uniforms. */
148       assert(s.uniforms == s.prog_data->nr_params);
149 
150       /* Subgroup ID must be the last uniform on the list.  This will make
151        * it easier later to split between cross-thread and per-thread
152        * uniforms.
153        */
154       uint32_t *param = elk_stage_prog_data_add_params(s.prog_data, 1);
155       *param = ELK_PARAM_BUILTIN_SUBGROUP_ID;
156       s.uniforms++;
157    }
158 }
159 
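/* Assemble the 3-component workgroup ID from its fixed locations in the
 * compute thread payload: r0.1 (X), r0.6 (Y) and r0.7 (Z).
 */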
160 static elk_fs_reg
161 emit_work_group_id_setup(nir_to_elk_state &ntb)
162 {
163    elk_fs_visitor &s = ntb.s;
164    const fs_builder &bld = ntb.bld;
165 
166    assert(gl_shader_stage_is_compute(s.stage));
167 
168    elk_fs_reg id = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
169 
170    struct elk_reg r0_1(retype(elk_vec1_grf(0, 1), ELK_REGISTER_TYPE_UD));
171    bld.MOV(id, r0_1);
172 
173    struct elk_reg r0_6(retype(elk_vec1_grf(0, 6), ELK_REGISTER_TYPE_UD));
174    struct elk_reg r0_7(retype(elk_vec1_grf(0, 7), ELK_REGISTER_TYPE_UD));
175    bld.MOV(offset(id, bld, 1), r0_6);
176    bld.MOV(offset(id, bld, 2), r0_7);
177 
178    return id;
179 }
180 
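/* Scan one NIR block for intrinsics that read system values and set up
 * each value the first time it is needed, caching the result in
 * ntb.system_values[].
 */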
181 static bool
182 emit_system_values_block(nir_to_elk_state &ntb, nir_block *block)
183 {
184    elk_fs_visitor &s = ntb.s;
185    elk_fs_reg *reg;
186 
187    nir_foreach_instr(instr, block) {
188       if (instr->type != nir_instr_type_intrinsic)
189          continue;
190 
191       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
192       switch (intrin->intrinsic) {
193       case nir_intrinsic_load_vertex_id:
194       case nir_intrinsic_load_base_vertex:
195          unreachable("should be lowered by nir_lower_system_values().");
196 
197       case nir_intrinsic_load_vertex_id_zero_base:
198       case nir_intrinsic_load_is_indexed_draw:
199       case nir_intrinsic_load_first_vertex:
200       case nir_intrinsic_load_instance_id:
201       case nir_intrinsic_load_base_instance:
202          unreachable("should be lowered by elk_nir_lower_vs_inputs().");
203          break;
204 
205       case nir_intrinsic_load_draw_id:
206          unreachable("should be lowered by elk_nir_lower_vs_inputs().");
207          break;
208 
209       case nir_intrinsic_load_invocation_id:
210          if (s.stage == MESA_SHADER_TESS_CTRL)
211             break;
212          assert(s.stage == MESA_SHADER_GEOMETRY);
213          reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
214          if (reg->file == BAD_FILE) {
215             *reg = s.gs_payload().instance_id;
216          }
217          break;
218 
219       case nir_intrinsic_load_sample_pos:
220       case nir_intrinsic_load_sample_pos_or_center:
221          assert(s.stage == MESA_SHADER_FRAGMENT);
222          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
223          if (reg->file == BAD_FILE)
224             *reg = emit_samplepos_setup(ntb);
225          break;
226 
227       case nir_intrinsic_load_sample_id:
228          assert(s.stage == MESA_SHADER_FRAGMENT);
229          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
230          if (reg->file == BAD_FILE)
231             *reg = emit_sampleid_setup(ntb);
232          break;
233 
234       case nir_intrinsic_load_sample_mask_in:
235          assert(s.stage == MESA_SHADER_FRAGMENT);
236          assert(s.devinfo->ver >= 7);
237          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
238          if (reg->file == BAD_FILE)
239             *reg = emit_samplemaskin_setup(ntb);
240          break;
241 
242       case nir_intrinsic_load_workgroup_id:
243          assert(gl_shader_stage_is_compute(s.stage));
244          reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
245          if (reg->file == BAD_FILE)
246             *reg = emit_work_group_id_setup(ntb);
247          break;
248 
249       case nir_intrinsic_load_helper_invocation:
250          assert(s.stage == MESA_SHADER_FRAGMENT);
251          reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
252          if (reg->file == BAD_FILE) {
253             const fs_builder abld =
254                ntb.bld.annotate("gl_HelperInvocation", NULL);
255 
256             /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
257              * pixel mask is in g1.7 of the thread payload.
258              *
259              * We move the per-channel pixel enable bit to the low bit of each
260              * channel by shifting the byte containing the pixel mask by the
261              * vector immediate 0x76543210UV.
262              *
263              * The region of <1,8,0> reads only 1 byte (the pixel masks for
264              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
265              * masks for 2 and 3) in SIMD16.
266              */
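            /* For example, channel 3 shifts its mask byte right by 3 (from
             * the 0x76543210 vector immediate), leaving that channel's
             * pixel-enable bit in bit 0 of the result.
             */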
267             elk_fs_reg shifted = abld.vgrf(ELK_REGISTER_TYPE_UW, 1);
268 
269             for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
270                const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
271                /* According to the "PS Thread Payload for Normal
272                 * Dispatch" pages on the BSpec, the dispatch mask is
273                 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
274                 * gfx6+.
275                 */
276                const struct elk_reg reg = elk_vec1_grf(i + 1, 7);
277                hbld.SHR(offset(shifted, hbld, i),
278                         stride(retype(reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
279                         elk_imm_v(0x76543210));
280             }
281 
282             /* A set bit in the pixel mask means the channel is enabled, but
283              * that is the opposite of gl_HelperInvocation so we need to invert
284              * the mask.
285              *
286              * The negate source-modifier bit of logical instructions on Gfx8+
287              * performs 1's complement negation, so we can use that instead of
288              * a NOT instruction.
289              */
290             elk_fs_reg inverted = negate(shifted);
291             if (s.devinfo->ver < 8) {
292                inverted = abld.vgrf(ELK_REGISTER_TYPE_UW);
293                abld.NOT(inverted, shifted);
294             }
295 
296             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
297              * with 1 and negating.
298              */
299             elk_fs_reg anded = abld.vgrf(ELK_REGISTER_TYPE_UD, 1);
300             abld.AND(anded, inverted, elk_imm_uw(1));
301 
302             elk_fs_reg dst = abld.vgrf(ELK_REGISTER_TYPE_D, 1);
303             abld.MOV(dst, negate(retype(anded, ELK_REGISTER_TYPE_D)));
304             *reg = dst;
305          }
306          break;
307 
308       default:
309          break;
310       }
311    }
312 
313    return true;
314 }
315 
316 static void
317 fs_nir_emit_system_values(nir_to_elk_state &ntb)
318 {
319    const fs_builder &bld = ntb.bld;
320    elk_fs_visitor &s = ntb.s;
321 
322    ntb.system_values = ralloc_array(ntb.mem_ctx, elk_fs_reg, SYSTEM_VALUE_MAX);
323    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
324       ntb.system_values[i] = elk_fs_reg();
325    }
326 
327    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
328     * never end up using it.
329     */
330    {
331       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
332       elk_fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
333       reg = abld.vgrf(ELK_REGISTER_TYPE_UW);
334       abld.UNDEF(reg);
335 
336       const fs_builder allbld8 = abld.group(8, 0).exec_all();
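      /* The vector immediate 0x76543210 gives lanes 0..7 their own index;
       * the exec_all ADDs below extend that to 8..15 and 16..31 for SIMD16
       * and SIMD32 dispatch.
       */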
337       allbld8.MOV(reg, elk_imm_v(0x76543210));
338       if (s.dispatch_width > 8)
339          allbld8.ADD(byte_offset(reg, 16), reg, elk_imm_uw(8u));
340       if (s.dispatch_width > 16) {
341          const fs_builder allbld16 = abld.group(16, 0).exec_all();
342          allbld16.ADD(byte_offset(reg, 32), reg, elk_imm_uw(16u));
343       }
344    }
345 
346    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
347    nir_foreach_block(block, impl)
348       emit_system_values_block(ntb, block);
349 }
350 
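/* Allocate the per-SSA-def bookkeeping arrays for this function
 * implementation and emit code for its body.
 */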
351 static void
352 fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl)
353 {
354    ntb.ssa_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
355    ntb.resource_insts = rzalloc_array(ntb.mem_ctx, elk_fs_inst *, impl->ssa_alloc);
356    ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct elk_fs_bind_info, impl->ssa_alloc);
357    ntb.resource_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
358 
359    fs_nir_emit_cf_list(ntb, &impl->body);
360 }
361 
362 static void
363 fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list)
364 {
365    exec_list_validate(list);
366    foreach_list_typed(nir_cf_node, node, node, list) {
367       switch (node->type) {
368       case nir_cf_node_if:
369          fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
370          break;
371 
372       case nir_cf_node_loop:
373          fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
374          break;
375 
376       case nir_cf_node_block:
377          fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
378          break;
379 
380       default:
381          unreachable("Invalid CFG node block");
382       }
383    }
384 }
385 
386 static void
387 fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt)
388 {
389    const intel_device_info *devinfo = ntb.devinfo;
390    const fs_builder &bld = ntb.bld;
391 
392    bool invert;
393    elk_fs_reg cond_reg;
394 
395    /* If the condition has the form !other_condition, use other_condition as
396     * the source, but invert the predicate on the if instruction.
397     */
398    nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
399    if (cond != NULL && cond->op == nir_op_inot) {
400       invert = true;
401       cond_reg = get_nir_src(ntb, cond->src[0].src);
402       cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
403 
404       if (devinfo->ver <= 5 &&
405 	  (cond->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
406          /* redo boolean resolve on gen5 */
407          elk_fs_reg masked = ntb.s.vgrf(glsl_int_type());
408          bld.AND(masked, cond_reg, elk_imm_d(1));
409          masked.negate = true;
410          elk_fs_reg tmp = bld.vgrf(cond_reg.type);
411          bld.MOV(retype(tmp, ELK_REGISTER_TYPE_D), masked);
412          cond_reg = tmp;
413       }
414    } else {
415       invert = false;
416       cond_reg = get_nir_src(ntb, if_stmt->condition);
417    }
418 
419    /* first, put the condition into f0 */
420    elk_fs_inst *inst = bld.MOV(bld.null_reg_d(),
421                            retype(cond_reg, ELK_REGISTER_TYPE_D));
422    inst->conditional_mod = ELK_CONDITIONAL_NZ;
423 
424    bld.IF(ELK_PREDICATE_NORMAL)->predicate_inverse = invert;
425 
426    fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
427 
428    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
429       bld.emit(ELK_OPCODE_ELSE);
430       fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
431    }
432 
433    bld.emit(ELK_OPCODE_ENDIF);
434 
435    if (devinfo->ver < 7)
436       ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
437                                    "in SIMD32 mode.");
438 }
439 
440 static void
441 fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop)
442 {
443    const intel_device_info *devinfo = ntb.devinfo;
444    const fs_builder &bld = ntb.bld;
445 
446    assert(!nir_loop_has_continue_construct(loop));
447    bld.emit(ELK_OPCODE_DO);
448 
449    fs_nir_emit_cf_list(ntb, &loop->body);
450 
451    bld.emit(ELK_OPCODE_WHILE);
452 
453    if (devinfo->ver < 7)
454       ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
455                                    "in SIMD32 mode.");
456 }
457 
458 static void
459 fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block)
460 {
461    fs_builder bld = ntb.bld;
462 
463    nir_foreach_instr(instr, block) {
464       fs_nir_emit_instr(ntb, instr);
465    }
466 
467    ntb.bld = bld;
468 }
469 
470 /**
471  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
472  * match instr.
473  */
474 static bool
475 optimize_extract_to_float(nir_to_elk_state &ntb, nir_alu_instr *instr,
476                           const elk_fs_reg &result)
477 {
478    const intel_device_info *devinfo = ntb.devinfo;
479    const fs_builder &bld = ntb.bld;
480 
481    /* No fast path for f16 or f64. */
482    assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
483 
484    if (!instr->src[0].src.ssa->parent_instr)
485       return false;
486 
487    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
488       return false;
489 
490    nir_alu_instr *src0 =
491       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
492 
493    unsigned bytes;
494    bool is_signed;
495 
496    switch (src0->op) {
497    case nir_op_extract_u8:
498    case nir_op_extract_u16:
499       bytes = src0->op == nir_op_extract_u8 ? 1 : 2;
500 
501       /* i2f(extract_u8(a, b)) and u2f(extract_u8(a, b)) produce the same
502        * result. Ditto for extract_u16.
503        */
504       is_signed = false;
505       break;
506 
507    case nir_op_extract_i8:
508    case nir_op_extract_i16:
509       bytes = src0->op == nir_op_extract_i8 ? 1 : 2;
510 
511       /* The fast path can't handle u2f(extract_i8(a, b)) because the implicit
512        * sign extension of the extract_i8 is lost. For example,
513        * u2f(extract_i8(0x0000ff00, 1)) should produce 4294967295.0, but a
514        * fast path could either give 255.0 (by implementing the fast path as
515        * u2f(extract_u8(x))) or -1.0 (by implementing the fast path as
516        * i2f(extract_i8(x))). At one point in time, we incorrectly implemented
517        * the former.
518        */
519       if (instr->op != nir_op_i2f32)
520          return false;
521 
522       is_signed = true;
523       break;
524 
525    default:
526       return false;
527    }
528 
529    unsigned element = nir_src_as_uint(src0->src[1].src);
530 
531    /* Element type to extract. */
532    const elk_reg_type type = elk_int_type(bytes, is_signed);
533 
534    elk_fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
535    op0.type = elk_type_for_nir_type(devinfo,
536       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
537                      nir_src_bit_size(src0->src[0].src)));
538    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
539 
540    bld.MOV(result, subscript(op0, type, element));
541    return true;
542 }
543 
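/* Recognize bcsel(gl_FrontFacing, +/-1.0, -/+1.0) and emit it as a short
 * bit-manipulation sequence on the front-facing bit in the thread payload.
 */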
544 static bool
545 optimize_frontfacing_ternary(nir_to_elk_state &ntb,
546                              nir_alu_instr *instr,
547                              const elk_fs_reg &result)
548 {
549    const intel_device_info *devinfo = ntb.devinfo;
550    elk_fs_visitor &s = ntb.s;
551 
552    nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
553    if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
554       return false;
555 
556    if (!nir_src_is_const(instr->src[1].src) ||
557        !nir_src_is_const(instr->src[2].src))
558       return false;
559 
560    const float value1 = nir_src_as_float(instr->src[1].src);
561    const float value2 = nir_src_as_float(instr->src[2].src);
562    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
563       return false;
564 
565    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
566    assert(value1 == -value2);
567 
568    elk_fs_reg tmp = s.vgrf(glsl_int_type());
569 
570    if (devinfo->ver >= 6) {
571       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
572       elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
573 
574       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
575        *
576        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
577        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
578        *
579        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
580        *
581        * This negation looks like it's safe in practice, because bits 0:4 will
582        * surely be TRIANGLES
583        */
584 
585       if (value1 == -1.0f) {
586          g0.negate = true;
587       }
588 
589       ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_W, 1),
590                   g0, elk_imm_uw(0x3f80));
591    } else {
592       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
593       elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
594 
595       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
596        *
597        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
598        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
599        *
600        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
601        *
602        * This negation looks like it's safe in practice, because bits 0:4 will
603        * surely be TRIANGLES.
604        */
605 
606       if (value1 == -1.0f) {
607          g1_6.negate = true;
608       }
609 
610       ntb.bld.OR(tmp, g1_6, elk_imm_d(0x3f800000));
611    }
612    ntb.bld.AND(retype(result, ELK_REGISTER_TYPE_D), tmp, elk_imm_d(0xbf800000));
613 
614    return true;
615 }
616 
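/* Rounding mode requested by an explicit-rounding conversion opcode, to be
 * programmed with ELK_SHADER_OPCODE_RND_MODE before the conversion.
 */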
617 static elk_rnd_mode
618 elk_rnd_mode_from_nir_op (const nir_op op) {
619    switch (op) {
620    case nir_op_f2f16_rtz:
621       return ELK_RND_MODE_RTZ;
622    case nir_op_f2f16_rtne:
623       return ELK_RND_MODE_RTNE;
624    default:
625       unreachable("Operation doesn't support rounding mode");
626    }
627 }
628 
629 static elk_rnd_mode
630 elk_rnd_mode_from_execution_mode(unsigned execution_mode)
631 {
632    if (nir_has_any_rounding_mode_rtne(execution_mode))
633       return ELK_RND_MODE_RTNE;
634    if (nir_has_any_rounding_mode_rtz(execution_mode))
635       return ELK_RND_MODE_RTZ;
636    return ELK_RND_MODE_UNSPECIFIED;
637 }
638 
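/* Resolve the destination (when need_dest is set) and every source of an
 * ALU instruction to ELK registers with the types the NIR opcode expects.
 * For scalarized opcodes the registers are also offset to the single
 * channel being written.
 */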
639 static elk_fs_reg
640 prepare_alu_destination_and_sources(nir_to_elk_state &ntb,
641                                     const fs_builder &bld,
642                                     nir_alu_instr *instr,
643                                     elk_fs_reg *op,
644                                     bool need_dest)
645 {
646    const intel_device_info *devinfo = ntb.devinfo;
647 
648    elk_fs_reg result =
649       need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
650 
651    result.type = elk_type_for_nir_type(devinfo,
652       (nir_alu_type)(nir_op_infos[instr->op].output_type |
653                      instr->def.bit_size));
654 
655    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
656       op[i] = get_nir_src(ntb, instr->src[i].src);
657       op[i].type = elk_type_for_nir_type(devinfo,
658          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
659                         nir_src_bit_size(instr->src[i].src)));
660    }
661 
662    /* Move and vecN instructions may still be vectored.  Return the raw,
663     * vectored source and destination so that elk_fs_visitor::nir_emit_alu can
664     * handle it.  Other callers should not have to handle these kinds of
665     * instructions.
666     */
667    switch (instr->op) {
668    case nir_op_mov:
669    case nir_op_vec2:
670    case nir_op_vec3:
671    case nir_op_vec4:
672    case nir_op_vec8:
673    case nir_op_vec16:
674       return result;
675    default:
676       break;
677    }
678 
679    /* At this point, we have dealt with any instruction that operates on
680     * more than a single channel.  Therefore, we can just adjust the source
681     * and destination registers for that channel and emit the instruction.
682     */
683    unsigned channel = 0;
684    if (nir_op_infos[instr->op].output_size == 0) {
685       /* Since NIR is doing the scalarizing for us, we should only ever see
686        * vectorized operations with a single channel.
687        */
688       nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
689       assert(util_bitcount(write_mask) == 1);
690       channel = ffs(write_mask) - 1;
691 
692       result = offset(result, bld, channel);
693    }
694 
695    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
696       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
697       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
698    }
699 
700    return result;
701 }
702 
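/* Strip abs/negate source modifiers by copying the value through a MOV,
 * so the caller can apply its own modifiers without stacking them.
 */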
703 static elk_fs_reg
704 resolve_source_modifiers(const fs_builder &bld, const elk_fs_reg &src)
705 {
706    if (!src.abs && !src.negate)
707       return src;
708 
709    elk_fs_reg temp = bld.vgrf(src.type);
710    bld.MOV(temp, src);
711 
712    return temp;
713 }
714 
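/* For a two-source logical operation, fold an inot feeding either source
 * into that source's negate modifier (1's complement on logical ops);
 * otherwise strip any existing modifiers.
 */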
715 static void
716 resolve_inot_sources(nir_to_elk_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
717                      elk_fs_reg *op)
718 {
719    for (unsigned i = 0; i < 2; i++) {
720       nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
721 
722       if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
723          /* The source of the inot is now the source of instr. */
724          prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
725 
726          assert(!op[i].negate);
727          op[i].negate = true;
728       } else {
729          op[i] = resolve_source_modifiers(bld, op[i]);
730       }
731    }
732 }
733 
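/* Try to emit b2f(inot(a)) or b2i(inot(a)) as a single ADD, using the fact
 * that a NIR boolean is 0 or -1.  Returns false if the pattern does not
 * match.
 */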
734 static bool
735 try_emit_b2fi_of_inot(nir_to_elk_state &ntb, const fs_builder &bld,
736                       elk_fs_reg result,
737                       nir_alu_instr *instr)
738 {
739    const intel_device_info *devinfo = bld.shader->devinfo;
740 
741    if (devinfo->ver < 6)
742       return false;
743 
744    nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
745 
746    if (inot_instr == NULL || inot_instr->op != nir_op_inot)
747       return false;
748 
749    /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
750     * of valid size-changing combinations is a bit more complex.
751     *
752     * The source restriction is just because I was lazy about generating the
753     * constant below.
754     */
755    if (instr->def.bit_size != 32 ||
756        nir_src_bit_size(inot_instr->src[0].src) != 32)
757       return false;
758 
759    /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
760     * this is float(1 + a).
761     */
762    elk_fs_reg op;
763 
764    prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
765 
766    /* Ignore the saturate modifier, if there is one.  The result of the
767     * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
768     */
769    bld.ADD(result, op, elk_imm_d(1));
770 
771    return true;
772 }
773 
774 /**
775  * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
776  *
777  * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
778  * the source of \c instr that is a \c nir_op_fsign.
779  */
780 static void
781 emit_fsign(nir_to_elk_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
782            elk_fs_reg result, elk_fs_reg *op, unsigned fsign_src)
783 {
784    const intel_device_info *devinfo = ntb.devinfo;
785 
786    elk_fs_inst *inst;
787 
788    assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
789    assert(fsign_src < nir_op_infos[instr->op].num_inputs);
790 
791    if (instr->op != nir_op_fsign) {
792       const nir_alu_instr *const fsign_instr =
793          nir_src_as_alu_instr(instr->src[fsign_src].src);
794 
795       /* op[fsign_src] has the nominal result of the fsign, and op[1 -
796        * fsign_src] has the other multiply source.  This must be rearranged so
797        * that op[0] is the source of the fsign and op[1] is the other multiply
798        * source.
799        */
800       if (fsign_src != 0)
801          op[1] = op[0];
802 
803       op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
804 
805       const nir_alu_type t =
806          (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
807                         nir_src_bit_size(fsign_instr->src[0].src));
808 
809       op[0].type = elk_type_for_nir_type(devinfo, t);
810 
811       unsigned channel = 0;
812       if (nir_op_infos[instr->op].output_size == 0) {
813          /* Since NIR is doing the scalarizing for us, we should only ever see
814           * vectorized operations with a single channel.
815           */
816          nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
817          assert(util_bitcount(write_mask) == 1);
818          channel = ffs(write_mask) - 1;
819       }
820 
821       op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
822    }
823 
824    if (type_sz(op[0].type) == 2) {
825       /* AND(val, 0x8000) gives the sign bit.
826        *
827        * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
828        */
829       elk_fs_reg zero = retype(elk_imm_uw(0), ELK_REGISTER_TYPE_HF);
830       bld.CMP(bld.null_reg_f(), op[0], zero, ELK_CONDITIONAL_NZ);
831 
832       op[0].type = ELK_REGISTER_TYPE_UW;
833       result.type = ELK_REGISTER_TYPE_UW;
834       bld.AND(result, op[0], elk_imm_uw(0x8000u));
835 
836       if (instr->op == nir_op_fsign)
837          inst = bld.OR(result, result, elk_imm_uw(0x3c00u));
838       else {
839          /* Use XOR here to get the result sign correct. */
840          inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UW));
841       }
842 
843       inst->predicate = ELK_PREDICATE_NORMAL;
844    } else if (type_sz(op[0].type) == 4) {
845       /* AND(val, 0x80000000) gives the sign bit.
846        *
847        * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
848        * zero.
849        */
850       bld.CMP(bld.null_reg_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ);
851 
852       op[0].type = ELK_REGISTER_TYPE_UD;
853       result.type = ELK_REGISTER_TYPE_UD;
854       bld.AND(result, op[0], elk_imm_ud(0x80000000u));
855 
856       if (instr->op == nir_op_fsign)
857          inst = bld.OR(result, result, elk_imm_ud(0x3f800000u));
858       else {
859          /* Use XOR here to get the result sign correct. */
860          inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UD));
861       }
862 
863       inst->predicate = ELK_PREDICATE_NORMAL;
864    } else {
865       unreachable("Should have been lowered by nir_opt_algebraic.");
866    }
867 }
868 
869 /**
870  * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
871  *
872  * Checks the operands of a \c nir_op_fmul to determine whether or not
873  * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
874  *
875  * \param instr  The multiplication instruction
876  *
877  * \param fsign_src The source of \c instr that may or may not be a
878  *                  \c nir_op_fsign
879  */
880 static bool
881 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
882 {
883    assert(instr->op == nir_op_fmul);
884 
885    nir_alu_instr *const fsign_instr =
886       nir_src_as_alu_instr(instr->src[fsign_src].src);
887 
888    /* Rules:
889     *
890     * 1. instr->src[fsign_src] must be a nir_op_fsign.
891     * 2. The nir_op_fsign can only be used by this multiplication.
892     * 3. The source that is the nir_op_fsign does not have source modifiers.
893     *    \c emit_fsign only examines the source modifiers of the source of the
894     *    \c nir_op_fsign.
895     *
896     * The nir_op_fsign must also not have the saturate modifier, but steps
897     * have already been taken (in nir_opt_algebraic) to ensure that.
898     */
899    return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
900           is_used_once(fsign_instr);
901 }
902 
903 static bool
904 is_const_zero(const nir_src &src)
905 {
906    return nir_src_is_const(src) && nir_src_as_int(src) == 0;
907 }
908 
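/* Emit ELK IR for a single NIR ALU instruction.  When need_dest is false
 * the result is not used and the destination resolves to the null register.
 */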
909 static void
910 fs_nir_emit_alu(nir_to_elk_state &ntb, nir_alu_instr *instr,
911                 bool need_dest)
912 {
913    const intel_device_info *devinfo = ntb.devinfo;
914    const fs_builder &bld = ntb.bld;
915    elk_fs_visitor &s = ntb.s;
916 
917    elk_fs_inst *inst;
918    unsigned execution_mode =
919       bld.shader->nir->info.float_controls_execution_mode;
920 
921    elk_fs_reg op[NIR_MAX_VEC_COMPONENTS];
922    elk_fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
923 
924 #ifndef NDEBUG
925    /* Everything except raw moves, some type conversions, iabs, and ineg
926     * should have 8-bit sources lowered by nir_lower_bit_size in
927     * elk_preprocess_nir or by elk_nir_lower_conversions in
928     * elk_postprocess_nir.
929     */
930    switch (instr->op) {
931    case nir_op_mov:
932    case nir_op_vec2:
933    case nir_op_vec3:
934    case nir_op_vec4:
935    case nir_op_vec8:
936    case nir_op_vec16:
937    case nir_op_i2f16:
938    case nir_op_i2f32:
939    case nir_op_i2i16:
940    case nir_op_i2i32:
941    case nir_op_u2f16:
942    case nir_op_u2f32:
943    case nir_op_u2u16:
944    case nir_op_u2u32:
945    case nir_op_iabs:
946    case nir_op_ineg:
947    case nir_op_pack_32_4x8_split:
948       break;
949 
950    default:
951       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
952          assert(type_sz(op[i].type) > 1);
953       }
954    }
955 #endif
956 
957    switch (instr->op) {
958    case nir_op_mov:
959    case nir_op_vec2:
960    case nir_op_vec3:
961    case nir_op_vec4:
962    case nir_op_vec8:
963    case nir_op_vec16: {
964       elk_fs_reg temp = result;
965       bool need_extra_copy = false;
966 
967       nir_intrinsic_instr *store_reg =
968          nir_store_reg_for_def(&instr->def);
969       if (store_reg != NULL) {
970          nir_def *dest_reg = store_reg->src[1].ssa;
971          for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
972             nir_intrinsic_instr *load_reg =
973                nir_load_reg_for_def(instr->src[i].src.ssa);
974             if (load_reg == NULL)
975                continue;
976 
977             if (load_reg->src[0].ssa == dest_reg) {
978                need_extra_copy = true;
979                temp = bld.vgrf(result.type, 4);
980                break;
981             }
982          }
983       }
984 
985       nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
986       unsigned last_bit = util_last_bit(write_mask);
987 
988       for (unsigned i = 0; i < last_bit; i++) {
989          if (!(write_mask & (1 << i)))
990             continue;
991 
992          if (instr->op == nir_op_mov) {
993             bld.MOV(offset(temp, bld, i),
994                            offset(op[0], bld, instr->src[0].swizzle[i]));
995          } else {
996             bld.MOV(offset(temp, bld, i),
997                            offset(op[i], bld, instr->src[i].swizzle[0]));
998          }
999       }
1000 
1001       /* In this case the source and destination registers were the same,
1002        * so we need to insert an extra set of moves in order to deal with
1003        * any swizzling.
1004        */
1005       if (need_extra_copy) {
1006          for (unsigned i = 0; i < last_bit; i++) {
1007             if (!(write_mask & (1 << i)))
1008                continue;
1009 
1010             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1011          }
1012       }
1013       return;
1014    }
1015 
1016    case nir_op_i2f32:
1017    case nir_op_u2f32:
1018       if (optimize_extract_to_float(ntb, instr, result))
1019          return;
1020       inst = bld.MOV(result, op[0]);
1021       break;
1022 
1023    case nir_op_f2f16_rtne:
1024    case nir_op_f2f16_rtz:
1025    case nir_op_f2f16: {
1026       elk_rnd_mode rnd = ELK_RND_MODE_UNSPECIFIED;
1027 
1028       if (nir_op_f2f16 == instr->op)
1029          rnd = elk_rnd_mode_from_execution_mode(execution_mode);
1030       else
1031          rnd = elk_rnd_mode_from_nir_op(instr->op);
1032 
1033       if (ELK_RND_MODE_UNSPECIFIED != rnd)
1034          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), elk_imm_d(rnd));
1035 
1036       assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1037       inst = bld.F32TO16(result, op[0]);
1038       break;
1039    }
1040 
1041    case nir_op_b2i8:
1042    case nir_op_b2i16:
1043    case nir_op_b2i32:
1044    case nir_op_b2i64:
1045    case nir_op_b2f16:
1046    case nir_op_b2f32:
1047    case nir_op_b2f64:
1048       if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1049          break;
1050       op[0].type = ELK_REGISTER_TYPE_D;
1051       op[0].negate = !op[0].negate;
1052       FALLTHROUGH;
1053    case nir_op_i2f64:
1054    case nir_op_i2i64:
1055    case nir_op_u2f64:
1056    case nir_op_u2u64:
1057    case nir_op_f2f64:
1058    case nir_op_f2i64:
1059    case nir_op_f2u64:
1060    case nir_op_i2i32:
1061    case nir_op_u2u32:
1062    case nir_op_f2i32:
1063    case nir_op_f2u32:
1064    case nir_op_i2f16:
1065    case nir_op_u2f16:
1066    case nir_op_f2i16:
1067    case nir_op_f2u16:
1068    case nir_op_f2i8:
1069    case nir_op_f2u8:
1070       if (result.type == ELK_REGISTER_TYPE_B ||
1071           result.type == ELK_REGISTER_TYPE_UB ||
1072           result.type == ELK_REGISTER_TYPE_HF)
1073          assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1074 
1075       if (op[0].type == ELK_REGISTER_TYPE_B ||
1076           op[0].type == ELK_REGISTER_TYPE_UB ||
1077           op[0].type == ELK_REGISTER_TYPE_HF)
1078          assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1079 
1080       inst = bld.MOV(result, op[0]);
1081       break;
1082 
1083    case nir_op_i2i8:
1084    case nir_op_u2u8:
1085       assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1086       FALLTHROUGH;
1087    case nir_op_i2i16:
1088    case nir_op_u2u16: {
1089       /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1090        * Emitting the instructions one by one results in two MOV instructions
1091        * that won't be propagated.  By handling both instructions here, a
1092        * single MOV is emitted.
1093        */
1094       nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1095       if (extract_instr != NULL) {
1096          if (extract_instr->op == nir_op_extract_u8 ||
1097              extract_instr->op == nir_op_extract_i8) {
1098             prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1099 
1100             const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1101             const elk_reg_type type =
1102                elk_int_type(1, extract_instr->op == nir_op_extract_i8);
1103 
1104             op[0] = subscript(op[0], type, byte);
1105          } else if (extract_instr->op == nir_op_extract_u16 ||
1106                     extract_instr->op == nir_op_extract_i16) {
1107             prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1108 
1109             const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1110             const elk_reg_type type =
1111                elk_int_type(2, extract_instr->op == nir_op_extract_i16);
1112 
1113             op[0] = subscript(op[0], type, word);
1114          }
1115       }
1116 
1117       inst = bld.MOV(result, op[0]);
1118       break;
1119    }
1120 
1121    case nir_op_fsat:
1122       inst = bld.MOV(result, op[0]);
1123       inst->saturate = true;
1124       break;
1125 
1126    case nir_op_fneg:
1127    case nir_op_ineg:
1128       op[0].negate = true;
1129       inst = bld.MOV(result, op[0]);
1130       break;
1131 
1132    case nir_op_fabs:
1133    case nir_op_iabs:
1134       op[0].negate = false;
1135       op[0].abs = true;
1136       inst = bld.MOV(result, op[0]);
1137       break;
1138 
1139    case nir_op_f2f32:
1140       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1141          elk_rnd_mode rnd =
1142             elk_rnd_mode_from_execution_mode(execution_mode);
1143          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1144                              elk_imm_d(rnd));
1145       }
1146 
1147       if (op[0].type == ELK_REGISTER_TYPE_HF)
1148          assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1149 
1150       inst = bld.MOV(result, op[0]);
1151       break;
1152 
1153    case nir_op_fsign:
1154       emit_fsign(ntb, bld, instr, result, op, 0);
1155       break;
1156 
1157    case nir_op_frcp:
1158       inst = bld.emit(ELK_SHADER_OPCODE_RCP, result, op[0]);
1159       break;
1160 
1161    case nir_op_fexp2:
1162       inst = bld.emit(ELK_SHADER_OPCODE_EXP2, result, op[0]);
1163       break;
1164 
1165    case nir_op_flog2:
1166       inst = bld.emit(ELK_SHADER_OPCODE_LOG2, result, op[0]);
1167       break;
1168 
1169    case nir_op_fsin:
1170       inst = bld.emit(ELK_SHADER_OPCODE_SIN, result, op[0]);
1171       break;
1172 
1173    case nir_op_fcos:
1174       inst = bld.emit(ELK_SHADER_OPCODE_COS, result, op[0]);
1175       break;
1176 
1177    case nir_op_fadd:
1178       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1179          elk_rnd_mode rnd =
1180             elk_rnd_mode_from_execution_mode(execution_mode);
1181          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1182                              elk_imm_d(rnd));
1183       }
1184       FALLTHROUGH;
1185    case nir_op_iadd:
1186       inst = bld.ADD(result, op[0], op[1]);
1187       break;
1188 
1189    case nir_op_iadd_sat:
1190    case nir_op_uadd_sat:
1191       inst = bld.ADD(result, op[0], op[1]);
1192       inst->saturate = true;
1193       break;
1194 
1195    case nir_op_isub_sat:
1196       bld.emit(ELK_SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1197       break;
1198 
1199    case nir_op_usub_sat:
1200       bld.emit(ELK_SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1201       break;
1202 
1203    case nir_op_irhadd:
1204    case nir_op_urhadd:
1205       assert(instr->def.bit_size < 64);
1206       inst = bld.AVG(result, op[0], op[1]);
1207       break;
1208 
1209    case nir_op_ihadd:
1210    case nir_op_uhadd: {
1211       assert(instr->def.bit_size < 64);
1212       elk_fs_reg tmp = bld.vgrf(result.type);
1213 
1214       if (devinfo->ver >= 8) {
1215          op[0] = resolve_source_modifiers(bld, op[0]);
1216          op[1] = resolve_source_modifiers(bld, op[1]);
1217       }
1218 
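      /* The AVG instruction computes the rounded average (x + y + 1) >> 1;
       * subtracting ((x ^ y) & 1) cancels the rounding bit and yields the
       * truncating halving add these opcodes require.
       */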
1219       /* AVG(x, y) - ((x ^ y) & 1) */
1220       bld.XOR(tmp, op[0], op[1]);
1221       bld.AND(tmp, tmp, retype(elk_imm_ud(1), result.type));
1222       bld.AVG(result, op[0], op[1]);
1223       inst = bld.ADD(result, result, tmp);
1224       inst->src[1].negate = true;
1225       break;
1226    }
1227 
1228    case nir_op_fmul:
1229       for (unsigned i = 0; i < 2; i++) {
1230          if (can_fuse_fmul_fsign(instr, i)) {
1231             emit_fsign(ntb, bld, instr, result, op, i);
1232             return;
1233          }
1234       }
1235 
1236       /* We emit the rounding mode after the previous fsign optimization since
1237        * it won't result in a MUL, but will try to negate the value by other
1238        * means.
1239        */
1240       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1241          elk_rnd_mode rnd =
1242             elk_rnd_mode_from_execution_mode(execution_mode);
1243          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1244                              elk_imm_d(rnd));
1245       }
1246 
1247       inst = bld.MUL(result, op[0], op[1]);
1248       break;
1249 
1250    case nir_op_imul_2x32_64:
1251    case nir_op_umul_2x32_64:
1252       bld.MUL(result, op[0], op[1]);
1253       break;
1254 
1255    case nir_op_imul_32x16:
1256    case nir_op_umul_32x16: {
1257       const bool ud = instr->op == nir_op_umul_32x16;
1258       const enum elk_reg_type word_type =
1259          ud ? ELK_REGISTER_TYPE_UW : ELK_REGISTER_TYPE_W;
1260       const enum elk_reg_type dword_type =
1261          ud ? ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_D;
1262 
1263       assert(instr->def.bit_size == 32);
1264 
1265       /* Before copy propagation there are no immediate values. */
1266       assert(op[0].file != IMM && op[1].file != IMM);
1267 
1268       op[1] = subscript(op[1], word_type, 0);
1269 
1270       if (devinfo->ver >= 7)
1271          bld.MUL(result, retype(op[0], dword_type), op[1]);
1272       else
1273          bld.MUL(result, op[1], retype(op[0], dword_type));
1274 
1275       break;
1276    }
1277 
1278    case nir_op_imul:
1279       assert(instr->def.bit_size < 64);
1280       bld.MUL(result, op[0], op[1]);
1281       break;
1282 
1283    case nir_op_imul_high:
1284    case nir_op_umul_high:
1285       assert(instr->def.bit_size < 64);
1286       if (instr->def.bit_size == 32) {
1287          bld.emit(ELK_SHADER_OPCODE_MULH, result, op[0], op[1]);
1288       } else {
1289          elk_fs_reg tmp = bld.vgrf(elk_reg_type_from_bit_size(32, op[0].type));
1290          bld.MUL(tmp, op[0], op[1]);
1291          bld.MOV(result, subscript(tmp, result.type, 1));
1292       }
1293       break;
1294 
1295    case nir_op_idiv:
1296    case nir_op_udiv:
1297       assert(instr->def.bit_size < 64);
1298       bld.emit(ELK_SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1299       break;
1300 
1301    case nir_op_uadd_carry:
1302       unreachable("Should have been lowered by carry_to_arith().");
1303 
1304    case nir_op_usub_borrow:
1305       unreachable("Should have been lowered by borrow_to_arith().");
1306 
1307    case nir_op_umod:
1308    case nir_op_irem:
1309       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1310        * appears that our hardware just does the right thing for signed
1311        * remainder.
1312        */
1313       assert(instr->def.bit_size < 64);
1314       bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1315       break;
1316 
1317    case nir_op_imod: {
1318       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1319       bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1320 
1321       /* Math instructions don't support conditional mod */
1322       inst = bld.MOV(bld.null_reg_d(), result);
1323       inst->conditional_mod = ELK_CONDITIONAL_NZ;
1324 
1325       /* Now, we need to determine if signs of the sources are different.
1326        * When we XOR the sources, the top bit is 0 if they are the same and 1
1327        * if they are different.  We can then use a conditional modifier to
1328        * turn that into a predicate.  This leads us to an XOR.l instruction.
1329        *
1330        * Technically, according to the PRM, you're not allowed to use .l on a
1331        * XOR instruction.  However, empirical experiments and Curro's reading
1332        * of the simulator source both indicate that it's safe.
1333        */
1334       elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_D);
1335       inst = bld.XOR(tmp, op[0], op[1]);
1336       inst->predicate = ELK_PREDICATE_NORMAL;
1337       inst->conditional_mod = ELK_CONDITIONAL_L;
1338 
1339       /* If the result of the initial remainder operation is non-zero and the
1340        * two sources have different signs, add in a copy of op[1] to get the
1341        * final integer modulus value.
1342        */
1343       inst = bld.ADD(result, result, op[1]);
1344       inst->predicate = ELK_PREDICATE_NORMAL;
1345       break;
1346    }
1347 
1348    case nir_op_flt32:
1349    case nir_op_fge32:
1350    case nir_op_feq32:
1351    case nir_op_fneu32: {
1352       elk_fs_reg dest = result;
1353 
1354       const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1355       if (bit_size != 32) {
1356          dest = bld.vgrf(op[0].type, 1);
1357          bld.UNDEF(dest);
1358       }
1359 
1360       bld.CMP(dest, op[0], op[1], elk_cmod_for_nir_comparison(instr->op));
1361 
1362       if (bit_size > 32) {
1363          bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1364       } else if (bit_size < 32) {
1365          /* When we convert the result to 32-bit we need to be careful and do
1366           * it as a signed conversion to get sign extension (for 32-bit true)
1367           */
1368          const elk_reg_type src_type =
1369             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1370 
1371          bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1372       }
1373       break;
1374    }
1375 
1376    case nir_op_ilt32:
1377    case nir_op_ult32:
1378    case nir_op_ige32:
1379    case nir_op_uge32:
1380    case nir_op_ieq32:
1381    case nir_op_ine32: {
1382       elk_fs_reg dest = result;
1383 
1384       const uint32_t bit_size = type_sz(op[0].type) * 8;
1385       if (bit_size != 32) {
1386          dest = bld.vgrf(op[0].type, 1);
1387          bld.UNDEF(dest);
1388       }
1389 
1390       bld.CMP(dest, op[0], op[1],
1391               elk_cmod_for_nir_comparison(instr->op));
1392 
1393       if (bit_size > 32) {
1394          bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1395       } else if (bit_size < 32) {
1396          /* When we convert the result to 32-bit we need to be careful and do
1397           * it as a signed conversion to get sign extension (for 32-bit true)
1398           */
1399          const elk_reg_type src_type =
1400             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1401 
1402          bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1403       }
1404       break;
1405    }
1406 
1407    case nir_op_inot:
1408       if (devinfo->ver >= 8) {
1409          nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1410 
1411          if (inot_src_instr != NULL &&
1412              (inot_src_instr->op == nir_op_ior ||
1413               inot_src_instr->op == nir_op_ixor ||
1414               inot_src_instr->op == nir_op_iand)) {
1415             /* The sources of the source logical instruction are now the
1416              * sources of the instruction that will be generated.
1417              */
1418             prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
1419             resolve_inot_sources(ntb, bld, inot_src_instr, op);
1420 
1421             /* Smash all of the sources and destination to be signed.  This
1422              * doesn't matter for the operation of the instruction, but cmod
1423              * propagation fails on unsigned sources with negation (due to
1424              * elk_fs_inst::can_do_cmod returning false).
1425              */
1426             result.type =
1427                elk_type_for_nir_type(devinfo,
1428                                      (nir_alu_type)(nir_type_int |
1429                                                     instr->def.bit_size));
1430             op[0].type =
1431                elk_type_for_nir_type(devinfo,
1432                                      (nir_alu_type)(nir_type_int |
1433                                                     nir_src_bit_size(inot_src_instr->src[0].src)));
1434             op[1].type =
1435                elk_type_for_nir_type(devinfo,
1436                                      (nir_alu_type)(nir_type_int |
1437                                                     nir_src_bit_size(inot_src_instr->src[1].src)));
1438 
1439             /* For XOR, only invert one of the sources.  Arbitrarily choose
1440              * the first source.
1441              */
1442             op[0].negate = !op[0].negate;
1443             if (inot_src_instr->op != nir_op_ixor)
1444                op[1].negate = !op[1].negate;
1445 
1446             switch (inot_src_instr->op) {
1447             case nir_op_ior:
1448                bld.AND(result, op[0], op[1]);
1449                return;
1450 
1451             case nir_op_iand:
1452                bld.OR(result, op[0], op[1]);
1453                return;
1454 
1455             case nir_op_ixor:
1456                bld.XOR(result, op[0], op[1]);
1457                return;
1458 
1459             default:
1460                unreachable("impossible opcode");
1461             }
1462          }
1463          op[0] = resolve_source_modifiers(bld, op[0]);
1464       }
1465       bld.NOT(result, op[0]);
1466       break;
1467    case nir_op_ixor:
1468       if (devinfo->ver >= 8) {
1469          resolve_inot_sources(ntb, bld, instr, op);
1470       }
1471       bld.XOR(result, op[0], op[1]);
1472       break;
1473    case nir_op_ior:
1474       if (devinfo->ver >= 8) {
1475          resolve_inot_sources(ntb, bld, instr, op);
1476       }
1477       bld.OR(result, op[0], op[1]);
1478       break;
1479    case nir_op_iand:
1480       if (devinfo->ver >= 8) {
1481          resolve_inot_sources(ntb, bld, instr, op);
1482       }
1483       bld.AND(result, op[0], op[1]);
1484       break;
1485 
1486    case nir_op_fdot2:
1487    case nir_op_fdot3:
1488    case nir_op_fdot4:
1489    case nir_op_b32all_fequal2:
1490    case nir_op_b32all_iequal2:
1491    case nir_op_b32all_fequal3:
1492    case nir_op_b32all_iequal3:
1493    case nir_op_b32all_fequal4:
1494    case nir_op_b32all_iequal4:
1495    case nir_op_b32any_fnequal2:
1496    case nir_op_b32any_inequal2:
1497    case nir_op_b32any_fnequal3:
1498    case nir_op_b32any_inequal3:
1499    case nir_op_b32any_fnequal4:
1500    case nir_op_b32any_inequal4:
1501       unreachable("Lowered by nir_lower_alu_reductions");
1502 
1503    case nir_op_ldexp:
1504       unreachable("not reached: should be handled by ldexp_to_arith()");
1505 
1506    case nir_op_fsqrt:
1507       inst = bld.emit(ELK_SHADER_OPCODE_SQRT, result, op[0]);
1508       break;
1509 
1510    case nir_op_frsq:
1511       inst = bld.emit(ELK_SHADER_OPCODE_RSQ, result, op[0]);
1512       break;
1513 
1514    case nir_op_ftrunc:
1515       inst = bld.RNDZ(result, op[0]);
1516       if (devinfo->ver < 6) {
1517          set_condmod(ELK_CONDITIONAL_R, inst);
1518          set_predicate(ELK_PREDICATE_NORMAL,
1519                        bld.ADD(result, result, elk_imm_f(1.0f)));
1520          inst = bld.MOV(result, result); /* for potential saturation */
1521       }
1522       break;
1523 
1524    case nir_op_fceil: {
1525       op[0].negate = !op[0].negate;
1526       elk_fs_reg temp = s.vgrf(glsl_float_type());
1527       bld.RNDD(temp, op[0]);
1528       temp.negate = true;
1529       inst = bld.MOV(result, temp);
1530       break;
1531    }
1532    case nir_op_ffloor:
1533       inst = bld.RNDD(result, op[0]);
1534       break;
1535    case nir_op_ffract:
1536       inst = bld.FRC(result, op[0]);
1537       break;
1538    case nir_op_fround_even:
1539       inst = bld.RNDE(result, op[0]);
1540       if (devinfo->ver < 6) {
1541          set_condmod(ELK_CONDITIONAL_R, inst);
1542          set_predicate(ELK_PREDICATE_NORMAL,
1543                        bld.ADD(result, result, elk_imm_f(1.0f)));
1544          inst = bld.MOV(result, result); /* for potential saturation */
1545       }
1546       break;
1547 
1548    case nir_op_fquantize2f16: {
1549       elk_fs_reg tmp16 = bld.vgrf(ELK_REGISTER_TYPE_D);
1550       elk_fs_reg tmp32 = bld.vgrf(ELK_REGISTER_TYPE_F);
1551       elk_fs_reg zero = bld.vgrf(ELK_REGISTER_TYPE_F);
1552 
1553       /* The destination stride must be at least as big as the source stride. */
1554       tmp16 = subscript(tmp16, ELK_REGISTER_TYPE_HF, 0);
1555 
1556       /* Check for denormal */
1557       elk_fs_reg abs_src0 = op[0];
1558       abs_src0.abs = true;
1559       bld.CMP(bld.null_reg_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1560               ELK_CONDITIONAL_L);
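           /* Illustrative values: ldexpf(1.0, -14) == 2^-14, the smallest
            * normal half-precision magnitude.  An input like 3.0e-5f compares
            * below that threshold, so the predicated SEL at the end of this
            * sequence keeps the sign-preserving zero; 0.5f is exactly
            * representable in F16 and survives the F32->F16->F32 round trip
            * unchanged.
            */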
1561       /* Get the appropriately signed zero */
1562       bld.AND(retype(zero, ELK_REGISTER_TYPE_UD),
1563               retype(op[0], ELK_REGISTER_TYPE_UD),
1564               elk_imm_ud(0x80000000));
1565       /* Do the actual F32 -> F16 -> F32 conversion */
1566       bld.F32TO16(tmp16, op[0]);
1567       bld.F16TO32(tmp32, tmp16);
1568       /* Select that or zero based on normal status */
1569       inst = bld.SEL(result, zero, tmp32);
1570       inst->predicate = ELK_PREDICATE_NORMAL;
1571       break;
1572    }
1573 
1574    case nir_op_imin:
1575    case nir_op_umin:
1576    case nir_op_fmin:
1577       inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_L);
1578       break;
1579 
1580    case nir_op_imax:
1581    case nir_op_umax:
1582    case nir_op_fmax:
1583       inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_GE);
1584       break;
1585 
1586    case nir_op_pack_snorm_2x16:
1587    case nir_op_pack_snorm_4x8:
1588    case nir_op_pack_unorm_2x16:
1589    case nir_op_pack_unorm_4x8:
1590    case nir_op_unpack_snorm_2x16:
1591    case nir_op_unpack_snorm_4x8:
1592    case nir_op_unpack_unorm_2x16:
1593    case nir_op_unpack_unorm_4x8:
1594    case nir_op_unpack_half_2x16:
1595    case nir_op_pack_half_2x16:
1596       unreachable("not reached: should be handled by lower_packing_builtins");
1597 
1598    case nir_op_unpack_half_2x16_split_x:
1599       inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 0));
1600       break;
1601 
1602    case nir_op_unpack_half_2x16_split_y:
1603       inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 1));
1604       break;
1605 
1606    case nir_op_pack_64_2x32_split:
1607    case nir_op_pack_32_2x16_split:
1608       bld.emit(ELK_FS_OPCODE_PACK, result, op[0], op[1]);
1609       break;
1610 
1611    case nir_op_pack_32_4x8_split:
1612       bld.emit(ELK_FS_OPCODE_PACK, result, op, 4);
1613       break;
1614 
1615    case nir_op_unpack_64_2x32_split_x:
1616    case nir_op_unpack_64_2x32_split_y: {
1617       if (instr->op == nir_op_unpack_64_2x32_split_x)
1618          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 0));
1619       else
1620          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 1));
1621       break;
1622    }
1623 
1624    case nir_op_unpack_32_2x16_split_x:
1625    case nir_op_unpack_32_2x16_split_y: {
1626       if (instr->op == nir_op_unpack_32_2x16_split_x)
1627          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 0));
1628       else
1629          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 1));
1630       break;
1631    }
1632 
1633    case nir_op_fpow:
1634       inst = bld.emit(ELK_SHADER_OPCODE_POW, result, op[0], op[1]);
1635       break;
1636 
1637    case nir_op_bitfield_reverse:
1638       assert(instr->def.bit_size == 32);
1639       assert(nir_src_bit_size(instr->src[0].src) == 32);
1640       bld.BFREV(result, op[0]);
1641       break;
1642 
1643    case nir_op_bit_count:
1644       assert(instr->def.bit_size == 32);
1645       assert(nir_src_bit_size(instr->src[0].src) < 64);
1646       bld.CBIT(result, op[0]);
1647       break;
1648 
1649    case nir_op_uclz:
1650       assert(instr->def.bit_size == 32);
1651       assert(nir_src_bit_size(instr->src[0].src) == 32);
1652       bld.LZD(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1653       break;
1654 
1655    case nir_op_ifind_msb: {
1656       assert(instr->def.bit_size == 32);
1657       assert(nir_src_bit_size(instr->src[0].src) == 32);
1658       assert(devinfo->ver >= 7);
1659 
1660       bld.FBH(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1661 
1662       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1663        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1664        * subtract the result from 31 to convert the MSB count into an LSB
1665        * count.
1666        */
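           /* Illustrative example: for src0 == 0x00000010, FBH returns 27
            * (counting down from bit 31), and 31 - 27 = 4 is the LSB-based
            * index that findMSB() expects.  A source with no suitable bit
            * yields 0xFFFFFFFF, which fails the CMP below, so the predicated
            * ADD leaves the -1 result untouched.
            */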
1667       bld.CMP(bld.null_reg_d(), result, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1668 
1669       inst = bld.ADD(result, result, elk_imm_d(31));
1670       inst->predicate = ELK_PREDICATE_NORMAL;
1671       inst->src[0].negate = true;
1672       break;
1673    }
1674 
1675    case nir_op_find_lsb:
1676       assert(instr->def.bit_size == 32);
1677       assert(nir_src_bit_size(instr->src[0].src) == 32);
1678       assert(devinfo->ver >= 7);
1679       bld.FBL(result, op[0]);
1680       break;
1681 
1682    case nir_op_ubitfield_extract:
1683    case nir_op_ibitfield_extract:
1684       unreachable("should have been lowered");
1685    case nir_op_ubfe:
1686    case nir_op_ibfe:
1687       assert(instr->def.bit_size < 64);
1688       bld.BFE(result, op[2], op[1], op[0]);
1689       break;
1690    case nir_op_bfm:
1691       assert(instr->def.bit_size < 64);
1692       bld.BFI1(result, op[0], op[1]);
1693       break;
1694    case nir_op_bfi:
1695       assert(instr->def.bit_size < 64);
1696 
1697       /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1698        * either 0 or src0. Replacing the 0 with another value can eliminate a
1699        * temporary register.
1700        */
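           /* Illustrative identity: when src2 is the constant 0,
            * (~src0 & 0) == 0 == (~src0 & src0), so BFI2(result, op[0],
            * op[1], op[0]) below computes the same value without
            * materializing a zero register.
            */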
1701       if (is_const_zero(instr->src[2].src))
1702          bld.BFI2(result, op[0], op[1], op[0]);
1703       else
1704          bld.BFI2(result, op[0], op[1], op[2]);
1705 
1706       break;
1707 
1708    case nir_op_bitfield_insert:
1709       unreachable("not reached: should have been lowered");
1710 
1711    /* With regards to implicit masking of the shift counts for 8- and 16-bit
1712     * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1713     * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
1714     * src0) are used. The Bspec (backed by data from experimentation) states
1715     * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1716     * types.
1717     *
1718     * To match the behavior expected for the NIR opcodes, explicit masks for
1719     * 8- and 16-bit types must be added.
1720     */
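        /* Illustrative example: for a 16-bit ishl with a shift count of 18,
         * NIR semantics call for a count of 18 & 15 == 2, i.e. src0 << 2.
         * The hardware's implicit 0x1f mask alone would leave the count at
         * 18, so the explicit AND with (bit_size - 1) below is what restores
         * the NIR behavior.
         */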
1721    case nir_op_ishl:
1722       if (instr->def.bit_size < 32) {
1723          bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1724          bld.SHL(result, op[0], result);
1725       } else {
1726          bld.SHL(result, op[0], op[1]);
1727       }
1728 
1729       break;
1730    case nir_op_ishr:
1731       if (instr->def.bit_size < 32) {
1732          bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1733          bld.ASR(result, op[0], result);
1734       } else {
1735          bld.ASR(result, op[0], op[1]);
1736       }
1737 
1738       break;
1739    case nir_op_ushr:
1740       if (instr->def.bit_size < 32) {
1741          bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1742          bld.SHR(result, op[0], result);
1743       } else {
1744          bld.SHR(result, op[0], op[1]);
1745       }
1746 
1747       break;
1748 
1749    case nir_op_pack_half_2x16_split:
1750       bld.emit(ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1751       break;
1752 
1753    case nir_op_ffma:
1754       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1755          elk_rnd_mode rnd =
1756             elk_rnd_mode_from_execution_mode(execution_mode);
1757          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1758                              elk_imm_d(rnd));
1759       }
1760 
1761       inst = bld.MAD(result, op[2], op[1], op[0]);
1762       break;
1763 
1764    case nir_op_flrp:
1765       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1766          elk_rnd_mode rnd =
1767             elk_rnd_mode_from_execution_mode(execution_mode);
1768          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1769                              elk_imm_d(rnd));
1770       }
1771 
1772       inst = bld.LRP(result, op[0], op[1], op[2]);
1773       break;
1774 
1775    case nir_op_b32csel:
1776       if (optimize_frontfacing_ternary(ntb, instr, result))
1777          return;
1778 
1779       bld.CMP(bld.null_reg_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ);
1780       inst = bld.SEL(result, op[1], op[2]);
1781       inst->predicate = ELK_PREDICATE_NORMAL;
1782       break;
1783 
1784    case nir_op_extract_u8:
1785    case nir_op_extract_i8: {
1786       unsigned byte = nir_src_as_uint(instr->src[1].src);
1787 
1788       /* The PRMs say:
1789        *
1790        *    BDW+
1791        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1792        *    Use two instructions and a word or DWord intermediate integer type.
1793        */
1794       if (instr->def.bit_size == 64) {
1795          const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1796 
1797          if (instr->op == nir_op_extract_i8) {
1798             /* If we need to sign extend, extract to a word first */
1799             elk_fs_reg w_temp = bld.vgrf(ELK_REGISTER_TYPE_W);
1800             bld.MOV(w_temp, subscript(op[0], type, byte));
1801             bld.MOV(result, w_temp);
1802          } else if (byte & 1) {
1803             /* Extract the high byte from the word containing the desired byte
1804              * offset.
1805              */
1806             bld.SHR(result,
1807                     subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1808                     elk_imm_uw(8));
1809          } else {
1810             /* Otherwise use an AND with 0xff and a word type */
1811             bld.AND(result,
1812                     subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1813                     elk_imm_uw(0xff));
1814          }
1815       } else {
1816          const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1817          bld.MOV(result, subscript(op[0], type, byte));
1818       }
1819       break;
1820    }
1821 
1822    case nir_op_extract_u16:
1823    case nir_op_extract_i16: {
1824       const elk_reg_type type = elk_int_type(2, instr->op == nir_op_extract_i16);
1825       unsigned word = nir_src_as_uint(instr->src[1].src);
1826       bld.MOV(result, subscript(op[0], type, word));
1827       break;
1828    }
1829 
1830    default:
1831       unreachable("unhandled instruction");
1832    }
1833 
1834    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1835     * to sign extend the low bit to 0/~0
1836     */
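        /* Illustrative example: a boolean register holding 3 resolves to
         * -(3 & 1) == ~0 (true), while 2 resolves to -(2 & 1) == 0 (false),
         * matching the 0/~0 convention described above.
         */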
1837    if (devinfo->ver <= 5 &&
1838        !result.is_null() &&
1839        (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1840       elk_fs_reg masked = s.vgrf(glsl_int_type());
1841       bld.AND(masked, result, elk_imm_d(1));
1842       masked.negate = true;
1843       bld.MOV(retype(result, ELK_REGISTER_TYPE_D), masked);
1844    }
1845 }
1846 
1847 static void
1848 fs_nir_emit_load_const(nir_to_elk_state &ntb,
1849                        nir_load_const_instr *instr)
1850 {
1851    const intel_device_info *devinfo = ntb.devinfo;
1852    const fs_builder &bld = ntb.bld;
1853 
1854    const elk_reg_type reg_type =
1855       elk_reg_type_from_bit_size(instr->def.bit_size, ELK_REGISTER_TYPE_D);
1856    elk_fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1857 
1858    switch (instr->def.bit_size) {
1859    case 8:
1860       for (unsigned i = 0; i < instr->def.num_components; i++)
1861          bld.MOV(offset(reg, bld, i), elk_setup_imm_b(bld, instr->value[i].i8));
1862       break;
1863 
1864    case 16:
1865       for (unsigned i = 0; i < instr->def.num_components; i++)
1866          bld.MOV(offset(reg, bld, i), elk_imm_w(instr->value[i].i16));
1867       break;
1868 
1869    case 32:
1870       for (unsigned i = 0; i < instr->def.num_components; i++)
1871          bld.MOV(offset(reg, bld, i), elk_imm_d(instr->value[i].i32));
1872       break;
1873 
1874    case 64:
1875       assert(devinfo->ver >= 7);
1876       if (!devinfo->has_64bit_int) {
1877          for (unsigned i = 0; i < instr->def.num_components; i++) {
1878             bld.MOV(retype(offset(reg, bld, i), ELK_REGISTER_TYPE_DF),
1879                     elk_setup_imm_df(bld, instr->value[i].f64));
1880          }
1881       } else {
1882          for (unsigned i = 0; i < instr->def.num_components; i++)
1883             bld.MOV(offset(reg, bld, i), elk_imm_q(instr->value[i].i64));
1884       }
1885       break;
1886 
1887    default:
1888       unreachable("Invalid bit size");
1889    }
1890 
1891    ntb.ssa_values[instr->def.index] = reg;
1892 }
1893 
1894 static bool
1895 get_nir_src_bindless(nir_to_elk_state &ntb, const nir_src &src)
1896 {
1897    return ntb.ssa_bind_infos[src.ssa->index].bindless;
1898 }
1899 
1900 static bool
1901 is_resource_src(nir_src src)
1902 {
1903    return src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
1904           nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel;
1905 }
1906 
1907 static elk_fs_reg
1908 get_resource_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1909 {
1910    if (!is_resource_src(src))
1911       return elk_fs_reg();
1912    return ntb.resource_values[src.ssa->index];
1913 }
1914 
1915 static elk_fs_reg
1916 get_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1917 {
1918    const intel_device_info *devinfo = ntb.devinfo;
1919 
1920    nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
1921 
1922    elk_fs_reg reg;
1923    if (!load_reg) {
1924       if (nir_src_is_undef(src)) {
1925          const elk_reg_type reg_type =
1926             elk_reg_type_from_bit_size(src.ssa->bit_size,
1927                                        ELK_REGISTER_TYPE_D);
1928          reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
1929       } else {
1930          reg = ntb.ssa_values[src.ssa->index];
1931       }
1932    } else {
1933       nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
1934       /* We don't handle indirects on locals */
1935       assert(nir_intrinsic_base(load_reg) == 0);
1936       assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
1937       reg = ntb.ssa_values[decl_reg->def.index];
1938    }
1939 
1940    if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
1941       /* The only 64-bit type available on gfx7 is DF, so use that. */
1942       reg.type = ELK_REGISTER_TYPE_DF;
1943    } else {
1944       /* To avoid floating-point denorm flushing problems, set the type by
1945        * default to an integer type - instructions that need floating point
1946        * semantics will set this to F if they need to
1947        */
1948       reg.type = elk_reg_type_from_bit_size(nir_src_bit_size(src),
1949                                             ELK_REGISTER_TYPE_D);
1950    }
1951 
1952    return reg;
1953 }
1954 
1955 /**
1956  * Return an IMM for constants; otherwise call get_nir_src() as normal.
1957  *
1958  * This function should not be called on any value which may be 64 bits.
1959  * We could theoretically support 64-bit on gfx8+ but we choose not to
1960  * because it wouldn't work in general (no gfx7 support) and there are
1961  * enough restrictions in 64-bit immediates that you can't take the return
1962  * value and treat it the same as the result of get_nir_src().
1963  */
1964 static elk_fs_reg
1965 get_nir_src_imm(nir_to_elk_state &ntb, const nir_src &src)
1966 {
1967    assert(nir_src_bit_size(src) == 32);
1968    return nir_src_is_const(src) ?
1969           elk_fs_reg(elk_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
1970 }
1971 
1972 static elk_fs_reg
1973 get_nir_def(nir_to_elk_state &ntb, const nir_def &def)
1974 {
1975    const fs_builder &bld = ntb.bld;
1976 
1977    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
1978    if (!store_reg) {
1979       const elk_reg_type reg_type =
1980          elk_reg_type_from_bit_size(def.bit_size,
1981                                     def.bit_size == 8 ?
1982                                     ELK_REGISTER_TYPE_D :
1983                                     ELK_REGISTER_TYPE_F);
1984       ntb.ssa_values[def.index] =
1985          bld.vgrf(reg_type, def.num_components);
1986       bld.UNDEF(ntb.ssa_values[def.index]);
1987       return ntb.ssa_values[def.index];
1988    } else {
1989       nir_intrinsic_instr *decl_reg =
1990          nir_reg_get_decl(store_reg->src[1].ssa);
1991       /* We don't handle indirects on locals */
1992       assert(nir_intrinsic_base(store_reg) == 0);
1993       assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
1994       return ntb.ssa_values[decl_reg->def.index];
1995    }
1996 }
1997 
1998 static nir_component_mask_t
1999 get_nir_write_mask(const nir_def &def)
2000 {
2001    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2002    if (!store_reg) {
2003       return nir_component_mask(def.num_components);
2004    } else {
2005       return nir_intrinsic_write_mask(store_reg);
2006    }
2007 }
2008 
2009 static elk_fs_inst *
2010 emit_pixel_interpolater_send(const fs_builder &bld,
2011                              enum elk_opcode opcode,
2012                              const elk_fs_reg &dst,
2013                              const elk_fs_reg &src,
2014                              const elk_fs_reg &desc,
2015                              const elk_fs_reg &flag_reg,
2016                              glsl_interp_mode interpolation)
2017 {
2018    struct elk_wm_prog_data *wm_prog_data =
2019       elk_wm_prog_data(bld.shader->stage_prog_data);
2020 
2021    elk_fs_reg srcs[INTERP_NUM_SRCS];
2022    srcs[INTERP_SRC_OFFSET]       = src;
2023    srcs[INTERP_SRC_MSG_DESC]     = desc;
2024    srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2025 
2026    elk_fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2027    /* 2 floats per slot returned */
2028    inst->size_written = 2 * dst.component_size(inst->exec_size);
2029    if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2030       inst->pi_noperspective = true;
2031       /* TGL BSpec says:
2032        *     This field cannot be set to "Linear Interpolation"
2033        *     unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
2034        */
2035       wm_prog_data->uses_nonperspective_interp_modes = true;
2036    }
2037 
2038    wm_prog_data->pulls_bary = true;
2039 
2040    return inst;
2041 }
2042 
2043 /**
2044  * Computes 1 << x, given a D/UD register containing some value x.
2045  */
2046 static elk_fs_reg
2047 intexp2(const fs_builder &bld, const elk_fs_reg &x)
2048 {
2049    assert(x.type == ELK_REGISTER_TYPE_UD || x.type == ELK_REGISTER_TYPE_D);
2050 
2051    elk_fs_reg result = bld.vgrf(x.type, 1);
2052    elk_fs_reg one = bld.vgrf(x.type, 1);
2053 
2054    bld.MOV(one, retype(elk_imm_d(1), one.type));
2055    bld.SHL(result, one, x);
2056    return result;
2057 }
2058 
2059 static void
2060 emit_gs_end_primitive(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src)
2061 {
2062    elk_fs_visitor &s = ntb.s;
2063    assert(s.stage == MESA_SHADER_GEOMETRY);
2064 
2065    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2066 
2067    if (s.gs_compile->control_data_header_size_bits == 0)
2068       return;
2069 
2070    /* We can only do EndPrimitive() functionality when the control data
2071     * consists of cut bits.  Fortunately, the only time it isn't is when the
2072     * output type is points, in which case EndPrimitive() is a no-op.
2073     */
2074    if (gs_prog_data->control_data_format !=
2075        GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2076       return;
2077    }
2078 
2079    /* Cut bits use one bit per vertex. */
2080    assert(s.gs_compile->control_data_bits_per_vertex == 1);
2081 
2082    elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2083    vertex_count.type = ELK_REGISTER_TYPE_UD;
2084 
2085    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2086     * vertex n, 0 otherwise.  So all we need to do here is mark bit
2087     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2088     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2089     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2090     *
2091     * Note that if EndPrimitive() is called before emitting any vertices, this
2092     * will cause us to set bit 31 of the control_data_bits register to 1.
2093     * That's fine because:
2094     *
2095     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2096     *   output, so the hardware will ignore cut bit 31.
2097     *
2098     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2099     *   last vertex, so setting cut bit 31 has no effect (since the primitive
2100     *   is automatically ended when the GS terminates).
2101     *
2102     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2103     *   control_data_bits register to 0 when the first vertex is emitted.
2104     */
2105 
2106    const fs_builder abld = ntb.bld.annotate("end primitive");
2107 
2108    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2109    elk_fs_reg prev_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2110    abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2111    elk_fs_reg mask = intexp2(abld, prev_count);
2112    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2113     * attention to the lower 5 bits of its second source argument, so on this
2114     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2115     * ((vertex_count - 1) % 32).
2116     */
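     /* Illustrative example: if EndPrimitive() is called after emitting
      * vertex 35 (so vertex_count == 35), prev_count is 34 and the SHL's
      * implicit 5-bit mask reduces it to 34 % 32 == 2, so the mask is
      * 1 << 2 -- the cut bit for that vertex within the current 32-bit batch.
      */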
2117    abld.OR(s.control_data_bits, s.control_data_bits, mask);
2118 }
2119 
2120 void
2121 elk_fs_visitor::emit_gs_control_data_bits(const elk_fs_reg &vertex_count)
2122 {
2123    assert(stage == MESA_SHADER_GEOMETRY);
2124    assert(gs_compile->control_data_bits_per_vertex != 0);
2125 
2126    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
2127 
2128    const fs_builder bld = fs_builder(this).at_end();
2129    const fs_builder abld = bld.annotate("emit control data bits");
2130    const fs_builder fwa_bld = bld.exec_all();
2131 
2132    /* We use a single UD register to accumulate control data bits (32 bits
2133     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
2134     * at a time.
2135     *
2136     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2137     * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2138     * use the Channel Mask phase to enable/disable which DWord within that
2139     * group to write.  (Remember, different SIMD8 channels may have emitted
2140     * different numbers of vertices, so we may need per-slot offsets.)
2141     *
2142     * Channel masking presents an annoying problem: we may have to replicate
2143     * the data up to 4 times:
2144     *
2145     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2146     *
2147     * To avoid penalizing shaders that emit a small number of vertices, we
2148     * can avoid these sometimes: if the size of the control data header is
2149     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
2150     * land in the same 128-bit group, so we can skip per-slot offsets.
2151     *
2152     * Similarly, if the control data header is <= 32 bits, there is only one
2153     * DWord, so we can skip channel masks.
2154     */
2155    elk_fs_reg channel_mask, per_slot_offset;
2156 
2157    if (gs_compile->control_data_header_size_bits > 32)
2158       channel_mask = vgrf(glsl_uint_type());
2159 
2160    if (gs_compile->control_data_header_size_bits > 128)
2161       per_slot_offset = vgrf(glsl_uint_type());
2162 
2163    /* Figure out which DWord we're trying to write to using the formula:
2164     *
2165     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
2166     *
2167     * Since bits_per_vertex is a power of two, and is known at compile
2168     * time, this can be optimized to:
2169     *
2170     *    dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
2171     */
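     /* Worked example: with bits_per_vertex == 2 and vertex_count == 40,
      * dword_index == 39 * 2 / 32 == 2.  The code below computes the same
      * value using util_last_bit(2) == 2, i.e. a shift of 6 - 2 == 4:
      * 39 >> 4 == 2.  The per-slot offset is then 2 / 4 == 0 and the channel
      * mask bit is 1 << (2 % 4) == 4, before being moved into bits 23:16.
      */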
2172    if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2173       elk_fs_reg dword_index = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2174       elk_fs_reg prev_count = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2175       abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2176       unsigned log2_bits_per_vertex =
2177          util_last_bit(gs_compile->control_data_bits_per_vertex);
2178       abld.SHR(dword_index, prev_count, elk_imm_ud(6u - log2_bits_per_vertex));
2179 
2180       if (per_slot_offset.file != BAD_FILE) {
2181          /* Set the per-slot offset to dword_index / 4, so that we'll write to
2182           * the appropriate OWord within the control data header.
2183           */
2184          abld.SHR(per_slot_offset, dword_index, elk_imm_ud(2u));
2185       }
2186 
2187       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2188        * write to the appropriate DWORD within the OWORD.
2189        */
2190       elk_fs_reg channel = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2191       fwa_bld.AND(channel, dword_index, elk_imm_ud(3u));
2192       channel_mask = intexp2(fwa_bld, channel);
2193       /* Then the channel masks need to be in bits 23:16. */
2194       fwa_bld.SHL(channel_mask, channel_mask, elk_imm_ud(16u));
2195    }
2196 
2197    /* If there are channel masks, add 3 extra copies of the data. */
2198    const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2199    elk_fs_reg sources[4];
2200 
2201    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
2202       sources[i] = this->control_data_bits;
2203 
2204    elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2205    srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2206    srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2207    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2208    srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, length);
2209    srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
2210    abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2211 
2212    elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2213                              srcs, ARRAY_SIZE(srcs));
2214 
2215    /* We need to increment Global Offset by 256-bits to make room for
2216     * Broadwell's extra "Vertex Count" payload at the beginning of the
2217     * URB entry.  Since this is an OWord message, Global Offset is counted
2218     * in 128-bit units, so we must set it to 2.
2219     */
2220    if (gs_prog_data->static_vertex_count == -1)
2221       inst->offset = 2;
2222 }
2223 
2224 static void
2225 set_gs_stream_control_data_bits(nir_to_elk_state &ntb, const elk_fs_reg &vertex_count,
2226                                 unsigned stream_id)
2227 {
2228    elk_fs_visitor &s = ntb.s;
2229 
2230    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2231 
2232    /* Note: we are calling this *before* increasing vertex_count, so
2233     * this->vertex_count == vertex_count - 1 in the formula above.
2234     */
2235 
2236    /* Stream mode uses 2 bits per vertex */
2237    assert(s.gs_compile->control_data_bits_per_vertex == 2);
2238 
2239    /* Must be a valid stream */
2240    assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2241 
2242    /* Control data bits are initialized to 0 so we don't have to set any
2243     * bits when sending vertices to stream 0.
2244     */
2245    if (stream_id == 0)
2246       return;
2247 
2248    const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL);
2249 
2250    /* reg::sid = stream_id */
2251    elk_fs_reg sid = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2252    abld.MOV(sid, elk_imm_ud(stream_id));
2253 
2254    /* reg:shift_count = 2 * (vertex_count - 1) */
2255    elk_fs_reg shift_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2256    abld.SHL(shift_count, vertex_count, elk_imm_ud(1u));
2257 
2258    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2259     * attention to the lower 5 bits of its second source argument, so on this
2260     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2261     * stream_id << ((2 * (vertex_count - 1)) % 32).
2262     */
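     /* Illustrative example: with stream_id == 2 and a shift count of 4, the
      * mask is 2 << 4 == 0x20, which places the stream ID in that vertex's
      * two-bit field of control_data_bits.
      */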
2263    elk_fs_reg mask = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2264    abld.SHL(mask, sid, shift_count);
2265    abld.OR(s.control_data_bits, s.control_data_bits, mask);
2266 }
2267 
2268 static void
2269 emit_gs_vertex(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src,
2270                unsigned stream_id)
2271 {
2272    elk_fs_visitor &s = ntb.s;
2273 
2274    assert(s.stage == MESA_SHADER_GEOMETRY);
2275 
2276    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2277 
2278    elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2279    vertex_count.type = ELK_REGISTER_TYPE_UD;
2280 
2281    /* Haswell and later hardware ignores the "Render Stream Select" bits
2282     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2283     * and instead sends all primitives down the pipeline for rasterization.
2284     * If the SOL stage is enabled, "Render Stream Select" is honored and
2285     * primitives bound to non-zero streams are discarded after stream output.
2286     *
2287     * Since the only purpose of primitives sent to non-zero streams is to
2288     * be recorded by transform feedback, we can simply discard all geometry
2289     * bound to these streams when transform feedback is disabled.
2290     */
2291    if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2292       return;
2293 
2294    /* If we're outputting 32 control data bits or less, then we can wait
2295     * until the shader is over to output them all.  Otherwise we need to
2296     * output them as we go.  Now is the time to do it, since we're about to
2297     * output the vertex_count'th vertex, so it's guaranteed that the
2298     * control data bits associated with the (vertex_count - 1)th vertex are
2299     * correct.
2300     */
2301    if (s.gs_compile->control_data_header_size_bits > 32) {
2302       const fs_builder abld =
2303          ntb.bld.annotate("emit vertex: emit control data bits");
2304 
2305       /* Only emit control data bits if we've finished accumulating a batch
2306        * of 32 bits.  This is the case when:
2307        *
2308        *     (vertex_count * bits_per_vertex) % 32 == 0
2309        *
2310        * (in other words, when the last 5 bits of vertex_count *
2311        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2312        * integer n (which is always the case, since bits_per_vertex is
2313        * always 1 or 2), this is equivalent to requiring that the last 5-n
2314        * bits of vertex_count are 0:
2315        *
2316        *     vertex_count & (2^(5-n) - 1) == 0
2317        *
2318        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2319        * equivalent to:
2320        *
2321        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2322        *
2323        * TODO: If vertex_count is an immediate, we could do some of this math
2324        *       at compile time...
2325        */
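        /* Worked example: with bits_per_vertex == 2 the immediate below is
         * 32 / 2 - 1 == 15, so the accumulated bits are flushed whenever
         * vertex_count is a multiple of 16, i.e. once per full 32-bit batch.
         */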
2326       elk_fs_inst *inst =
2327          abld.AND(ntb.bld.null_reg_d(), vertex_count,
2328                   elk_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2329       inst->conditional_mod = ELK_CONDITIONAL_Z;
2330 
2331       abld.IF(ELK_PREDICATE_NORMAL);
2332       /* If vertex_count is 0, then no control data bits have been
2333        * accumulated yet, so we can skip emitting them.
2334        */
2335       abld.CMP(ntb.bld.null_reg_d(), vertex_count, elk_imm_ud(0u),
2336                ELK_CONDITIONAL_NEQ);
2337       abld.IF(ELK_PREDICATE_NORMAL);
2338       s.emit_gs_control_data_bits(vertex_count);
2339       abld.emit(ELK_OPCODE_ENDIF);
2340 
2341       /* Reset control_data_bits to 0 so we can start accumulating a new
2342        * batch.
2343        *
2344        * Note: in the case where vertex_count == 0, this neutralizes the
2345        * effect of any call to EndPrimitive() that the shader may have
2346        * made before outputting its first vertex.
2347        */
2348       inst = abld.MOV(s.control_data_bits, elk_imm_ud(0u));
2349       inst->force_writemask_all = true;
2350       abld.emit(ELK_OPCODE_ENDIF);
2351    }
2352 
2353    s.emit_urb_writes(vertex_count);
2354 
2355    /* In stream mode we have to set control data bits for all vertices
2356     * unless we have disabled control data bits completely (which we do
2357     * for MESA_PRIM_POINTS outputs that don't use streams).
2358     */
2359    if (s.gs_compile->control_data_header_size_bits > 0 &&
2360        gs_prog_data->control_data_format ==
2361           GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2362       set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2363    }
2364 }
2365 
2366 static void
2367 emit_gs_input_load(nir_to_elk_state &ntb, const elk_fs_reg &dst,
2368                    const nir_src &vertex_src,
2369                    unsigned base_offset,
2370                    const nir_src &offset_src,
2371                    unsigned num_components,
2372                    unsigned first_component)
2373 {
2374    const fs_builder &bld = ntb.bld;
2375    elk_fs_visitor &s = ntb.s;
2376 
2377    assert(type_sz(dst.type) == 4);
2378    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2379    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2380 
2381    /* TODO: figure out push input layout for invocations == 1 */
2382    if (gs_prog_data->invocations == 1 &&
2383        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2384        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2385       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2386                        nir_src_as_uint(vertex_src) * push_reg_count;
2387       const elk_fs_reg attr = elk_fs_reg(ATTR, 0, dst.type);
2388       for (unsigned i = 0; i < num_components; i++) {
2389          ntb.bld.MOV(offset(dst, bld, i),
2390                      offset(attr, bld, imm_offset + i + first_component));
2391       }
2392       return;
2393    }
2394 
2395    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2396    assert(gs_prog_data->base.include_vue_handles);
2397 
2398    elk_fs_reg start = s.gs_payload().icp_handle_start;
2399    elk_fs_reg icp_handle = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2400 
2401    if (gs_prog_data->invocations == 1) {
2402       if (nir_src_is_const(vertex_src)) {
2403          /* The vertex index is constant; just select the proper URB handle. */
2404          icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src));
2405       } else {
2406          /* The vertex index is non-constant.  We need to use indirect
2407           * addressing to fetch the proper URB handle.
2408           *
2409           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2410           * indicating that channel <n> should read the handle from
2411           * DWord <n>.  We convert that to bytes by multiplying by 4.
2412           *
2413           * Next, we convert the vertex index to bytes by multiplying
2414           * by 32 (shifting by 5), and add the two together.  This is
2415           * the final indirect byte offset.
2416           */
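              /* Illustrative example: channel 5 reading vertex index 2
               * computes a channel offset of 5 * 4 == 20 bytes and a vertex
               * offset of 2 * 32 == 64 bytes, so the MOV_INDIRECT below
               * fetches its handle from byte 84 of the ICP handle block.
               */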
2417          elk_fs_reg sequence =
2418             ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2419          elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2420          elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2421          elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2422 
2423          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2424          bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2425          /* Convert vertex_index to bytes (multiply by 32) */
2426          bld.SHL(vertex_offset_bytes,
2427                  retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2428                  elk_imm_ud(5u));
2429          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2430 
2431          /* Use first_icp_handle as the base offset.  There is one register
2432           * of URB handles per vertex, so inform the register allocator that
2433           * we might read up to nir->info.gs.vertices_in registers.
2434           */
2435          bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2436                   elk_fs_reg(icp_offset_bytes),
2437                   elk_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE));
2438       }
2439    } else {
2440       assert(gs_prog_data->invocations > 1);
2441 
2442       if (nir_src_is_const(vertex_src)) {
2443          unsigned vertex = nir_src_as_uint(vertex_src);
2444          assert(vertex <= 5);
2445          bld.MOV(icp_handle, component(start, vertex));
2446       } else {
2447          /* The vertex index is non-constant.  We need to use indirect
2448           * addressing to fetch the proper URB handle.
2449           *
2450           */
2451          elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2452 
2453          /* Convert vertex_index to bytes (multiply by 4) */
2454          bld.SHL(icp_offset_bytes,
2455                  retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2456                  elk_imm_ud(2u));
2457 
2458          /* Use first_icp_handle as the base offset.  There is one DWord
2459           * of URB handles per vertex, so inform the register allocator that
2460           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2461           */
2462          bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2463                   elk_fs_reg(icp_offset_bytes),
2464                   elk_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2465                              REG_SIZE));
2466       }
2467    }
2468 
2469    elk_fs_inst *inst;
2470    elk_fs_reg indirect_offset = get_nir_src(ntb, offset_src);
2471 
2472    if (nir_src_is_const(offset_src)) {
2473       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2474       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2475 
2476       /* Constant indexing - use global offset. */
2477       if (first_component != 0) {
2478          unsigned read_components = num_components + first_component;
2479          elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2480          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2481                          ARRAY_SIZE(srcs));
2482          inst->size_written = read_components *
2483                               tmp.component_size(inst->exec_size);
2484          for (unsigned i = 0; i < num_components; i++) {
2485             bld.MOV(offset(dst, bld, i),
2486                     offset(tmp, bld, i + first_component));
2487          }
2488       } else {
2489          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2490                          ARRAY_SIZE(srcs));
2491          inst->size_written = num_components *
2492                               dst.component_size(inst->exec_size);
2493       }
2494       inst->offset = base_offset + nir_src_as_uint(offset_src);
2495    } else {
2496       /* Indirect indexing - use per-slot offsets as well. */
2497       unsigned read_components = num_components + first_component;
2498       elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2499 
2500       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2501       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2502       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2503 
2504       if (first_component != 0) {
2505          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2506                          srcs, ARRAY_SIZE(srcs));
2507          inst->size_written = read_components *
2508                               tmp.component_size(inst->exec_size);
2509          for (unsigned i = 0; i < num_components; i++) {
2510             bld.MOV(offset(dst, bld, i),
2511                     offset(tmp, bld, i + first_component));
2512          }
2513       } else {
2514          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2515                          srcs, ARRAY_SIZE(srcs));
2516          inst->size_written = num_components *
2517                               dst.component_size(inst->exec_size);
2518       }
2519       inst->offset = base_offset;
2520    }
2521 }
2522 
2523 static elk_fs_reg
2524 get_indirect_offset(nir_to_elk_state &ntb, nir_intrinsic_instr *instr)
2525 {
2526    nir_src *offset_src = nir_get_io_offset_src(instr);
2527 
2528    if (nir_src_is_const(*offset_src)) {
2529       /* The only constant offset we should find is 0.  elk_nir.c's
2530        * add_const_offset_to_base() will fold other constant offsets
2531        * into the "base" index.
2532        */
2533       assert(nir_src_as_uint(*offset_src) == 0);
2534       return elk_fs_reg();
2535    }
2536 
2537    return get_nir_src(ntb, *offset_src);
2538 }
2539 
2540 static void
2541 fs_nir_emit_vs_intrinsic(nir_to_elk_state &ntb,
2542                          nir_intrinsic_instr *instr)
2543 {
2544    const fs_builder &bld = ntb.bld;
2545    elk_fs_visitor &s = ntb.s;
2546    assert(s.stage == MESA_SHADER_VERTEX);
2547 
2548    elk_fs_reg dest;
2549    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2550       dest = get_nir_def(ntb, instr->def);
2551 
2552    switch (instr->intrinsic) {
2553    case nir_intrinsic_load_vertex_id:
2554    case nir_intrinsic_load_base_vertex:
2555       unreachable("should be lowered by nir_lower_system_values()");
2556 
2557    case nir_intrinsic_load_input: {
2558       assert(instr->def.bit_size == 32);
2559       const elk_fs_reg src = offset(elk_fs_reg(ATTR, 0, dest.type), bld,
2560                                 nir_intrinsic_base(instr) * 4 +
2561                                 nir_intrinsic_component(instr) +
2562                                 nir_src_as_uint(instr->src[0]));
2563 
2564       for (unsigned i = 0; i < instr->num_components; i++)
2565          bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2566       break;
2567    }
2568 
2569    case nir_intrinsic_load_vertex_id_zero_base:
2570    case nir_intrinsic_load_instance_id:
2571    case nir_intrinsic_load_base_instance:
2572    case nir_intrinsic_load_draw_id:
2573    case nir_intrinsic_load_first_vertex:
2574    case nir_intrinsic_load_is_indexed_draw:
2575       unreachable("lowered by elk_nir_lower_vs_inputs");
2576 
2577    default:
2578       fs_nir_emit_intrinsic(ntb, bld, instr);
2579       break;
2580    }
2581 }
2582 
2583 static elk_fs_reg
2584 get_tcs_single_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2585                                 nir_intrinsic_instr *instr)
2586 {
2587    elk_fs_visitor &s = ntb.s;
2588 
2589    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2590    const nir_src &vertex_src = instr->src[0];
2591    nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2592 
2593    const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2594 
2595    elk_fs_reg icp_handle;
2596 
2597    if (nir_src_is_const(vertex_src)) {
2598       /* Emit a MOV to resolve <0,1,0> regioning. */
2599       icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2600       unsigned vertex = nir_src_as_uint(vertex_src);
2601       bld.MOV(icp_handle, component(start, vertex));
2602    } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2603               vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2604       /* For the common case of only 1 instance, an array index of
2605        * gl_InvocationID means reading the handles from the start.  Skip all
2606        * the indirect work.
2607        */
2608       icp_handle = start;
2609    } else {
2610       /* The vertex index is non-constant.  We need to use indirect
2611        * addressing to fetch the proper URB handle.
2612        */
2613       icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2614 
2615       /* Each ICP handle is a single DWord (4 bytes) */
2616       elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2617       bld.SHL(vertex_offset_bytes,
2618               retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2619               elk_imm_ud(2u));
2620 
2621       /* We might read up to 4 registers. */
2622       bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2623                start, vertex_offset_bytes,
2624                elk_imm_ud(4 * REG_SIZE));
2625    }
2626 
2627    return icp_handle;
2628 }
2629 
2630 static elk_fs_reg
2631 get_tcs_multi_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2632                                nir_intrinsic_instr *instr)
2633 {
2634    elk_fs_visitor &s = ntb.s;
2635    const intel_device_info *devinfo = s.devinfo;
2636 
2637    struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) s.key;
2638    const nir_src &vertex_src = instr->src[0];
2639    const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2640 
2641    const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2642 
2643    if (nir_src_is_const(vertex_src))
2644       return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2645 
2646    /* The vertex index is non-constant.  We need to use indirect
2647     * addressing to fetch the proper URB handle.
2648     *
2649     * First, we start with the sequence indicating that channel <n>
2650     * should read the handle from DWord <n>.  We convert that to bytes
2651     * by multiplying by 4.
2652     *
2653     * Next, we convert the vertex index to bytes by multiplying
2654     * by the GRF size (by shifting), and add the two together.  This is
2655     * the final indirect byte offset.
2656     */
2657    elk_fs_reg icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2658    elk_fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2659    elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2660    elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2661    elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2662 
2663    /* Offsets will be 0, 4, 8, ... */
2664    bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2665    /* Convert vertex_index to bytes (multiply by the GRF size) */
2666    assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2667    bld.SHL(vertex_offset_bytes,
2668            retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2669            elk_imm_ud(ffs(grf_size_bytes) - 1));
2670    bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2671 
2672    /* Use start of ICP handles as the base offset.  There is one register
2673     * of URB handles per vertex, so inform the register allocator that
2674     * we might read up to nir->info.gs.vertices_in registers.
2675     */
2676    bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2677             icp_offset_bytes,
2678             elk_imm_ud(elk_tcs_prog_key_input_vertices(tcs_key) *
2679                        grf_size_bytes));
2680 
2681    return icp_handle;
2682 }
2683 
2684 static void
2685 emit_barrier(nir_to_elk_state &ntb)
2686 {
2687    const intel_device_info *devinfo = ntb.devinfo;
2688    const fs_builder &bld = ntb.bld;
2689    elk_fs_visitor &s = ntb.s;
2690 
2691    /* We are getting the barrier ID from the compute shader header */
2692    assert(gl_shader_stage_uses_workgroup(s.stage));
2693 
2694    elk_fs_reg payload = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
2695 
2696    /* Clear the message payload */
2697    bld.exec_all().group(8, 0).MOV(payload, elk_imm_ud(0u));
2698 
2699    assert(gl_shader_stage_is_compute(s.stage));
2700 
2701    uint32_t barrier_id_mask;
2702    switch (devinfo->ver) {
2703    case 7:
2704    case 8:
2705       barrier_id_mask = 0x0f000000u; break;
2706    default:
2707       unreachable("barrier is only available on gen >= 7");
2708    }
2709 
2710    /* Copy the barrier id from r0.2 to the message payload reg.2 */
2711    elk_fs_reg r0_2 = elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD));
2712    bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
2713                                   elk_imm_ud(barrier_id_mask));
2714 
2715    /* Emit a gateway "barrier" message using the payload we set up, followed
2716     * by a wait instruction.
2717     */
2718    bld.exec_all().emit(ELK_SHADER_OPCODE_BARRIER, reg_undef, payload);
2719 }
2720 
2721 static void
2722 emit_tcs_barrier(nir_to_elk_state &ntb)
2723 {
2724    const fs_builder &bld = ntb.bld;
2725    elk_fs_visitor &s = ntb.s;
2726 
2727    assert(s.stage == MESA_SHADER_TESS_CTRL);
2728    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2729 
2730    elk_fs_reg m0 = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2731    elk_fs_reg m0_2 = component(m0, 2);
2732 
2733    const fs_builder chanbld = bld.exec_all().group(1, 0);
2734 
2735    /* Zero the message header */
2736    bld.exec_all().MOV(m0, elk_imm_ud(0u));
2737 
2738    /* Copy "Barrier ID" from r0.2, bits 16:13 */
2739    chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
2740                elk_imm_ud(INTEL_MASK(16, 13)));
2741 
2742    /* Shift it up to bits 27:24. */
2743    chanbld.SHL(m0_2, m0_2, elk_imm_ud(11));
2744 
2745    /* Set the Barrier Count and the enable bit */
2746    chanbld.OR(m0_2, m0_2,
2747               elk_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
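     /* Illustrative example: with 8 TCS instances the immediate above is
      * (8 << 9) | (1 << 15) == 0x9000, and the barrier ID copied from r0.2
      * lands in bits 27:24 after the SHL, completing the gateway header in
      * m0.2.
      */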
2748 
2749    bld.emit(ELK_SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2750 }
2751 
2752 static void
2753 fs_nir_emit_tcs_intrinsic(nir_to_elk_state &ntb,
2754                           nir_intrinsic_instr *instr)
2755 {
2756    const intel_device_info *devinfo = ntb.devinfo;
2757    const fs_builder &bld = ntb.bld;
2758    elk_fs_visitor &s = ntb.s;
2759 
2760    assert(s.stage == MESA_SHADER_TESS_CTRL);
2761    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2762    struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2763 
2764    elk_fs_reg dst;
2765    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2766       dst = get_nir_def(ntb, instr->def);
2767 
2768    switch (instr->intrinsic) {
2769    case nir_intrinsic_load_primitive_id:
2770       bld.MOV(dst, s.tcs_payload().primitive_id);
2771       break;
2772    case nir_intrinsic_load_invocation_id:
2773       bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
2774       break;
2775 
2776    case nir_intrinsic_barrier:
2777       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
2778          fs_nir_emit_intrinsic(ntb, bld, instr);
2779       if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
2780          if (tcs_prog_data->instances != 1)
2781             emit_tcs_barrier(ntb);
2782       }
2783       break;
2784 
2785    case nir_intrinsic_load_input:
2786       unreachable("nir_lower_io should never give us these.");
2787       break;
2788 
2789    case nir_intrinsic_load_per_vertex_input: {
2790       assert(instr->def.bit_size == 32);
2791       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2792       unsigned imm_offset = nir_intrinsic_base(instr);
2793       elk_fs_inst *inst;
2794 
2795       const bool multi_patch =
2796          vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
2797 
2798       elk_fs_reg icp_handle = multi_patch ?
2799          get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
2800          get_tcs_single_patch_icp_handle(ntb, bld, instr);
2801 
2802       /* We can only read two double components with each URB read, so
2803        * we send two read messages in that case, each one loading up to
2804        * two double components.
2805        */
2806       unsigned num_components = instr->num_components;
2807       unsigned first_component = nir_intrinsic_component(instr);
2808 
2809       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2810       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2811 
2812       if (indirect_offset.file == BAD_FILE) {
2813          /* Constant indexing - use global offset. */
2814          if (first_component != 0) {
2815             unsigned read_components = num_components + first_component;
2816             elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2817             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2818                             ARRAY_SIZE(srcs));
2819             for (unsigned i = 0; i < num_components; i++) {
2820                bld.MOV(offset(dst, bld, i),
2821                        offset(tmp, bld, i + first_component));
2822             }
2823          } else {
2824             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2825                             ARRAY_SIZE(srcs));
2826          }
2827          inst->offset = imm_offset;
2828       } else {
2829          /* Indirect indexing - use per-slot offsets as well. */
2830          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2831 
2832          if (first_component != 0) {
2833             unsigned read_components = num_components + first_component;
2834             elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2835             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2836                             srcs, ARRAY_SIZE(srcs));
2837             for (unsigned i = 0; i < num_components; i++) {
2838                bld.MOV(offset(dst, bld, i),
2839                        offset(tmp, bld, i + first_component));
2840             }
2841          } else {
2842             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2843                             srcs, ARRAY_SIZE(srcs));
2844          }
2845          inst->offset = imm_offset;
2846       }
2847       inst->size_written = (num_components + first_component) *
2848                            inst->dst.component_size(inst->exec_size);
2849 
2850       /* Copy the temporary to the destination to deal with writemasking.
2851        *
2852        * Also attempt to deal with gl_PointSize being in the .w component.
2853        */
2854       if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2855          assert(type_sz(dst.type) == 4);
2856          inst->dst = bld.vgrf(dst.type, 4);
2857          inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
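         /* Component 3 (.w) of the re-read slot is where gl_PointSize is
          * stored, hence the offset of 3 in the MOV below.
          */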
2858          bld.MOV(dst, offset(inst->dst, bld, 3));
2859       }
2860       break;
2861    }
2862 
2863    case nir_intrinsic_load_output:
2864    case nir_intrinsic_load_per_vertex_output: {
2865       assert(instr->def.bit_size == 32);
2866       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2867       unsigned imm_offset = nir_intrinsic_base(instr);
2868       unsigned first_component = nir_intrinsic_component(instr);
2869 
2870       elk_fs_inst *inst;
2871       if (indirect_offset.file == BAD_FILE) {
2872          /* This MOV replicates the output handle to all enabled channels
2873           * in SINGLE_PATCH mode.
2874           */
2875          elk_fs_reg patch_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2876          bld.MOV(patch_handle, s.tcs_payload().patch_urb_output);
2877 
2878          {
2879             elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2880             srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
2881 
2882             if (first_component != 0) {
2883                unsigned read_components =
2884                   instr->num_components + first_component;
2885                elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2886                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2887                                srcs, ARRAY_SIZE(srcs));
2888                inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2889                for (unsigned i = 0; i < instr->num_components; i++) {
2890                   bld.MOV(offset(dst, bld, i),
2891                           offset(tmp, bld, i + first_component));
2892                }
2893             } else {
2894                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2895                                srcs, ARRAY_SIZE(srcs));
2896                inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2897             }
2898             inst->offset = imm_offset;
2899          }
2900       } else {
2901          /* Indirect indexing - use per-slot offsets as well. */
2902          elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2903          srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2904          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2905 
2906          if (first_component != 0) {
2907             unsigned read_components =
2908                instr->num_components + first_component;
2909             elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2910             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2911                             srcs, ARRAY_SIZE(srcs));
2912             inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2913             for (unsigned i = 0; i < instr->num_components; i++) {
2914                bld.MOV(offset(dst, bld, i),
2915                        offset(tmp, bld, i + first_component));
2916             }
2917          } else {
2918             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2919                             srcs, ARRAY_SIZE(srcs));
2920             inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2921          }
2922          inst->offset = imm_offset;
2923       }
2924       break;
2925    }
2926 
2927    case nir_intrinsic_store_output:
2928    case nir_intrinsic_store_per_vertex_output: {
2929       assert(nir_src_bit_size(instr->src[0]) == 32);
2930       elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
2931       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2932       unsigned imm_offset = nir_intrinsic_base(instr);
2933       unsigned mask = nir_intrinsic_write_mask(instr);
2934 
2935       if (mask == 0)
2936          break;
2937 
2938       unsigned num_components = util_last_bit(mask);
2939       unsigned first_component = nir_intrinsic_component(instr);
2940       assert((first_component + num_components) <= 4);
2941 
2942       mask = mask << first_component;
2943 
2944       elk_fs_reg mask_reg;
2945       if (mask != WRITEMASK_XYZW)
2946          mask_reg = elk_imm_ud(mask << 16);
2947 
2948       elk_fs_reg sources[4];
2949 
2950       unsigned m = first_component;
2951       for (unsigned i = 0; i < num_components; i++) {
2952          int c = i + first_component;
2953          if (mask & (1 << c)) {
2954             sources[m++] = offset(value, bld, i);
2955          } else {
2956             m++;
2957          }
2958       }
2959 
2960       assert(m == (first_component + num_components));
2961 
2962       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2963       srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2964       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2965       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
2966       srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, m);
2967       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(m);
2968       bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
2969 
2970       elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2971                                srcs, ARRAY_SIZE(srcs));
2972       inst->offset = imm_offset;
2973       break;
2974    }
2975 
2976    default:
2977       fs_nir_emit_intrinsic(ntb, bld, instr);
2978       break;
2979    }
2980 }
2981 
2982 static void
2983 fs_nir_emit_tes_intrinsic(nir_to_elk_state &ntb,
2984                           nir_intrinsic_instr *instr)
2985 {
2986    const intel_device_info *devinfo = ntb.devinfo;
2987    const fs_builder &bld = ntb.bld;
2988    elk_fs_visitor &s = ntb.s;
2989 
2990    assert(s.stage == MESA_SHADER_TESS_EVAL);
2991    struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(s.prog_data);
2992 
2993    elk_fs_reg dest;
2994    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2995       dest = get_nir_def(ntb, instr->def);
2996 
2997    switch (instr->intrinsic) {
2998    case nir_intrinsic_load_primitive_id:
2999       bld.MOV(dest, s.tes_payload().primitive_id);
3000       break;
3001 
3002    case nir_intrinsic_load_tess_coord:
3003       for (unsigned i = 0; i < 3; i++)
3004          bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3005       break;
3006 
3007    case nir_intrinsic_load_input:
3008    case nir_intrinsic_load_per_vertex_input: {
3009       assert(instr->def.bit_size == 32);
3010       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3011       unsigned imm_offset = nir_intrinsic_base(instr);
3012       unsigned first_component = nir_intrinsic_component(instr);
3013 
3014       elk_fs_inst *inst;
3015       if (indirect_offset.file == BAD_FILE) {
3016          /* Arbitrarily only push up to 32 vec4 slots worth of data,
3017           * which is 16 registers (since each holds 2 vec4 slots).
3018           */
3019          const unsigned max_push_slots = 32;
3020          if (imm_offset < max_push_slots) {
3021             const elk_fs_reg src = horiz_offset(elk_fs_reg(ATTR, 0, dest.type),
3022                                             4 * imm_offset + first_component);
3023             for (int i = 0; i < instr->num_components; i++)
3024                bld.MOV(offset(dest, bld, i), component(src, i));
3025 
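            /* e.g. imm_offset 5 falls in the third pushed register
             * (vec4 slots 4 and 5), so the read length must be at least
             * 5 / 2 + 1 = 3.
             */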
3026             tes_prog_data->base.urb_read_length =
3027                MAX2(tes_prog_data->base.urb_read_length,
3028                     (imm_offset / 2) + 1);
3029          } else {
3030             /* Replicate the patch handle to all enabled channels */
3031             elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3032             srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3033 
3034             if (first_component != 0) {
3035                unsigned read_components =
3036                   instr->num_components + first_component;
3037                elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3038                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3039                                srcs, ARRAY_SIZE(srcs));
3040                inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3041                for (unsigned i = 0; i < instr->num_components; i++) {
3042                   bld.MOV(offset(dest, bld, i),
3043                           offset(tmp, bld, i + first_component));
3044                }
3045             } else {
3046                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3047                                srcs, ARRAY_SIZE(srcs));
3048                inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3049             }
3050             inst->offset = imm_offset;
3051          }
3052       } else {
3053          /* Indirect indexing - use per-slot offsets as well. */
3054 
3055          /* We can only read two double components with each URB read, so
3056           * we send two read messages in that case, each one loading up to
3057           * two double components.
3058           */
3059          unsigned num_components = instr->num_components;
3060 
3061          elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3062          srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3063          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3064 
3065          if (first_component != 0) {
3066             unsigned read_components =
3067                 num_components + first_component;
3068             elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3069             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3070                             srcs, ARRAY_SIZE(srcs));
3071             for (unsigned i = 0; i < num_components; i++) {
3072                bld.MOV(offset(dest, bld, i),
3073                        offset(tmp, bld, i + first_component));
3074             }
3075          } else {
3076             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3077                             srcs, ARRAY_SIZE(srcs));
3078          }
3079          inst->offset = imm_offset;
3080          inst->size_written = (num_components + first_component) *
3081                               inst->dst.component_size(inst->exec_size);
3082       }
3083       break;
3084    }
3085    default:
3086       fs_nir_emit_intrinsic(ntb, bld, instr);
3087       break;
3088    }
3089 }
3090 
3091 static void
3092 fs_nir_emit_gs_intrinsic(nir_to_elk_state &ntb,
3093                          nir_intrinsic_instr *instr)
3094 {
3095    const fs_builder &bld = ntb.bld;
3096    elk_fs_visitor &s = ntb.s;
3097 
3098    assert(s.stage == MESA_SHADER_GEOMETRY);
3099    elk_fs_reg indirect_offset;
3100 
3101    elk_fs_reg dest;
3102    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3103       dest = get_nir_def(ntb, instr->def);
3104 
3105    switch (instr->intrinsic) {
3106    case nir_intrinsic_load_primitive_id:
3107       assert(s.stage == MESA_SHADER_GEOMETRY);
3108       assert(elk_gs_prog_data(s.prog_data)->include_primitive_id);
3109       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
3110       break;
3111 
3112    case nir_intrinsic_load_input:
3113       unreachable("load_input intrinsics are invalid for the GS stage");
3114 
3115    case nir_intrinsic_load_per_vertex_input:
3116       emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3117                          instr->src[1], instr->num_components,
3118                          nir_intrinsic_component(instr));
3119       break;
3120 
3121    case nir_intrinsic_emit_vertex_with_counter:
3122       emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3123       break;
3124 
3125    case nir_intrinsic_end_primitive_with_counter:
3126       emit_gs_end_primitive(ntb, instr->src[0]);
3127       break;
3128 
3129    case nir_intrinsic_set_vertex_and_primitive_count:
3130       bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3131       break;
3132 
3133    case nir_intrinsic_load_invocation_id: {
3134       elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3135       assert(val.file != BAD_FILE);
3136       dest.type = val.type;
3137       bld.MOV(dest, val);
3138       break;
3139    }
3140 
3141    default:
3142       fs_nir_emit_intrinsic(ntb, bld, instr);
3143       break;
3144    }
3145 }
3146 
3147 /**
3148  * Fetch the current render target layer index.
3149  */
3150 static elk_fs_reg
3151 fetch_render_target_array_index(const fs_builder &bld)
3152 {
3153    if (bld.shader->devinfo->ver >= 6) {
3154       /* The render target array index is provided in the thread payload as
3155        * bits 26:16 of r0.0.
3156        */
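      /* Reading word 1 of r0 (elk_uw1_reg(..., 0, 1)) starts at bit 16, so
       * the AND with 0x7ff keeps 11 bits, i.e. the original bits 26:16.
       */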
3157       const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3158       bld.AND(idx, elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 0, 1),
3159               elk_imm_uw(0x7ff));
3160       return idx;
3161    } else {
3162       /* Pre-SNB we only ever render into the first layer of the framebuffer
3163        * since layered rendering is not implemented.
3164        */
3165       return elk_imm_ud(0);
3166    }
3167 }
3168 
3169 /* Sample from the MCS surface attached to this multisample texture. */
3170 static elk_fs_reg
3171 emit_mcs_fetch(nir_to_elk_state &ntb, const elk_fs_reg &coordinate, unsigned components,
3172                const elk_fs_reg &texture,
3173                const elk_fs_reg &texture_handle)
3174 {
3175    const fs_builder &bld = ntb.bld;
3176 
3177    const elk_fs_reg dest = ntb.s.vgrf(glsl_uvec4_type());
3178 
3179    elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3180    srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3181    srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3182    srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3183    srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3184    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(components);
3185    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
3186    srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
3187 
3188    elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3189                             ARRAY_SIZE(srcs));
3190 
3191    /* We only care about one or two regs of response, but the sampler always
3192     * writes 4/8.
3193     */
3194    inst->size_written = 4 * dest.component_size(inst->exec_size);
3195 
3196    return dest;
3197 }
3198 
3199 /**
3200  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3201  * framebuffer at the current fragment coordinates and sample index.
3202  */
3203 static elk_fs_inst *
3204 emit_non_coherent_fb_read(nir_to_elk_state &ntb, const fs_builder &bld, const elk_fs_reg &dst,
3205                           unsigned target)
3206 {
3207    elk_fs_visitor &s = ntb.s;
3208 
3209    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3210    const elk_wm_prog_key *wm_key =
3211       reinterpret_cast<const elk_wm_prog_key *>(s.key);
3212    assert(!wm_key->coherent_fb_fetch);
3213 
3214    /* Calculate the fragment coordinates. */
3215    const elk_fs_reg coords = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
3216    bld.MOV(offset(coords, bld, 0), s.pixel_x);
3217    bld.MOV(offset(coords, bld, 1), s.pixel_y);
3218    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3219 
3220    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3221     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3222     * shouldn't be necessary to recompile based on whether the framebuffer is
3223     * CMS or UMS.
3224     */
3225    assert(wm_key->multisample_fbo == ELK_ALWAYS ||
3226           wm_key->multisample_fbo == ELK_NEVER);
3227    if (wm_key->multisample_fbo &&
3228        ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3229       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3230 
3231    const elk_fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3232    const elk_fs_reg mcs = wm_key->multisample_fbo ?
3233       emit_mcs_fetch(ntb, coords, 3, elk_imm_ud(target), elk_fs_reg()) : elk_fs_reg();
3234 
3235    /* Use either a normal or a CMS texel fetch message depending on whether
3236     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3237     * message just in case the framebuffer uses 16x multisampling, it should
3238     * be equivalent to the normal CMS fetch for lower multisampling modes.
3239     */
3240    elk_opcode op;
3241    if (wm_key->multisample_fbo) {
3242       op = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
3243    } else {
3244       op = ELK_SHADER_OPCODE_TXF_LOGICAL;
3245    }
3246 
3247    /* Emit the instruction. */
3248    elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3249    srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3250    srcs[TEX_LOGICAL_SRC_LOD]              = elk_imm_ud(0);
3251    srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3252    srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3253    srcs[TEX_LOGICAL_SRC_SURFACE]          = elk_imm_ud(target);
3254    srcs[TEX_LOGICAL_SRC_SAMPLER]          = elk_imm_ud(0);
3255    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_ud(3);
3256    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = elk_imm_ud(0);
3257    srcs[TEX_LOGICAL_SRC_RESIDENCY]        = elk_imm_ud(0);
3258 
3259    elk_fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3260    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3261 
3262    return inst;
3263 }
3264 
3265 static elk_fs_reg
3266 alloc_temporary(const fs_builder &bld, unsigned size, elk_fs_reg *regs, unsigned n)
3267 {
3268    if (n && regs[0].file != BAD_FILE) {
3269       return regs[0];
3270 
3271    } else {
3272       const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, size);
3273 
3274       for (unsigned i = 0; i < n; i++)
3275          regs[i] = tmp;
3276 
3277       return tmp;
3278    }
3279 }
3280 
3281 static elk_fs_reg
3282 alloc_frag_output(nir_to_elk_state &ntb, unsigned location)
3283 {
3284    elk_fs_visitor &s = ntb.s;
3285 
3286    assert(s.stage == MESA_SHADER_FRAGMENT);
3287    const elk_wm_prog_key *const key =
3288       reinterpret_cast<const elk_wm_prog_key *>(s.key);
3289    const unsigned l = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_LOCATION);
3290    const unsigned i = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_INDEX);
3291 
3292    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3293       return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3294 
3295    else if (l == FRAG_RESULT_COLOR)
3296       return alloc_temporary(ntb.bld, 4, s.outputs,
3297                              MAX2(key->nr_color_regions, 1));
3298 
3299    else if (l == FRAG_RESULT_DEPTH)
3300       return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3301 
3302    else if (l == FRAG_RESULT_STENCIL)
3303       return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3304 
3305    else if (l == FRAG_RESULT_SAMPLE_MASK)
3306       return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3307 
3308    else if (l >= FRAG_RESULT_DATA0 &&
3309             l < FRAG_RESULT_DATA0 + ELK_MAX_DRAW_BUFFERS)
3310       return alloc_temporary(ntb.bld, 4,
3311                              &s.outputs[l - FRAG_RESULT_DATA0], 1);
3312 
3313    else
3314       unreachable("Invalid location");
3315 }
3316 
3317 static void
3318 emit_is_helper_invocation(nir_to_elk_state &ntb, elk_fs_reg result)
3319 {
3320    const fs_builder &bld = ntb.bld;
3321 
3322    /* Unlike the regular gl_HelperInvocation, which is defined at dispatch
3323     * time, helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes
3324     * demoted invocations into consideration.
3325     */
3326    result.type = ELK_REGISTER_TYPE_UD;
3327 
3328    bld.MOV(result, elk_imm_ud(0));
3329 
3330    /* See elk_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3331    unsigned width = bld.dispatch_width();
3332    for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3333       const fs_builder b = bld.group(MIN2(width, 16), i);
3334 
3335       elk_fs_inst *mov = b.MOV(offset(result, b, i), elk_imm_ud(~0));
3336 
3337       /* The at() ensures that any code emitted to get the predicate happens
3338        * before the mov right above.  This is not an issue elsewhere because
3339        * lowering code already set up the builder this way.
3340        */
3341       elk_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3342       mov->predicate_inverse = true;
3343    }
3344 }
3345 
3346 static void
3347 emit_fragcoord_interpolation(nir_to_elk_state &ntb, elk_fs_reg wpos)
3348 {
3349    const intel_device_info *devinfo = ntb.devinfo;
3350    const fs_builder &bld = ntb.bld;
3351    elk_fs_visitor &s = ntb.s;
3352 
3353    assert(s.stage == MESA_SHADER_FRAGMENT);
3354 
3355    /* gl_FragCoord.x */
3356    bld.MOV(wpos, s.pixel_x);
3357    wpos = offset(wpos, bld, 1);
3358 
3359    /* gl_FragCoord.y */
3360    bld.MOV(wpos, s.pixel_y);
3361    wpos = offset(wpos, bld, 1);
3362 
3363    /* gl_FragCoord.z */
3364    if (devinfo->ver >= 6) {
3365       bld.MOV(wpos, s.pixel_z);
3366    } else {
3367       bld.emit(ELK_FS_OPCODE_LINTERP, wpos,
3368                s.delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL],
3369                s.interp_reg(bld, VARYING_SLOT_POS, 2, 0));
3370    }
3371    wpos = offset(wpos, bld, 1);
3372 
3373    /* gl_FragCoord.w: Already set up in emit_interpolation */
3374    bld.MOV(wpos, s.wpos_w);
3375 }
3376 
3377 static elk_fs_reg
3378 emit_frontfacing_interpolation(nir_to_elk_state &ntb)
3379 {
3380    const intel_device_info *devinfo = ntb.devinfo;
3381    const fs_builder &bld = ntb.bld;
3382 
3383    elk_fs_reg ff = bld.vgrf(ELK_REGISTER_TYPE_D);
3384 
3385    if (devinfo->ver >= 6) {
3386       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3387        * a boolean result from this (~0/true or 0/false).
3388        *
3389        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3390        * this task in only one instruction:
3391        *    - a negation source modifier will flip the bit; and
3392        *    - a W -> D type conversion will sign extend the bit into the high
3393        *      word of the destination.
3394        *
3395        * An ASR 15 fills the low word of the destination.
3396        */
3397       elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
3398       g0.negate = true;
3399 
3400       bld.ASR(ff, g0, elk_imm_d(15));
3401    } else {
3402       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
3403        * a boolean result from this (1/true or 0/false).
3404        *
3405        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
3406        * the negation source modifier to flip it. Unfortunately the SHR
3407        * instruction only operates on UD (or D with an abs source modifier)
3408        * sources without negation.
3409        *
3410        * Instead, use ASR (which will give ~0/true or 0/false).
3411        */
3412       elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
3413       g1_6.negate = true;
3414 
3415       bld.ASR(ff, g1_6, elk_imm_d(31));
3416    }
3417 
3418    return ff;
3419 }
3420 
3421 static elk_fs_reg
3422 emit_samplepos_setup(nir_to_elk_state &ntb)
3423 {
3424    const intel_device_info *devinfo = ntb.devinfo;
3425    const fs_builder &bld = ntb.bld;
3426    elk_fs_visitor &s = ntb.s;
3427 
3428    assert(s.stage == MESA_SHADER_FRAGMENT);
3429    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3430    assert(devinfo->ver >= 6);
3431 
3432    const fs_builder abld = bld.annotate("compute sample position");
3433    elk_fs_reg pos = abld.vgrf(ELK_REGISTER_TYPE_F, 2);
3434 
3435    if (wm_prog_data->persample_dispatch == ELK_NEVER) {
3436       /* From ARB_sample_shading specification:
3437        * "When rendering to a non-multisample buffer, or if multisample
3438        *  rasterization is disabled, gl_SamplePosition will always be
3439        *  (0.5, 0.5).
3440        *  (0.5, 0.5)."
3441       bld.MOV(offset(pos, bld, 0), elk_imm_f(0.5f));
3442       bld.MOV(offset(pos, bld, 1), elk_imm_f(0.5f));
3443       return pos;
3444    }
3445 
3446    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3447     * mode will be enabled.
3448     *
3449     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3450     * R31.1:0         Position Offset X/Y for Slot[3:0]
3451     * R31.3:2         Position Offset X/Y for Slot[7:4]
3452     * .....
3453     *
3454     * The X, Y sample positions come in as bytes in the thread payload. So, read
3455     * the positions using vstride=16, width=8, hstride=2.
3456     */
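   /* The per-sample integer offsets are scaled by 1/16 below to land in
    * [0, 1]; e.g. a payload value of 8 becomes 8 * (1/16) = 0.5, the pixel
    * center.
    */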
3457    const elk_fs_reg sample_pos_reg =
3458       fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, ELK_REGISTER_TYPE_W);
3459 
3460    for (unsigned i = 0; i < 2; i++) {
3461       elk_fs_reg tmp_d = bld.vgrf(ELK_REGISTER_TYPE_D);
3462       abld.MOV(tmp_d, subscript(sample_pos_reg, ELK_REGISTER_TYPE_B, i));
3463       /* Convert int_sample_pos to floating point */
3464       elk_fs_reg tmp_f = bld.vgrf(ELK_REGISTER_TYPE_F);
3465       abld.MOV(tmp_f, tmp_d);
3466       /* Scale to the range [0, 1] */
3467       abld.MUL(offset(pos, abld, i), tmp_f, elk_imm_f(1 / 16.0f));
3468    }
3469 
3470    if (wm_prog_data->persample_dispatch == ELK_SOMETIMES) {
3471       check_dynamic_msaa_flag(abld, wm_prog_data,
3472                               INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3473       for (unsigned i = 0; i < 2; i++) {
3474          set_predicate(ELK_PREDICATE_NORMAL,
3475                        bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3476                                elk_imm_f(0.5f)));
3477       }
3478    }
3479 
3480    return pos;
3481 }
3482 
3483 static elk_fs_reg
3484 emit_sampleid_setup(nir_to_elk_state &ntb)
3485 {
3486    const intel_device_info *devinfo = ntb.devinfo;
3487    const fs_builder &bld = ntb.bld;
3488    elk_fs_visitor &s = ntb.s;
3489 
3490    assert(s.stage == MESA_SHADER_FRAGMENT);
3491    ASSERTED elk_wm_prog_key *key = (elk_wm_prog_key*) s.key;
3492    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3493    assert(devinfo->ver >= 6);
3494 
3495    const fs_builder abld = bld.annotate("compute sample id");
3496    elk_fs_reg sample_id = abld.vgrf(ELK_REGISTER_TYPE_UD);
3497 
3498    assert(key->multisample_fbo != ELK_NEVER);
3499 
3500    if (devinfo->ver >= 8) {
3501       /* Sample ID comes in as 4-bit numbers in g1.0:
3502        *
3503        *    15:12 Slot 3 SampleID (only used in SIMD16)
3504        *     11:8 Slot 2 SampleID (only used in SIMD16)
3505        *      7:4 Slot 1 SampleID
3506        *      3:0 Slot 0 SampleID
3507        *
3508        * Each slot corresponds to four channels, so we want to replicate each
3509        * half-byte value to 4 channels in a row:
3510        *
3511        *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
3512        *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
3513        *
3514        *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
3515        *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
3516        *
3517        * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
3518        * channels to read the first byte (7:0), and the second group of 8
3519        * channels to read the second byte (15:8).  Then, we shift right by
3520        * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
3521        * values into place.  Finally, we AND with 0xf to keep the low nibble.
3522        *
3523        *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
3524        *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
3525        *
3526        * TODO: These payload bits exist on Gfx7 too, but they appear to always
3527        *       be zero, so this code fails to work.  We should find out why.
3528        */
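      /* For example, if the first byte of g1.0 is 0x51, channels 0-3 (shift
       * by 0) keep 0x51 and the AND with 0xf yields 1 (Slot 0 SampleID),
       * while channels 4-7 (shift by 4) yield 5 (Slot 1 SampleID).
       */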
3529       const elk_fs_reg tmp = abld.vgrf(ELK_REGISTER_TYPE_UW);
3530 
3531       for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3532          const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
3533          /* According to the "PS Thread Payload for Normal Dispatch"
3534           * pages on the BSpec, the sample ids are stored in R1.0/R2.0 on gfx8+.
3535           */
3536          const struct elk_reg id_reg = elk_vec1_grf(i + 1, 0);
3537          hbld.SHR(offset(tmp, hbld, i),
3538                   stride(retype(id_reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
3539                   elk_imm_v(0x44440000));
3540       }
3541 
3542       abld.AND(sample_id, tmp, elk_imm_w(0xf));
3543    } else {
3544       const elk_fs_reg t1 = component(abld.vgrf(ELK_REGISTER_TYPE_UD), 0);
3545       const elk_fs_reg t2 = abld.vgrf(ELK_REGISTER_TYPE_UW);
3546 
3547       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
3548        * 8x multisampling, subspan 0 will represent sample N (where N
3549        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
3550        * 7. We can find the value of N by looking at R0.0 bits 7:6
3551        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
3552        * (since samples are always delivered in pairs). That is, we
3553        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
3554        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
3555        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
3556        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
3557        * populating a temporary variable with the sequence (0, 1, 2, 3),
3558        * and then reading from it using vstride=1, width=4, hstride=0.
3559        * These computations hold good for 4x multisampling as well.
3560        *
3561        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
3562        * the first four slots are sample 0 of subspan 0; the next four
3563        * are sample 1 of subspan 0; the third group is sample 0 of
3564        * subspan 1, and finally sample 1 of subspan 1.
3565        */
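      /* For example, with 8x MSAA and SSPI = 2 (R0.0 bits 7:6 == 0b10),
       * (R0.0 & 0xc0) >> 5 == 4, so subspan 0 represents sample 4 and
       * subspan 1 represents sample 5 once the (0, 0, 0, 0, 1, 1, 1, 1)
       * sequence is added.
       */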
3566 
3567       /* SKL+ has an extra bit for the Starting Sample Pair Index to
3568        * accommodate 16x MSAA.
3569        */
3570       abld.exec_all().group(1, 0)
3571           .AND(t1, elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD)),
3572                elk_imm_ud(0xc0));
3573       abld.exec_all().group(1, 0).SHR(t1, t1, elk_imm_d(5));
3574 
3575       /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
3576        * can assume 4x MSAA.  Disallow it on IVB+
3577        *
3578        * FINISHME: One day, we could come up with a way to do this that
3579        * actually works on gfx7.
3580        */
3581       if (devinfo->ver >= 7)
3582          s.limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
3583       abld.exec_all().group(8, 0).MOV(t2, elk_imm_v(0x32103210));
3584 
3585       /* This special instruction takes care of setting vstride=1,
3586        * width=4, hstride=0 of t2 during an ADD instruction.
3587        */
3588       abld.emit(ELK_FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
3589    }
3590 
3591    if (key->multisample_fbo == ELK_SOMETIMES) {
3592       check_dynamic_msaa_flag(abld, wm_prog_data,
3593                               INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3594       set_predicate(ELK_PREDICATE_NORMAL,
3595                     abld.SEL(sample_id, sample_id, elk_imm_ud(0)));
3596    }
3597 
3598    return sample_id;
3599 }
3600 
3601 static elk_fs_reg
3602 emit_samplemaskin_setup(nir_to_elk_state &ntb)
3603 {
3604    const intel_device_info *devinfo = ntb.devinfo;
3605    const fs_builder &bld = ntb.bld;
3606    elk_fs_visitor &s = ntb.s;
3607 
3608    assert(s.stage == MESA_SHADER_FRAGMENT);
3609    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3610    assert(devinfo->ver >= 6);
3611 
3612    elk_fs_reg coverage_mask =
3613       fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, ELK_REGISTER_TYPE_D);
3614 
3615    if (wm_prog_data->persample_dispatch == ELK_NEVER)
3616       return coverage_mask;
3617 
3618    /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
3619     * and a mask representing which sample is being processed by the
3620     * current shader invocation.
3621     *
3622     * From the OES_sample_variables specification:
3623     * "When per-sample shading is active due to the use of a fragment input
3624     *  qualified by "sample" or due to the use of the gl_SampleID or
3625     *  gl_SamplePosition variables, only the bit for the current sample is
3626     *  set in gl_SampleMaskIn."
3627     */
3628    const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
3629 
3630    if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3631       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3632 
3633    elk_fs_reg one = s.vgrf(glsl_int_type());
3634    elk_fs_reg enabled_mask = s.vgrf(glsl_int_type());
3635    abld.MOV(one, elk_imm_d(1));
3636    abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
3637    elk_fs_reg mask = bld.vgrf(ELK_REGISTER_TYPE_D);
3638    abld.AND(mask, enabled_mask, coverage_mask);
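   /* e.g. with a coverage mask of 0b1111 and gl_SampleID == 2 this computes
    * (1 << 2) & 0b1111 == 0b0100, leaving only the current sample's bit set.
    */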
3639 
3640    if (wm_prog_data->persample_dispatch == ELK_ALWAYS)
3641       return mask;
3642 
3643    check_dynamic_msaa_flag(abld, wm_prog_data,
3644                            INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3645    set_predicate(ELK_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
3646 
3647    return mask;
3648 }
3649 
3650 static void
3651 fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
3652                          nir_intrinsic_instr *instr)
3653 {
3654    const intel_device_info *devinfo = ntb.devinfo;
3655    const fs_builder &bld = ntb.bld;
3656    elk_fs_visitor &s = ntb.s;
3657 
3658    assert(s.stage == MESA_SHADER_FRAGMENT);
3659 
3660    elk_fs_reg dest;
3661    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3662       dest = get_nir_def(ntb, instr->def);
3663 
3664    switch (instr->intrinsic) {
3665    case nir_intrinsic_load_front_face:
3666       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
3667               emit_frontfacing_interpolation(ntb));
3668       break;
3669 
3670    case nir_intrinsic_load_sample_pos:
3671    case nir_intrinsic_load_sample_pos_or_center: {
3672       elk_fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
3673       assert(sample_pos.file != BAD_FILE);
3674       dest.type = sample_pos.type;
3675       bld.MOV(dest, sample_pos);
3676       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3677       break;
3678    }
3679 
3680    case nir_intrinsic_load_layer_id:
3681       dest.type = ELK_REGISTER_TYPE_UD;
3682       bld.MOV(dest, fetch_render_target_array_index(bld));
3683       break;
3684 
3685    case nir_intrinsic_is_helper_invocation:
3686       emit_is_helper_invocation(ntb, dest);
3687       break;
3688 
3689    case nir_intrinsic_load_helper_invocation:
3690    case nir_intrinsic_load_sample_mask_in:
3691    case nir_intrinsic_load_sample_id:
3692    case nir_intrinsic_load_frag_shading_rate: {
3693       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3694       elk_fs_reg val = ntb.system_values[sv];
3695       assert(val.file != BAD_FILE);
3696       dest.type = val.type;
3697       bld.MOV(dest, val);
3698       break;
3699    }
3700 
3701    case nir_intrinsic_store_output: {
3702       const elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
3703       const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3704       const unsigned location = nir_intrinsic_base(instr) +
3705          SET_FIELD(store_offset, ELK_NIR_FRAG_OUTPUT_LOCATION);
3706       const elk_fs_reg new_dest = retype(alloc_frag_output(ntb, location),
3707                                      src.type);
3708 
3709       for (unsigned j = 0; j < instr->num_components; j++)
3710          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3711                  offset(src, bld, j));
3712 
3713       break;
3714    }
3715 
3716    case nir_intrinsic_load_output: {
3717       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3718                                    ELK_NIR_FRAG_OUTPUT_LOCATION);
3719       assert(l >= FRAG_RESULT_DATA0);
3720       const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3721       const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3722       const elk_fs_reg tmp = bld.vgrf(dest.type, 4);
3723 
3724       assert(!reinterpret_cast<const elk_wm_prog_key *>(s.key)->coherent_fb_fetch);
3725       emit_non_coherent_fb_read(ntb, bld, tmp, target);
3726 
3727       for (unsigned j = 0; j < instr->num_components; j++) {
3728          bld.MOV(offset(dest, bld, j),
3729                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3730       }
3731 
3732       break;
3733    }
3734 
3735    case nir_intrinsic_demote:
3736    case nir_intrinsic_terminate:
3737    case nir_intrinsic_demote_if:
3738    case nir_intrinsic_terminate_if: {
3739       /* We track our discarded pixels in f0.1/f1.0.  By predicating on it, we
3740        * can update just the flag bits that aren't yet discarded.  If there's
3741        * no condition, we emit a CMP of g0 != g0, so all currently executing
3742        * channels will get turned off.
3743        */
3744       elk_fs_inst *cmp = NULL;
3745       if (instr->intrinsic == nir_intrinsic_demote_if ||
3746           instr->intrinsic == nir_intrinsic_terminate_if) {
3747          nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
3748 
3749          if (alu != NULL &&
3750              alu->op != nir_op_bcsel &&
3751              (devinfo->ver > 5 ||
3752               (alu->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) != ELK_NIR_BOOLEAN_NEEDS_RESOLVE ||
3753               alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
3754               alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
3755               alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
3756               alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
3757               alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
3758             /* Re-emit the instruction that generated the Boolean value, but
3759              * do not store it.  Since this instruction will be conditional,
3760              * other instructions that want to use the real Boolean value may
3761              * get garbage.  This was a problem for piglit's fs-discard-exit-2
3762              * test.
3763              *
3764              * Ideally we'd detect that the instruction cannot have a
3765              * conditional modifier before emitting the instructions.  Alas,
3766              * that is nigh impossible.  Instead, we're going to assume the
3767              * instruction (or last instruction) generated can have a
3768              * conditional modifier.  If it cannot, fallback to the old-style
3769              * compare, and hope dead code elimination will clean up the
3770              * extra instructions generated.
3771              */
3772             fs_nir_emit_alu(ntb, alu, false);
3773 
3774             cmp = (elk_fs_inst *) s.instructions.get_tail();
3775             if (cmp->conditional_mod == ELK_CONDITIONAL_NONE) {
3776                if (cmp->can_do_cmod())
3777                   cmp->conditional_mod = ELK_CONDITIONAL_Z;
3778                else
3779                   cmp = NULL;
3780             } else {
3781                /* The old sequence that would have been generated is,
3782                 * basically, bool_result == false.  This is equivalent to
3783                 * !bool_result, so negate the old modifier.
3784                 */
3785                cmp->conditional_mod = elk_negate_cmod(cmp->conditional_mod);
3786             }
3787          }
3788 
3789          if (cmp == NULL) {
3790             cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
3791                           elk_imm_d(0), ELK_CONDITIONAL_Z);
3792          }
3793       } else {
3794          elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
3795                                        ELK_REGISTER_TYPE_UW));
3796          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, ELK_CONDITIONAL_NZ);
3797       }
3798 
3799       cmp->predicate = ELK_PREDICATE_NORMAL;
3800       cmp->flag_subreg = sample_mask_flag_subreg(s);
3801 
3802       elk_fs_inst *jump = bld.emit(ELK_OPCODE_HALT);
3803       jump->flag_subreg = sample_mask_flag_subreg(s);
3804       jump->predicate_inverse = true;
3805 
3806       if (instr->intrinsic == nir_intrinsic_terminate ||
3807           instr->intrinsic == nir_intrinsic_terminate_if) {
3808          jump->predicate = ELK_PREDICATE_NORMAL;
3809       } else {
3810          /* Only jump when the whole quad is demoted.  For historical
3811           * reasons this is also used for discard.
3812           */
3813          jump->predicate = ELK_PREDICATE_ALIGN1_ANY4H;
3814       }
3815 
3816       if (devinfo->ver < 7)
3817          s.limit_dispatch_width(
3818             16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
3819       break;
3820    }
3821 
3822    case nir_intrinsic_load_input:
3823    case nir_intrinsic_load_per_primitive_input: {
3824       /* In Fragment Shaders load_input is used either for flat inputs or
3825        * per-primitive inputs.
3826        */
3827       assert(instr->def.bit_size == 32);
3828       unsigned base = nir_intrinsic_base(instr);
3829       unsigned comp = nir_intrinsic_component(instr);
3830       unsigned num_components = instr->num_components;
3831 
3832       /* Special case fields in the VUE header */
3833       if (base == VARYING_SLOT_LAYER)
3834          comp = 1;
3835       else if (base == VARYING_SLOT_VIEWPORT)
3836          comp = 2;
3837 
3838       if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
3839          assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
3840          for (unsigned int i = 0; i < num_components; i++) {
3841             bld.MOV(offset(dest, bld, i),
3842                     retype(s.per_primitive_reg(bld, base, comp + i), dest.type));
3843          }
3844       } else {
3845          const unsigned k = 3;
3846          for (unsigned int i = 0; i < num_components; i++) {
3847             bld.MOV(offset(dest, bld, i),
3848                     retype(s.interp_reg(bld, base, comp + i, k), dest.type));
3849          }
3850       }
3851       break;
3852    }
3853 
3854    case nir_intrinsic_load_fs_input_interp_deltas: {
3855       assert(s.stage == MESA_SHADER_FRAGMENT);
3856       assert(nir_src_as_uint(instr->src[0]) == 0);
3857       const unsigned base = nir_intrinsic_base(instr);
3858       const unsigned comp = nir_intrinsic_component(instr);
3859       dest.type = ELK_REGISTER_TYPE_F;
3860 
3861       bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
3862       bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
3863       bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
3864 
3865       break;
3866    }
3867 
3868    case nir_intrinsic_load_barycentric_pixel:
3869    case nir_intrinsic_load_barycentric_centroid:
3870    case nir_intrinsic_load_barycentric_sample: {
3871       /* Use the delta_xy values computed from the payload */
3872       enum elk_barycentric_mode bary = elk_barycentric_mode(instr);
3873       const elk_fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
3874                               offset(s.delta_xy[bary], bld, 1) };
3875       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3876       break;
3877    }
3878 
3879    case nir_intrinsic_load_barycentric_at_sample: {
3880       const glsl_interp_mode interpolation =
3881          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3882 
3883       elk_fs_reg msg_data;
3884       if (nir_src_is_const(instr->src[0])) {
3885          msg_data = elk_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
3886       } else {
3887          const elk_fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
3888                                           ELK_REGISTER_TYPE_UD);
3889          const elk_fs_reg sample_id = bld.emit_uniformize(sample_src);
3890          msg_data = component(bld.group(8, 0).vgrf(ELK_REGISTER_TYPE_UD), 0);
3891          bld.exec_all().group(1, 0).SHL(msg_data, sample_id, elk_imm_ud(4u));
3892       }
3893 
3894       elk_fs_reg flag_reg;
3895       struct elk_wm_prog_key *wm_prog_key = (struct elk_wm_prog_key *) s.key;
3896       if (wm_prog_key->multisample_fbo == ELK_SOMETIMES) {
3897          struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3898 
3899          check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
3900                                  wm_prog_data,
3901                                  INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3902          flag_reg = elk_flag_reg(0, 0);
3903       }
3904 
3905       emit_pixel_interpolater_send(bld,
3906                                    ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3907                                    dest,
3908                                    elk_fs_reg(), /* src */
3909                                    msg_data,
3910                                    flag_reg,
3911                                    interpolation);
3912       break;
3913    }
3914 
3915    case nir_intrinsic_load_barycentric_at_offset: {
3916       const glsl_interp_mode interpolation =
3917          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3918 
3919       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3920 
3921       if (const_offset) {
3922          assert(nir_src_bit_size(instr->src[0]) == 32);
3923          unsigned off_x = const_offset[0].u32 & 0xf;
3924          unsigned off_y = const_offset[1].u32 & 0xf;
3925 
3926          emit_pixel_interpolater_send(bld,
3927                                       ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3928                                       dest,
3929                                       elk_fs_reg(), /* src */
3930                                       elk_imm_ud(off_x | (off_y << 4)),
3931                                       elk_fs_reg(), /* flag_reg */
3932                                       interpolation);
3933       } else {
3934          elk_fs_reg src = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_D);
3935          const enum elk_opcode opcode = ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3936          emit_pixel_interpolater_send(bld,
3937                                       opcode,
3938                                       dest,
3939                                       src,
3940                                       elk_imm_ud(0u),
3941                                       elk_fs_reg(), /* flag_reg */
3942                                       interpolation);
3943       }
3944       break;
3945    }
3946 
3947    case nir_intrinsic_load_frag_coord:
3948       emit_fragcoord_interpolation(ntb, dest);
3949       break;
3950 
3951    case nir_intrinsic_load_interpolated_input: {
3952       assert(instr->src[0].ssa &&
3953              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3954       nir_intrinsic_instr *bary_intrinsic =
3955          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3956       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3957       enum glsl_interp_mode interp_mode =
3958          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3959       elk_fs_reg dst_xy;
3960 
3961       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3962           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3963          /* Use the result of the PI message. */
3964          dst_xy = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F);
3965       } else {
3966          /* Use the delta_xy values computed from the payload */
3967          enum elk_barycentric_mode bary = elk_barycentric_mode(bary_intrinsic);
3968          dst_xy = s.delta_xy[bary];
3969       }
3970 
3971       for (unsigned int i = 0; i < instr->num_components; i++) {
3972          elk_fs_reg interp =
3973             s.interp_reg(bld, nir_intrinsic_base(instr),
3974                          nir_intrinsic_component(instr) + i, 0);
3975          interp.type = ELK_REGISTER_TYPE_F;
3976          dest.type = ELK_REGISTER_TYPE_F;
3977 
3978          if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3979             elk_fs_reg tmp = s.vgrf(glsl_float_type());
3980             bld.emit(ELK_FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3981             bld.MUL(offset(dest, bld, i), tmp, s.pixel_w);
3982          } else {
3983             bld.emit(ELK_FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3984          }
3985       }
3986       break;
3987    }
3988 
3989    default:
3990       fs_nir_emit_intrinsic(ntb, bld, instr);
3991       break;
3992    }
3993 }
3994 
3995 static void
3996 fs_nir_emit_cs_intrinsic(nir_to_elk_state &ntb,
3997                          nir_intrinsic_instr *instr)
3998 {
3999    const intel_device_info *devinfo = ntb.devinfo;
4000    const fs_builder &bld = ntb.bld;
4001    elk_fs_visitor &s = ntb.s;
4002 
4003    assert(gl_shader_stage_uses_workgroup(s.stage));
4004    struct elk_cs_prog_data *cs_prog_data = elk_cs_prog_data(s.prog_data);
4005 
4006    elk_fs_reg dest;
4007    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4008       dest = get_nir_def(ntb, instr->def);
4009 
4010    switch (instr->intrinsic) {
4011    case nir_intrinsic_barrier:
4012       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4013          fs_nir_emit_intrinsic(ntb, bld, instr);
4014       if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4015          /* The whole workgroup fits in a single HW thread, so all the
4016           * invocations are already executed lock-step.  Instead of an actual
4017           * barrier, just emit a scheduling fence, which will generate no code.
4018           */
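         /* e.g. an 8x1x1 workgroup dispatched in SIMD16 satisfies
          * workgroup_size() <= dispatch_width and needs no cross-thread
          * synchronization.
          */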
4019          if (!s.nir->info.workgroup_size_variable &&
4020              s.workgroup_size() <= s.dispatch_width) {
4021             bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE);
4022             break;
4023          }
4024 
4025          emit_barrier(ntb);
4026          cs_prog_data->uses_barrier = true;
4027       }
4028       break;
4029 
4030    case nir_intrinsic_load_subgroup_id:
4031       s.cs_payload().load_subgroup_id(bld, dest);
4032       break;
4033 
4034    case nir_intrinsic_load_workgroup_id: {
4035       elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4036       assert(val.file != BAD_FILE);
4037       dest.type = val.type;
4038       for (unsigned i = 0; i < 3; i++)
4039          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
4040       break;
4041    }
4042 
4043    case nir_intrinsic_load_num_workgroups: {
4044       assert(instr->def.bit_size == 32);
4045 
4046       cs_prog_data->uses_num_work_groups = true;
4047 
4048       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4049       srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(0);
4050       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4051       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(3); /* num components */
4052       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = elk_imm_ud(0);
4053       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4054       elk_fs_inst *inst =
4055          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4056                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4057       inst->size_written = 3 * s.dispatch_width * 4;
4058       break;
4059    }
4060 
4061    case nir_intrinsic_shared_atomic:
4062    case nir_intrinsic_shared_atomic_swap:
4063       fs_nir_emit_surface_atomic(ntb, bld, instr, elk_imm_ud(GFX7_BTI_SLM),
4064                                  false /* bindless */);
4065       break;
4066 
4067    case nir_intrinsic_load_shared: {
4068       assert(devinfo->ver >= 7);
4069 
4070       const unsigned bit_size = instr->def.bit_size;
4071       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4072       srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4073 
4074       elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
4075       int base = nir_intrinsic_base(instr);
4076       if (base) {
4077          elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4078          bld.ADD(addr_off, addr, elk_imm_d(base));
4079          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4080       } else {
4081          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4082       }
4083 
4084       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4085       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4086 
4087       /* Make dest unsigned because that's what the temporary will be */
4088       dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4089 
4090       /* Read the vector */
4091       assert(bit_size <= 32);
4092       assert(nir_intrinsic_align(instr) > 0);
4093       if (bit_size == 32 &&
4094           nir_intrinsic_align(instr) >= 4) {
4095          assert(instr->def.num_components <= 4);
4096          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4097          elk_fs_inst *inst =
4098             bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4099                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4100          inst->size_written = instr->num_components * s.dispatch_width * 4;
4101       } else {
4102          assert(instr->def.num_components == 1);
4103          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4104 
4105          elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
4106          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4107                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4108          bld.MOV(dest, subscript(read_result, dest.type, 0));
4109       }
4110       break;
4111    }
4112 
4113    case nir_intrinsic_store_shared: {
4114       assert(devinfo->ver >= 7);
4115 
4116       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4117       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4118       srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4119 
4120       elk_fs_reg addr = get_nir_src(ntb, instr->src[1]);
4121       int base = nir_intrinsic_base(instr);
4122       if (base) {
4123          elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4124          bld.ADD(addr_off, addr, elk_imm_d(base));
4125          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4126       } else {
4127          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4128       }
4129 
4130       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4131       /* No point in masking with the sample mask; here we're handling
4132        * compute intrinsics.
4133        */
4134       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4135 
4136       elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
4137       data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4138 
4139       assert(bit_size <= 32);
4140       assert(nir_intrinsic_write_mask(instr) ==
4141              (1u << instr->num_components) - 1);
4142       assert(nir_intrinsic_align(instr) > 0);
4143       if (bit_size == 32 &&
4144           nir_intrinsic_align(instr) >= 4) {
4145          assert(nir_src_num_components(instr->src[0]) <= 4);
4146          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4147          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4148          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4149                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4150       } else {
4151          assert(nir_src_num_components(instr->src[0]) == 1);
4152          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4153 
4154          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
4155          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4156 
4157          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4158                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4159       }
4160       break;
4161    }
4162 
4163    case nir_intrinsic_load_workgroup_size: {
4164       /* Should have been lowered by elk_nir_lower_cs_intrinsics() or
4165        * crocus/iris_setup_uniforms() for the variable group size case.
4166        */
4167       unreachable("Should have been lowered");
4168       break;
4169    }
4170 
4171    default:
4172       fs_nir_emit_intrinsic(ntb, bld, instr);
4173       break;
4174    }
4175 }
4176 
4177 static elk_fs_reg
4178 elk_nir_reduction_op_identity(const fs_builder &bld,
4179                               nir_op op, elk_reg_type type)
4180 {
4181    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
4182    switch (type_sz(type)) {
4183    case 1:
4184       if (type == ELK_REGISTER_TYPE_UB) {
4185          return elk_imm_uw(value.u8);
4186       } else {
4187          assert(type == ELK_REGISTER_TYPE_B);
4188          return elk_imm_w(value.i8);
4189       }
4190    case 2:
4191       return retype(elk_imm_uw(value.u16), type);
4192    case 4:
4193       return retype(elk_imm_ud(value.u32), type);
4194    case 8:
4195       if (type == ELK_REGISTER_TYPE_DF)
4196          return elk_setup_imm_df(bld, value.f64);
4197       else
4198          return retype(elk_imm_u64(value.u64), type);
4199    default:
4200       unreachable("Invalid type size");
4201    }
4202 }
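/* For reference, nir_alu_binop_identity() hands back the usual reduction
 * identities, e.g. iadd -> 0, imul -> 1, umin -> UINT_MAX, imax -> INT_MIN,
 * iand -> ~0, ior/ixor -> 0; the switch above only repackages that value as
 * an immediate of the matching register type.
 */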
4203 
4204 static elk_opcode
4205 elk_op_for_nir_reduction_op(nir_op op)
4206 {
4207    switch (op) {
4208    case nir_op_iadd: return ELK_OPCODE_ADD;
4209    case nir_op_fadd: return ELK_OPCODE_ADD;
4210    case nir_op_imul: return ELK_OPCODE_MUL;
4211    case nir_op_fmul: return ELK_OPCODE_MUL;
4212    case nir_op_imin: return ELK_OPCODE_SEL;
4213    case nir_op_umin: return ELK_OPCODE_SEL;
4214    case nir_op_fmin: return ELK_OPCODE_SEL;
4215    case nir_op_imax: return ELK_OPCODE_SEL;
4216    case nir_op_umax: return ELK_OPCODE_SEL;
4217    case nir_op_fmax: return ELK_OPCODE_SEL;
4218    case nir_op_iand: return ELK_OPCODE_AND;
4219    case nir_op_ior:  return ELK_OPCODE_OR;
4220    case nir_op_ixor: return ELK_OPCODE_XOR;
4221    default:
4222       unreachable("Invalid reduction operation");
4223    }
4224 }
4225 
4226 static elk_conditional_mod
4227 elk_cond_mod_for_nir_reduction_op(nir_op op)
4228 {
4229    switch (op) {
4230    case nir_op_iadd: return ELK_CONDITIONAL_NONE;
4231    case nir_op_fadd: return ELK_CONDITIONAL_NONE;
4232    case nir_op_imul: return ELK_CONDITIONAL_NONE;
4233    case nir_op_fmul: return ELK_CONDITIONAL_NONE;
4234    case nir_op_imin: return ELK_CONDITIONAL_L;
4235    case nir_op_umin: return ELK_CONDITIONAL_L;
4236    case nir_op_fmin: return ELK_CONDITIONAL_L;
4237    case nir_op_imax: return ELK_CONDITIONAL_GE;
4238    case nir_op_umax: return ELK_CONDITIONAL_GE;
4239    case nir_op_fmax: return ELK_CONDITIONAL_GE;
4240    case nir_op_iand: return ELK_CONDITIONAL_NONE;
4241    case nir_op_ior:  return ELK_CONDITIONAL_NONE;
4242    case nir_op_ixor: return ELK_CONDITIONAL_NONE;
4243    default:
4244       unreachable("Invalid reduction operation");
4245    }
4246 }
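/* These two helpers are used together: min/max reductions are emitted as a
 * SEL with a conditional modifier (SEL.L picks the smaller source, SEL.GE the
 * larger), while add/mul/and/or/xor use the plain ALU opcode with no
 * conditional modifier.
 */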
4247 
4248 struct rebuild_resource {
4249    unsigned idx;
4250    std::vector<nir_def *> array;
4251 };
4252 
4253 static bool
4254 add_rebuild_src(nir_src *src, void *state)
4255 {
4256    struct rebuild_resource *res = (struct rebuild_resource *) state;
4257 
4258    for (nir_def *def : res->array) {
4259       if (def == src->ssa)
4260          return true;
4261    }
4262 
4263    nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state);
4264    res->array.push_back(src->ssa);
4265    return true;
4266 }
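/* Note: the recursion above visits an SSA def's sources before pushing the
 * def itself, so the resulting array is topologically ordered (producers
 * before consumers).  try_rebuild_resource() below relies on that ordering
 * when it re-emits the defs one by one.
 */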
4267 
4268 static elk_fs_reg
4269 try_rebuild_resource(nir_to_elk_state &ntb, const elk::fs_builder &bld, nir_def *resource_def)
4270 {
4271    /* Create a builder at the location of the resource_intel intrinsic */
4272    fs_builder ubld8 = bld.exec_all().group(8, 0);
4273 
4274    struct rebuild_resource resources = {};
4275    resources.idx = 0;
4276 
4277    if (!nir_foreach_src(resource_def->parent_instr,
4278                         add_rebuild_src, &resources))
4279       return elk_fs_reg();
4280    resources.array.push_back(resource_def);
4281 
4282    if (resources.array.size() == 1) {
4283       nir_def *def = resources.array[0];
4284 
4285       if (def->parent_instr->type == nir_instr_type_load_const) {
4286          nir_load_const_instr *load_const =
4287             nir_instr_as_load_const(def->parent_instr);
4288          return elk_imm_ud(load_const->value[0].i32);
4289       } else {
4290          assert(def->parent_instr->type == nir_instr_type_intrinsic &&
4291                 (nir_instr_as_intrinsic(def->parent_instr)->intrinsic ==
4292                  nir_intrinsic_load_uniform));
4293          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
4294          unsigned base_offset = nir_intrinsic_base(intrin);
4295          unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4296          elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4297          src.offset = load_offset + base_offset % 4;
4298          return src;
4299       }
4300    }
4301 
4302    for (unsigned i = 0; i < resources.array.size(); i++) {
4303       nir_def *def = resources.array[i];
4304 
4305       nir_instr *instr = def->parent_instr;
4306       switch (instr->type) {
4307       case nir_instr_type_load_const: {
4308          nir_load_const_instr *load_const =
4309             nir_instr_as_load_const(instr);
4310          elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4311          ntb.resource_insts[def->index] =
4312             ubld8.MOV(dst, elk_imm_ud(load_const->value[0].i32));
4313          break;
4314       }
4315 
4316       case nir_instr_type_alu: {
4317          nir_alu_instr *alu = nir_instr_as_alu(instr);
4318 
4319          if (nir_op_infos[alu->op].num_inputs == 2) {
4320             if (alu->src[0].swizzle[0] != 0 ||
4321                 alu->src[1].swizzle[0] != 0)
4322                break;
4323          } else if (nir_op_infos[alu->op].num_inputs == 3) {
4324             if (alu->src[0].swizzle[0] != 0 ||
4325                 alu->src[1].swizzle[0] != 0 ||
4326                 alu->src[2].swizzle[0] != 0)
4327                break;
4328          } else {
4329             /* Unsupported ALU input count */
4330             break;
4331          }
4332 
4333          switch (alu->op) {
4334          case nir_op_iadd: {
4335             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4336             elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4337             elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4338             assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4339             assert(src0.type == ELK_REGISTER_TYPE_UD);
4340             ntb.resource_insts[def->index] =
4341                ubld8.ADD(dst,
4342                          src0.file != IMM ? src0 : src1,
4343                          src0.file != IMM ? src1 : src0);
4344             break;
4345          }
4346          case nir_op_ushr: {
4347             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4348             elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4349             elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4350             assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4351             assert(src0.type == ELK_REGISTER_TYPE_UD);
4352             ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
4353             break;
4354          }
4355          case nir_op_ishl: {
4356             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4357             elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4358             elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4359             assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4360             assert(src0.type == ELK_REGISTER_TYPE_UD);
4361             ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
4362             break;
4363          }
4364          case nir_op_mov: {
4365             break;
4366          }
4367          default:
4368             break;
4369          }
4370          break;
4371       }
4372 
4373       case nir_instr_type_intrinsic: {
4374          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4375          switch (intrin->intrinsic) {
4376          case nir_intrinsic_resource_intel:
4377             ntb.resource_insts[def->index] =
4378                ntb.resource_insts[intrin->src[1].ssa->index];
4379             break;
4380 
4381          case nir_intrinsic_load_uniform: {
4382             if (!nir_src_is_const(intrin->src[0]))
4383                break;
4384 
4385             unsigned base_offset = nir_intrinsic_base(intrin);
4386             unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4387             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4388             elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4389             src.offset = load_offset + base_offset % 4;
4390             ntb.resource_insts[def->index] = ubld8.MOV(dst, src);
4391             break;
4392          }
4393 
4394          default:
4395             break;
4396          }
4397          break;
4398       }
4399 
4400       default:
4401          break;
4402       }
4403 
4404       if (ntb.resource_insts[def->index] == NULL)
4405          return elk_fs_reg();
4406    }
4407 
4408    assert(ntb.resource_insts[resource_def->index] != NULL);
4409    return component(ntb.resource_insts[resource_def->index]->dst, 0);
4410 }
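/* Rough sketch of what the rebuild above handles: a descriptor chain such as
 *
 *    x = load_uniform(constant offset)
 *    y = iadd/ishl/ushr(x, load_const)
 *    r = resource_intel(..., y, ...)
 *
 * is re-emitted as uniform SIMD8 MOV/ADD/SHL/SHR instructions so the surface
 * index stays a scalar.  Anything it cannot rebuild yields a null register,
 * and the callers below fall back to bld.emit_uniformize() on the regular
 * per-channel value.
 */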
4411 
4412 static elk_fs_reg
4413 get_nir_image_intrinsic_image(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4414                               nir_intrinsic_instr *instr)
4415 {
4416    if (is_resource_src(instr->src[0])) {
4417       elk_fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]);
4418       if (surf_index.file != BAD_FILE)
4419          return surf_index;
4420    }
4421 
4422    elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD);
4423    elk_fs_reg surf_index = image;
4424 
4425    return bld.emit_uniformize(surf_index);
4426 }
4427 
4428 static elk_fs_reg
4429 get_nir_buffer_intrinsic_index(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4430                                nir_intrinsic_instr *instr)
4431 {
4432    /* SSBO stores are weird in that their index is in src[1] */
4433    const bool is_store =
4434       instr->intrinsic == nir_intrinsic_store_ssbo ||
4435       instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4436    nir_src src = is_store ? instr->src[1] : instr->src[0];
4437 
4438    if (nir_src_is_const(src)) {
4439       return elk_imm_ud(nir_src_as_uint(src));
4440    } else if (is_resource_src(src)) {
4441       elk_fs_reg surf_index = get_resource_nir_src(ntb, src);
4442       if (surf_index.file != BAD_FILE)
4443          return surf_index;
4444    }
4445    return bld.emit_uniformize(get_nir_src(ntb, src));
4446 }
4447 
4448 /**
4449  * The offsets we get from NIR act as if each SIMD channel has its own blob
4450  * of contiguous space.  However, if we actually place each SIMD channel in
4451  * its own space, we end up with terrible cache performance because each SIMD
4452  * channel accesses a different cache line even when they're all accessing the
4453  * same byte offset.  To deal with this problem, we swizzle the address using
4454  * a simple algorithm which ensures that any time a SIMD message reads or
4455  * writes the same address, it's all in the same cache line.  We have to keep
4456  * the bottom two bits fixed so that we can read/write up to a dword at a time
4457  * and the individual element is contiguous.  We do this by splitting the
4458  * address as follows:
4459  *
4460  *    31                             4-6           2          0
4461  *    +-------------------------------+------------+----------+
4462  *    |        Hi address bits        | chan index | addr low |
4463  *    +-------------------------------+------------+----------+
4464  *
4465  * In other words, the bottom two address bits stay, and the top 30 get
4466  * shifted up so that we can stick the SIMD channel index in the middle.  This
4467  * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
4468  * element at the same logical offset, the scratch read/write instruction
4469  * acts on contiguous elements and we get good cache locality.
4470  */
4471 static elk_fs_reg
4472 swizzle_nir_scratch_addr(nir_to_elk_state &ntb,
4473                          const elk::fs_builder &bld,
4474                          const elk_fs_reg &nir_addr,
4475                          bool in_dwords)
4476 {
4477    elk_fs_visitor &s = ntb.s;
4478 
4479    const elk_fs_reg &chan_index =
4480       ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4481    const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4482 
4483    elk_fs_reg addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4484    if (in_dwords) {
4485       /* In this case, we know the address is aligned to a DWORD and we want
4486        * the final address in DWORDs.
4487        */
4488       bld.SHL(addr, nir_addr, elk_imm_ud(chan_index_bits - 2));
4489       bld.OR(addr, addr, chan_index);
4490    } else {
4491       /* This case is substantially more annoying because we have to pay
4492        * attention to those pesky two bottom bits.
4493        */
4494       elk_fs_reg addr_hi = bld.vgrf(ELK_REGISTER_TYPE_UD);
4495       bld.AND(addr_hi, nir_addr, elk_imm_ud(~0x3u));
4496       bld.SHL(addr_hi, addr_hi, elk_imm_ud(chan_index_bits));
4497       elk_fs_reg chan_addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4498       bld.SHL(chan_addr, chan_index, elk_imm_ud(2));
4499       bld.AND(addr, nir_addr, elk_imm_ud(0x3u));
4500       bld.OR(addr, addr, addr_hi);
4501       bld.OR(addr, addr, chan_addr);
4502    }
4503    return addr;
4504 }
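/* In scalar terms the swizzle above computes (sketch, assuming dispatch_width
 * is a power of two and k = log2(dispatch_width)):
 *
 *    if (in_dwords)
 *       swizzled = (addr << (k - 2)) | chan;                       // dword units
 *    else
 *       swizzled = ((addr & ~3u) << k) | (chan << 2) | (addr & 3u);
 */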
4505 
4506 static unsigned
4507 choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
4508                                unsigned dwords)
4509 {
4510    unsigned block;
4511    if (devinfo->has_lsc && dwords >= 64) {
4512       block = 64;
4513    } else if (dwords >= 32) {
4514       block = 32;
4515    } else if (dwords >= 16) {
4516       block = 16;
4517    } else {
4518       block = 8;
4519    }
4520    assert(block <= dwords);
4521    return block;
4522 }
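/* The block-read loops below call this repeatedly on the remaining dword
 * count, so e.g. a 24-dword load (without LSC) is split into a 16-dword
 * block followed by an 8-dword block.
 */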
4523 
4524 static void
4525 increment_a64_address(const fs_builder &bld, elk_fs_reg address, uint32_t v)
4526 {
4527    if (bld.shader->devinfo->has_64bit_int) {
4528       bld.ADD(address, address, elk_imm_ud(v));
4529    } else {
4530       elk_fs_reg low = retype(address, ELK_REGISTER_TYPE_UD);
4531       elk_fs_reg high = offset(low, bld, 1);
4532 
4533       /* Add low and if that overflows, add carry to high. */
4534       bld.ADD(low, low, elk_imm_ud(v))->conditional_mod = ELK_CONDITIONAL_O;
4535       bld.ADD(high, high, elk_imm_ud(0x1))->predicate = ELK_PREDICATE_NORMAL;
4536    }
4537 }
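/* The no-64-bit-int path above is the usual add-with-carry idiom; roughly:
 *
 *    lo += v;                      // ADD sets the overflow flag (cond mod O)
 *    if (overflowed) hi += 1;      // predicated ADD consumes that flag
 */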
4538 
4539 static elk_fs_reg
4540 emit_fence(const fs_builder &bld, enum elk_opcode opcode,
4541            uint8_t sfid, uint32_t desc,
4542            bool commit_enable, uint8_t bti)
4543 {
4544    assert(opcode == ELK_SHADER_OPCODE_INTERLOCK ||
4545           opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
4546 
4547    elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
4548    elk_fs_inst *fence = bld.emit(opcode, dst, elk_vec8_grf(0, 0),
4549                              elk_imm_ud(commit_enable),
4550                              elk_imm_ud(bti));
4551    fence->sfid = sfid;
4552    fence->desc = desc;
4553 
4554    return dst;
4555 }
4556 
4557 /**
4558  * Create a MOV to read the timestamp register.
4559  */
4560 static elk_fs_reg
4561 get_timestamp(const fs_builder &bld)
4562 {
4563    elk_fs_visitor &s = *bld.shader;
4564    const intel_device_info *devinfo = s.devinfo;
4565 
4566    assert(devinfo->ver >= 7);
4567 
4568    elk_fs_reg ts = elk_fs_reg(retype(elk_vec4_reg(ELK_ARCHITECTURE_REGISTER_FILE,
4569                                           ELK_ARF_TIMESTAMP,
4570                                           0),
4571                              ELK_REGISTER_TYPE_UD));
4572 
4573    elk_fs_reg dst = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
4574 
4575    /* We want to read the 3 fields we care about even if they're not enabled
4576     * in the dispatch.
4577     */
4578    bld.group(4, 0).exec_all().MOV(dst, ts);
4579 
4580    return dst;
4581 }
4582 
4583 static void
4584 fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
4585                       const fs_builder &bld, nir_intrinsic_instr *instr)
4586 {
4587    const intel_device_info *devinfo = ntb.devinfo;
4588    elk_fs_visitor &s = ntb.s;
4589 
4590    /* We handle this as a special case */
4591    if (instr->intrinsic == nir_intrinsic_decl_reg) {
4592       assert(nir_intrinsic_num_array_elems(instr) == 0);
4593       unsigned bit_size = nir_intrinsic_bit_size(instr);
4594       unsigned num_components = nir_intrinsic_num_components(instr);
4595       const elk_reg_type reg_type =
4596          elk_reg_type_from_bit_size(bit_size, bit_size == 8 ?
4597                                               ELK_REGISTER_TYPE_D :
4598                                               ELK_REGISTER_TYPE_F);
4599 
4600       /* Re-use the destination's slot in the table for the register */
4601       ntb.ssa_values[instr->def.index] =
4602          bld.vgrf(reg_type, num_components);
4603       return;
4604    }
4605 
4606    elk_fs_reg dest;
4607    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4608       dest = get_nir_def(ntb, instr->def);
4609 
4610    switch (instr->intrinsic) {
4611    case nir_intrinsic_resource_intel:
4612       ntb.ssa_bind_infos[instr->def.index].valid = true;
4613       ntb.ssa_bind_infos[instr->def.index].bindless =
4614          (nir_intrinsic_resource_access_intel(instr) &
4615           nir_resource_intel_bindless) != 0;
4616       ntb.ssa_bind_infos[instr->def.index].block =
4617          nir_intrinsic_resource_block_intel(instr);
4618       ntb.ssa_bind_infos[instr->def.index].set =
4619          nir_intrinsic_desc_set(instr);
4620       ntb.ssa_bind_infos[instr->def.index].binding =
4621          nir_intrinsic_binding(instr);
4622 
4623       if (nir_intrinsic_resource_access_intel(instr) &
4624            nir_resource_intel_non_uniform) {
4625          ntb.resource_values[instr->def.index] = elk_fs_reg();
4626       } else {
4627          ntb.resource_values[instr->def.index] =
4628             try_rebuild_resource(ntb, bld, instr->src[1].ssa);
4629       }
4630       ntb.ssa_values[instr->def.index] =
4631          ntb.ssa_values[instr->src[1].ssa->index];
4632       break;
4633 
4634    case nir_intrinsic_load_reg:
4635    case nir_intrinsic_store_reg:
4636       /* Nothing to do with these. */
4637       break;
4638 
4639    case nir_intrinsic_image_load:
4640    case nir_intrinsic_image_store:
4641    case nir_intrinsic_image_atomic:
4642    case nir_intrinsic_image_atomic_swap:
4643    case nir_intrinsic_bindless_image_load:
4644    case nir_intrinsic_bindless_image_store:
4645    case nir_intrinsic_bindless_image_atomic:
4646    case nir_intrinsic_bindless_image_atomic_swap: {
4647       /* Get some metadata from the image intrinsic. */
4648       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
4649 
4650       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4651 
4652       switch (instr->intrinsic) {
4653       case nir_intrinsic_image_load:
4654       case nir_intrinsic_image_store:
4655       case nir_intrinsic_image_atomic:
4656       case nir_intrinsic_image_atomic_swap:
4657          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4658             get_nir_image_intrinsic_image(ntb, bld, instr);
4659          break;
4660 
4661       default:
4662          /* Bindless */
4663          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4664             get_nir_image_intrinsic_image(ntb, bld, instr);
4665          break;
4666       }
4667 
4668       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4669       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4670          elk_imm_ud(nir_image_intrinsic_coord_components(instr));
4671 
4672       /* Emit an image load, store or atomic op. */
4673       if (instr->intrinsic == nir_intrinsic_image_load ||
4674           instr->intrinsic == nir_intrinsic_bindless_image_load) {
4675          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4676          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4677          elk_fs_inst *inst =
4678             bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4679                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4680          inst->size_written = instr->num_components * s.dispatch_width * 4;
4681       } else if (instr->intrinsic == nir_intrinsic_image_store ||
4682                  instr->intrinsic == nir_intrinsic_bindless_image_store) {
4683          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4684          srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]);
4685          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4686          bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4687                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4688       } else {
4689          unsigned num_srcs = info->num_srcs;
4690          enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
4691          if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) {
4692             assert(num_srcs == 4);
4693             num_srcs = 3;
4694          }
4695 
4696          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
4697 
4698          elk_fs_reg data;
4699          if (num_srcs >= 4)
4700             data = get_nir_src(ntb, instr->src[3]);
4701          if (num_srcs >= 5) {
4702             elk_fs_reg tmp = bld.vgrf(data.type, 2);
4703             elk_fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) };
4704             bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4705             data = tmp;
4706          }
4707          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4708          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4709 
4710          bld.emit(ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4711                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4712       }
4713       break;
4714    }
4715 
4716    case nir_intrinsic_image_size:
4717    case nir_intrinsic_bindless_image_size: {
4718       /* Cube image sizes should have previously been lowered to a 2D array */
4719       assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
4720 
4721       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4722        * into will handle the binding table index for us in the generator.
4723        * Incidentally, this means that we can handle bindless with exactly the
4724        * same code.
4725        */
4726       elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]),
4727                             ELK_REGISTER_TYPE_UD);
4728       image = bld.emit_uniformize(image);
4729 
4730       assert(nir_src_as_uint(instr->src[1]) == 0);
4731 
4732       elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4733       if (instr->intrinsic == nir_intrinsic_image_size)
4734          srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4735       else
4736          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4737       srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_d(0);
4738       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(0);
4739       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
4740       srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
4741 
4742       /* Since the image size is always uniform, we can just emit a SIMD8
4743        * query instruction and splat the result out.
4744        */
4745       const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
4746 
4747       elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
4748       elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4749                                 tmp, srcs, ARRAY_SIZE(srcs));
4750       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
4751 
4752       for (unsigned c = 0; c < instr->def.num_components; ++c) {
4753          bld.MOV(offset(retype(dest, tmp.type), bld, c),
4754                  component(offset(tmp, ubld, c), 0));
4755       }
4756       break;
4757    }
4758 
4759    case nir_intrinsic_image_load_raw_intel: {
4760       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4761       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4762          get_nir_image_intrinsic_image(ntb, bld, instr);
4763       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4764       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4765       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4766       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4767 
4768       elk_fs_inst *inst =
4769          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4770                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4771       inst->size_written = instr->num_components * s.dispatch_width * 4;
4772       break;
4773    }
4774 
4775    case nir_intrinsic_image_store_raw_intel: {
4776       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4777       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4778          get_nir_image_intrinsic_image(ntb, bld, instr);
4779       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4780       srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]);
4781       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4782       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4783       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4784 
4785       bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4786                elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4787       break;
4788    }
4789 
4790    case nir_intrinsic_barrier:
4791    case nir_intrinsic_begin_invocation_interlock:
4792    case nir_intrinsic_end_invocation_interlock: {
4793       bool ugm_fence, slm_fence, tgm_fence, urb_fence;
4794       enum elk_opcode opcode = ELK_OPCODE_NOP;
4795 
4796       /* Handling interlock intrinsics here will allow the logic for IVB
4797        * render cache (see below) to be reused.
4798        */
4799 
4800       switch (instr->intrinsic) {
4801       case nir_intrinsic_barrier: {
4802          /* Note we only care about the memory part of the
4803           * barrier.  The execution part will be taken care
4804           * of by the stage specific intrinsic handler functions.
4805           */
4806          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
4807          ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
4808          slm_fence = modes & nir_var_mem_shared;
4809          tgm_fence = modes & nir_var_image;
4810          urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
4811          if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4812             opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4813          break;
4814       }
4815 
4816       case nir_intrinsic_begin_invocation_interlock:
4817          /* For beginInvocationInterlockARB(), we will generate a memory fence
4818           * but with a different opcode so that generator can pick SENDC
4819           * instead of SEND.
4820           */
4821          assert(s.stage == MESA_SHADER_FRAGMENT);
4822          ugm_fence = tgm_fence = true;
4823          slm_fence = urb_fence = false;
4824          opcode = ELK_SHADER_OPCODE_INTERLOCK;
4825          break;
4826 
4827       case nir_intrinsic_end_invocation_interlock:
4828          /* For endInvocationInterlockARB(), we need to insert a memory fence which
4829           * stalls in the shader until the memory transactions prior to that
4830           * fence are complete.  This ensures that the shader does not end before
4831           * any writes from its critical section have landed.  Otherwise, you can
4832           * end up with a case where the next invocation on that pixel properly
4833           * stalls for previous FS invocation on its pixel to complete but
4834           * doesn't actually wait for the dataport memory transactions from that
4835           * thread to land before submitting its own.
4836           */
4837          assert(s.stage == MESA_SHADER_FRAGMENT);
4838          ugm_fence = tgm_fence = true;
4839          slm_fence = urb_fence = false;
4840          opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4841          break;
4842 
4843       default:
4844          unreachable("invalid intrinsic");
4845       }
4846 
4847       if (opcode == ELK_OPCODE_NOP)
4848          break;
4849 
4850       if (s.nir->info.shared_size > 0) {
4851          assert(gl_shader_stage_uses_workgroup(s.stage));
4852       } else {
4853          slm_fence = false;
4854       }
4855 
4856       /* If the workgroup fits in a single HW thread, the messages for SLM are
4857        * processed in-order and the shader itself is already synchronized so
4858        * the memory fence is not necessary.
4859        *
4860        * TODO: Check whether this applies when many HW threads share the same Data Port.
4861        */
4862       if (!s.nir->info.workgroup_size_variable &&
4863           slm_fence && s.workgroup_size() <= s.dispatch_width)
4864          slm_fence = false;
4865 
4866       switch (s.stage) {
4867          case MESA_SHADER_TESS_CTRL:
4868             break;
4869          default:
4870             urb_fence = false;
4871             break;
4872       }
4873 
4874       unsigned fence_regs_count = 0;
4875       elk_fs_reg fence_regs[4] = {};
4876 
4877       const fs_builder ubld = bld.group(8, 0);
4878 
4879       /* Prior to Icelake, they're all lumped into a single cache except on
4880        * Ivy Bridge and Bay Trail where typed messages actually go through
4881        * the render cache.  There, we need both fences because we may
4882        * access storage images as either typed or untyped.
4883        */
4884       const bool render_fence = tgm_fence && devinfo->verx10 == 70;
4885 
4886       const bool commit_enable = render_fence ||
4887          instr->intrinsic == nir_intrinsic_end_invocation_interlock;
4888 
4889       if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
4890          fence_regs[fence_regs_count++] =
4891             emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
4892                        commit_enable, 0 /* BTI */);
4893       }
4894 
4895       if (render_fence) {
4896          fence_regs[fence_regs_count++] =
4897             emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
4898                        commit_enable, /* bti */ 0);
4899       }
4900 
4901       assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
4902 
4903       /* There are three cases where we want to insert a stall:
4904        *
4905        *  1. If we're a nir_intrinsic_end_invocation_interlock.  This is
4906        *     required to ensure that the shader EOT doesn't happen until
4907        *     after the fence returns.  Otherwise, we might end up with the
4908        *     next shader invocation for that pixel not respecting our fence
4909        *     because it may happen on a different HW thread.
4910        *
4911        *  2. If we have multiple fences.  This is required to ensure that
4912        *     they all complete and nothing gets weirdly out-of-order.
4913        *
4914        *  3. If we have no fences.  In this case, we need at least a
4915        *     scheduling barrier to keep the compiler from moving things
4916        *     around in an invalid way.
4917        */
4918       if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
4919           fence_regs_count != 1) {
4920          ubld.exec_all().group(1, 0).emit(
4921             ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
4922             fence_regs, fence_regs_count);
4923       }
4924 
4925       break;
4926    }
4927 
4928    case nir_intrinsic_shader_clock: {
4929       /* We cannot do anything if there is an event, so ignore it for now */
4930       const elk_fs_reg shader_clock = get_timestamp(bld);
4931       const elk_fs_reg srcs[] = { component(shader_clock, 0),
4932                               component(shader_clock, 1) };
4933       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4934       break;
4935    }
4936 
4937    case nir_intrinsic_load_reloc_const_intel: {
4938       uint32_t id = nir_intrinsic_param_idx(instr);
4939 
4940       /* Emit the reloc in the smallest SIMD size to limit register usage. */
4941       const fs_builder ubld = bld.exec_all().group(1, 0);
4942       elk_fs_reg small_dest = ubld.vgrf(dest.type);
4943       ubld.UNDEF(small_dest);
4944       ubld.exec_all().group(1, 0).emit(ELK_SHADER_OPCODE_MOV_RELOC_IMM,
4945                                        small_dest, elk_imm_ud(id));
4946 
4947       /* Copy propagation will get rid of this MOV. */
4948       bld.MOV(dest, component(small_dest, 0));
4949       break;
4950    }
4951 
4952    case nir_intrinsic_load_uniform: {
4953       /* Offsets are in bytes but they should always be aligned to
4954        * the type size.
4955        */
4956       unsigned base_offset = nir_intrinsic_base(instr);
4957       assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0);
4958 
4959       elk_fs_reg src(UNIFORM, base_offset / 4, dest.type);
4960 
4961       if (nir_src_is_const(instr->src[0])) {
4962          unsigned load_offset = nir_src_as_uint(instr->src[0]);
4963          assert(load_offset % type_sz(dest.type) == 0);
4964          /* The base offset can only handle 32-bit units, so for 16-bit
4965           * data take the modulo of the offset with 4 bytes and add it to
4966           * the offset to read from within the source register.
4967           */
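          /* For example (hypothetical values): base_offset = 6 with 16-bit
           * data lands in UNIFORM slot 6 / 4 = 1, and the remaining 2 bytes
           * go into src.offset below.
           */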
4968          src.offset = load_offset + base_offset % 4;
4969 
4970          for (unsigned j = 0; j < instr->num_components; j++) {
4971             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4972          }
4973       } else {
4974          elk_fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
4975                                   ELK_REGISTER_TYPE_UD);
4976 
4977          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4978           * go past the end of the uniform.  In order to keep the n'th
4979           * component from running past, we subtract off the size of all but
4980           * one component of the vector.
4981           */
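         /* Worked example (hypothetical numbers): a vec4 of 32-bit values with
          * range == 16 gives read_size = 16 - 3 * 4 = 4, so the indirect read
          * for the last component still stays inside the uniform range.
          */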
4982          assert(nir_intrinsic_range(instr) >=
4983                 instr->num_components * type_sz(dest.type));
4984          unsigned read_size = nir_intrinsic_range(instr) -
4985             (instr->num_components - 1) * type_sz(dest.type);
4986 
4987          bool supports_64bit_indirects = devinfo->platform != INTEL_PLATFORM_CHV;
4988 
4989          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4990             for (unsigned j = 0; j < instr->num_components; j++) {
4991                bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
4992                         offset(dest, bld, j), offset(src, bld, j),
4993                         indirect, elk_imm_ud(read_size));
4994             }
4995          } else {
4996             const unsigned num_mov_indirects =
4997                type_sz(dest.type) / type_sz(ELK_REGISTER_TYPE_UD);
4998             /* We read a little bit less per MOV INDIRECT, as they are now
4999              * 32-bit ones instead of 64-bit.  Adjust read_size accordingly.
5000              */
5001             const unsigned read_size_32bit = read_size -
5002                 (num_mov_indirects - 1) * type_sz(ELK_REGISTER_TYPE_UD);
5003             for (unsigned j = 0; j < instr->num_components; j++) {
5004                for (unsigned i = 0; i < num_mov_indirects; i++) {
5005                   bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5006                            subscript(offset(dest, bld, j), ELK_REGISTER_TYPE_UD, i),
5007                            subscript(offset(src, bld, j), ELK_REGISTER_TYPE_UD, i),
5008                            indirect, elk_imm_ud(read_size_32bit));
5009                }
5010             }
5011          }
5012       }
5013       break;
5014    }
5015 
5016    case nir_intrinsic_load_ubo:
5017    case nir_intrinsic_load_ubo_uniform_block_intel: {
5018       elk_fs_reg surface, surface_handle;
5019 
5020       if (get_nir_src_bindless(ntb, instr->src[0]))
5021          surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5022       else
5023          surface = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5024 
5025       if (!nir_src_is_const(instr->src[1])) {
5026          if (instr->intrinsic == nir_intrinsic_load_ubo) {
5027             /* load_ubo with non-uniform offset */
5028             elk_fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
5029                                         ELK_REGISTER_TYPE_UD);
5030 
5031             const unsigned comps_per_load = type_sz(dest.type) == 8 ? 2 : 4;
5032 
5033             for (int i = 0; i < instr->num_components; i += comps_per_load) {
5034                const unsigned remaining = instr->num_components - i;
5035                s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
5036                                             surface, surface_handle,
5037                                             base_offset,
5038                                             i * type_sz(dest.type),
5039                                             instr->def.bit_size / 8,
5040                                             MIN2(remaining, comps_per_load));
5041             }
5042 
5043             s.prog_data->has_ubo_pull = true;
5044          } else {
5045             /* load_ubo with uniform offset */
5046             const fs_builder ubld1 = bld.exec_all().group(1, 0);
5047             const fs_builder ubld8 = bld.exec_all().group(8, 0);
5048             const fs_builder ubld16 = bld.exec_all().group(16, 0);
5049 
5050             elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5051 
5052             srcs[SURFACE_LOGICAL_SRC_SURFACE]        = surface;
5053             srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
5054 
5055             const nir_src load_offset = instr->src[1];
5056             if (nir_src_is_const(load_offset)) {
5057                elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5058                ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5059                srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5060             } else {
5061                srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5062                   bld.emit_uniformize(get_nir_src(ntb, load_offset));
5063             }
5064 
5065             const unsigned total_dwords =
5066                ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
5067             unsigned loaded_dwords = 0;
5068 
5069             const elk_fs_reg packed_consts =
5070                ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5071 
5072             while (loaded_dwords < total_dwords) {
5073                const unsigned block =
5074                   choose_oword_block_size_dwords(devinfo,
5075                                                  total_dwords - loaded_dwords);
5076                const unsigned block_bytes = block * 4;
5077 
5078                srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5079 
5080                const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5081                ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5082                          retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5083                          srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5084                   align(block_bytes, REG_SIZE * reg_unit(devinfo));
5085 
5086                loaded_dwords += block;
5087 
5088                ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5089                          srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5090                          elk_imm_ud(block_bytes));
5091             }
5092 
5093             for (unsigned c = 0; c < instr->num_components; c++) {
5094                bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5095                        component(packed_consts, c));
5096             }
5097 
5098             s.prog_data->has_ubo_pull = true;
5099          }
5100       } else {
5101          /* Even if we are loading doubles, a pull constant load will load
5102           * a 32-bit vec4, so we should only reserve vgrf space for that. If we
5103           * need to load a full dvec4 we will have to emit 2 loads. This is
5104           * similar to demote_pull_constants(), except that in that case we
5105           * see individual accesses to each component of the vector and then
5106           * we let CSE deal with duplicate loads. Here we see a vector access
5107           * and we have to split it if necessary.
5108           */
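         /* For instance (hypothetical numbers): a 32-bit vec4 at load_offset 56
          * straddles a 64-byte cacheline, so the loop below issues two pulls:
          * count = 2 components from the first cacheline and count = 2 from
          * the next one.
          */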
5109          const unsigned type_size = type_sz(dest.type);
5110          const unsigned load_offset = nir_src_as_uint(instr->src[1]);
5111          const unsigned ubo_block =
5112             elk_nir_ubo_surface_index_get_push_block(instr->src[0]);
5113          const unsigned offset_256b = load_offset / 32;
5114          const unsigned end_256b =
5115             DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32);
5116 
5117          /* See if we've selected this as a push constant candidate */
5118          elk_fs_reg push_reg;
5119          for (int i = 0; i < 4; i++) {
5120             const struct elk_ubo_range *range = &s.prog_data->ubo_ranges[i];
5121             if (range->block == ubo_block &&
5122                 offset_256b >= range->start &&
5123                 end_256b <= range->start + range->length) {
5124 
5125                push_reg = elk_fs_reg(UNIFORM, UBO_START + i, dest.type);
5126                push_reg.offset = load_offset - 32 * range->start;
5127                break;
5128             }
5129          }
5130 
5131          if (push_reg.file != BAD_FILE) {
5132             for (unsigned i = 0; i < instr->num_components; i++) {
5133                bld.MOV(offset(dest, bld, i),
5134                        byte_offset(push_reg, i * type_size));
5135             }
5136             break;
5137          }
5138 
5139          s.prog_data->has_ubo_pull = true;
5140 
5141          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
5142          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
5143 
5144          for (unsigned c = 0; c < instr->num_components;) {
5145             const unsigned base = load_offset + c * type_size;
5146             /* Number of usable components in the next block-aligned load. */
5147             const unsigned count = MIN2(instr->num_components - c,
5148                                         (block_sz - base % block_sz) / type_size);
5149 
5150             const elk_fs_reg packed_consts = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5151             elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
5152             srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE]        = surface;
5153             srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
5154             srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]         = elk_imm_ud(base & ~(block_sz - 1));
5155             srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]           = elk_imm_ud(block_sz);
5156 
5157             ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
5158                       srcs, PULL_UNIFORM_CONSTANT_SRCS);
5159 
5160             const elk_fs_reg consts =
5161                retype(byte_offset(packed_consts, base & (block_sz - 1)),
5162                       dest.type);
5163 
5164             for (unsigned d = 0; d < count; d++)
5165                bld.MOV(offset(dest, bld, c + d), component(consts, d));
5166 
5167             c += count;
5168          }
5169       }
5170       break;
5171    }
5172 
5173    case nir_intrinsic_load_global:
5174    case nir_intrinsic_load_global_constant: {
5175       assert(devinfo->ver >= 8);
5176 
5177       assert(instr->def.bit_size <= 32);
5178       assert(nir_intrinsic_align(instr) > 0);
5179       elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5180       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]);
5181       srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5182       srcs[A64_LOGICAL_ENABLE_HELPERS] =
5183          elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5184 
5185       if (instr->def.bit_size == 32 &&
5186           nir_intrinsic_align(instr) >= 4) {
5187          assert(instr->def.num_components <= 4);
5188 
5189          srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5190 
5191          elk_fs_inst *inst =
5192             bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
5193                      srcs, A64_LOGICAL_NUM_SRCS);
5194          inst->size_written = instr->num_components *
5195                               inst->dst.component_size(inst->exec_size);
5196       } else {
5197          const unsigned bit_size = instr->def.bit_size;
5198          assert(instr->def.num_components == 1);
5199          elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5200 
5201          srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5202 
5203          bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
5204                   srcs, A64_LOGICAL_NUM_SRCS);
5205          bld.MOV(dest, subscript(tmp, dest.type, 0));
5206       }
5207       break;
5208    }
5209 
5210    case nir_intrinsic_store_global: {
5211       assert(devinfo->ver >= 8);
5212 
5213       assert(nir_src_bit_size(instr->src[0]) <= 32);
5214       assert(nir_intrinsic_write_mask(instr) ==
5215              (1u << instr->num_components) - 1);
5216       assert(nir_intrinsic_align(instr) > 0);
5217 
5218       elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5219       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5220       srcs[A64_LOGICAL_ENABLE_HELPERS] =
5221          elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5222 
5223       if (nir_src_bit_size(instr->src[0]) == 32 &&
5224           nir_intrinsic_align(instr) >= 4) {
5225          assert(nir_src_num_components(instr->src[0]) <= 4);
5226 
5227          srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */
5228          srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5229 
5230          bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, elk_fs_reg(),
5231                   srcs, A64_LOGICAL_NUM_SRCS);
5232       } else {
5233          assert(nir_src_num_components(instr->src[0]) == 1);
5234          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5235          elk_reg_type data_type =
5236             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5237          elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5238          bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type));
5239 
5240          srcs[A64_LOGICAL_SRC] = tmp;
5241          srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5242 
5243          bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, elk_fs_reg(),
5244                   srcs, A64_LOGICAL_NUM_SRCS);
5245       }
5246       break;
5247    }
5248 
5249    case nir_intrinsic_global_atomic:
5250    case nir_intrinsic_global_atomic_swap:
5251       fs_nir_emit_global_atomic(ntb, bld, instr);
5252       break;
5253 
5254    case nir_intrinsic_load_global_constant_uniform_block_intel: {
5255       const unsigned total_dwords = ALIGN(instr->num_components,
5256                                           REG_SIZE * reg_unit(devinfo) / 4);
5257       unsigned loaded_dwords = 0;
5258 
5259       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5260       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5261       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5262 
5263       const elk_fs_reg packed_consts =
5264          ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5265       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
5266 
5267       while (loaded_dwords < total_dwords) {
5268          const unsigned block =
5269             choose_oword_block_size_dwords(devinfo,
5270                                            total_dwords - loaded_dwords);
5271          const unsigned block_bytes = block * 4;
5272 
5273          const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5274 
5275          elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5276          srcs[A64_LOGICAL_ADDRESS] = address;
5277          srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5278          srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
5279          srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
5280          ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5281                    retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5282                    srcs, A64_LOGICAL_NUM_SRCS)->size_written =
5283             align(block_bytes, REG_SIZE * reg_unit(devinfo));
5284 
5285          increment_a64_address(ubld1, address, block_bytes);
5286          loaded_dwords += block;
5287       }
5288 
5289       for (unsigned c = 0; c < instr->num_components; c++)
5290          bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5291                  component(packed_consts, c));
5292 
5293       break;
5294    }
5295 
5296    case nir_intrinsic_load_ssbo: {
5297       assert(devinfo->ver >= 7);
5298 
5299       const unsigned bit_size = instr->def.bit_size;
5300       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5301       srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5302            SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5303            SURFACE_LOGICAL_SRC_SURFACE] =
5304          get_nir_buffer_intrinsic_index(ntb, bld, instr);
5305       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5306       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5307       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5308 
5309       /* Make dest unsigned because that's what the temporary will be */
5310       dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5311 
5312       /* Read the vector */
5313       assert(bit_size <= 32);
5314       assert(nir_intrinsic_align(instr) > 0);
5315       if (bit_size == 32 &&
5316           nir_intrinsic_align(instr) >= 4) {
5317          assert(instr->def.num_components <= 4);
5318          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5319          elk_fs_inst *inst =
5320             bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5321                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5322          inst->size_written = instr->num_components * s.dispatch_width * 4;
5323       } else {
5324          assert(instr->def.num_components == 1);
5325          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5326 
5327          elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5328          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5329                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5330          bld.MOV(dest, subscript(read_result, dest.type, 0));
5331       }
5332       break;
5333    }
5334 
5335    case nir_intrinsic_store_ssbo: {
5336       assert(devinfo->ver >= 7);
5337 
5338       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5339       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5340       srcs[get_nir_src_bindless(ntb, instr->src[1]) ?
5341            SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5342            SURFACE_LOGICAL_SRC_SURFACE] =
5343          get_nir_buffer_intrinsic_index(ntb, bld, instr);
5344       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]);
5345       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5346       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
5347 
5348       elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5349       data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5350 
5351       assert(bit_size <= 32);
5352       assert(nir_intrinsic_write_mask(instr) ==
5353              (1u << instr->num_components) - 1);
5354       assert(nir_intrinsic_align(instr) > 0);
5355       if (bit_size == 32 &&
5356           nir_intrinsic_align(instr) >= 4) {
5357          assert(nir_src_num_components(instr->src[0]) <= 4);
5358          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5359          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5360          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5361                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5362       } else {
5363          assert(nir_src_num_components(instr->src[0]) == 1);
5364          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5365 
5366          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5367          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5368 
5369          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5370                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5371       }
5372       break;
5373    }
5374 
5375    case nir_intrinsic_load_ssbo_uniform_block_intel:
5376    case nir_intrinsic_load_shared_uniform_block_intel: {
5377       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5378 
5379       const bool is_ssbo =
5380          instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
5381       if (is_ssbo) {
5382          srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5383               SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5384               SURFACE_LOGICAL_SRC_SURFACE] =
5385             get_nir_buffer_intrinsic_index(ntb, bld, instr);
5386       } else {
5387          srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
5388       }
5389 
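      /* The block reads below return whole registers, so round the dword
       * count up to a register multiple and pull the data in uniform
       * OWord-sized blocks.
       */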
5390       const unsigned total_dwords = ALIGN(instr->num_components,
5391                                           REG_SIZE * reg_unit(devinfo) / 4);
5392       unsigned loaded_dwords = 0;
5393 
5394       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5395       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5396       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5397 
5398       const elk_fs_reg packed_consts =
5399          ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5400 
5401       const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
5402       if (nir_src_is_const(load_offset)) {
5403          elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5404          ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5405          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5406       } else {
5407          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5408             bld.emit_uniformize(get_nir_src(ntb, load_offset));
5409       }
5410 
5411       while (loaded_dwords < total_dwords) {
5412          const unsigned block =
5413             choose_oword_block_size_dwords(devinfo,
5414                                            total_dwords - loaded_dwords);
5415          const unsigned block_bytes = block * 4;
5416 
5417          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5418 
5419          const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5420          ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5421                    retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5422                    srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5423             align(block_bytes, REG_SIZE * reg_unit(devinfo));
5424 
5425          loaded_dwords += block;
5426 
5427          ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5428                    srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5429                    elk_imm_ud(block_bytes));
5430       }
5431 
5432       for (unsigned c = 0; c < instr->num_components; c++)
5433          bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5434                  component(packed_consts, c));
5435 
5436       break;
5437    }
5438 
5439    case nir_intrinsic_store_output: {
5440       assert(nir_src_bit_size(instr->src[0]) == 32);
5441       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5442 
5443       unsigned store_offset = nir_src_as_uint(instr->src[1]);
5444       unsigned num_components = instr->num_components;
5445       unsigned first_component = nir_intrinsic_component(instr);
5446 
5447       elk_fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
5448                                       4 * store_offset), src.type);
5449       for (unsigned j = 0; j < num_components; j++) {
5450          bld.MOV(offset(new_dest, bld, j + first_component),
5451                  offset(src, bld, j));
5452       }
5453       break;
5454    }
5455 
5456    case nir_intrinsic_ssbo_atomic:
5457    case nir_intrinsic_ssbo_atomic_swap:
5458       fs_nir_emit_surface_atomic(ntb, bld, instr,
5459                                  get_nir_buffer_intrinsic_index(ntb, bld, instr),
5460                                  get_nir_src_bindless(ntb, instr->src[0]));
5461       break;
5462 
5463    case nir_intrinsic_get_ssbo_size: {
5464       assert(nir_src_num_components(instr->src[0]) == 1);
5465 
5466       /* A resinfo's sampler message is used to get the buffer size.  The
5467        * SIMD8's writeback message consists of four registers and SIMD16's
5468        * writeback message consists of 8 destination registers (two per each
5469        * component).  Because we are only interested in the first channel of
5470        * the first returned component, where resinfo returns the buffer size
5471        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
5472        * the dispatch width.
5473        */
5474       const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
5475       elk_fs_reg src_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5476       elk_fs_reg ret_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
5477 
5478       /* Set LOD = 0 */
5479       ubld.MOV(src_payload, elk_imm_d(0));
5480 
5481       elk_fs_reg srcs[GET_BUFFER_SIZE_SRCS];
5482       srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5483            GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
5484            GET_BUFFER_SIZE_SRC_SURFACE] =
5485          get_nir_buffer_intrinsic_index(ntb, bld, instr);
5486       srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
5487       elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
5488                                 srcs, GET_BUFFER_SIZE_SRCS);
5489       inst->header_size = 0;
5490       inst->mlen = reg_unit(devinfo);
5491       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5492 
5493       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
5494        *
5495        * "Out-of-bounds checking is always performed at a DWord granularity. If
5496        * any part of the DWord is out-of-bounds then the whole DWord is
5497        * considered out-of-bounds."
5498        *
5499        * This implies that types smaller than 4 bytes need to be padded if
5500        * they don't complete the last dword of the buffer. But since we need
5501        * to maintain the original size to know the number of elements of an
5502        * unsized array, we have to reverse the padding calculation. The
5503        * needed padding was stored in the last two bits of the surface size,
5504        * so here we recover the original buffer_size by reversing the
5505        * surface_size calculation:
5506        *
5507        * surface_size = isl_align(buffer_size, 4) +
5508        *                (isl_align(buffer_size, 4) - buffer_size)
5509        *
5510        * buffer_size = (surface_size & ~3) - (surface_size & 3)
5511        */
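      /* For example, buffer_size = 6 gives surface_size = 8 + (8 - 6) = 10;
       * reversing it, (10 & ~3) - (10 & 3) = 8 - 2 = 6, the original size.
       */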
5512 
5513       elk_fs_reg size_aligned4 = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5514       elk_fs_reg size_padding = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5515       elk_fs_reg buffer_size = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5516 
5517       ubld.AND(size_padding, ret_payload, elk_imm_ud(3));
5518       ubld.AND(size_aligned4, ret_payload, elk_imm_ud(~3));
5519       ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
5520 
5521       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
5522       break;
5523    }
5524 
5525    case nir_intrinsic_load_scratch: {
5526       assert(devinfo->ver >= 7);
5527 
5528       assert(instr->def.num_components == 1);
5529       const unsigned bit_size = instr->def.bit_size;
5530       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5531 
5532       if (devinfo->ver >= 8) {
5533          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5534             elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5535       } else {
5536          srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5537       }
5538 
5539       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5540       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5541       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5542       const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[0]);
5543 
5544       /* Make dest unsigned because that's what the temporary will be */
5545       dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5546 
5547       /* Read the vector */
5548       assert(instr->def.num_components == 1);
5549       assert(bit_size <= 32);
5550       assert(nir_intrinsic_align(instr) > 0);
5551       if (bit_size == 32 &&
5552           nir_intrinsic_align(instr) >= 4) {
5553          /* The offset for a DWORD scattered message is in dwords. */
5554          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5555             swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5556 
5557          bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
5558                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5559       } else {
5560          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5561             swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5562 
5563          elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5564          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5565                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5566          bld.MOV(dest, read_result);
5567       }
5568 
5569       s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5570       break;
5571    }
5572 
5573    case nir_intrinsic_store_scratch: {
5574       assert(devinfo->ver >= 7);
5575 
5576       assert(nir_src_num_components(instr->src[0]) == 1);
5577       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5578       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5579 
5580       if (devinfo->ver >= 8) {
5581          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5582             elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5583       } else {
5584          srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5585       }
5586 
5587       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5588       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5589       /**
5590        * While this instruction has side-effects, it should not be predicated
5591        * on sample mask, because otherwise fs helper invocations would
5592        * load undefined values from scratch memory. And scratch memory
5593        * load-stores are produced from operations without side-effects, thus
5594        * they should not have different behaviour in the helper invocations.
5595        */
5596       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5597       const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[1]);
5598 
5599       elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5600       data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5601 
5602       assert(nir_src_num_components(instr->src[0]) == 1);
5603       assert(bit_size <= 32);
5604       assert(nir_intrinsic_write_mask(instr) == 1);
5605       assert(nir_intrinsic_align(instr) > 0);
5606       if (bit_size == 32 &&
5607           nir_intrinsic_align(instr) >= 4) {
5608          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5609 
5610          /* The offset for a DWORD scattered message is in dwords. */
5611          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5612             swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5613 
5614          bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
5615                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5616       } else {
5617          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5618          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5619 
5620          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5621             swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5622 
5623          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5624                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5625       }
5626       s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5627       break;
5628    }
5629 
5630    case nir_intrinsic_load_subgroup_size:
5631       /* This should only happen for fragment shaders because every other case
5632        * is lowered in NIR so we can optimize on it.
5633        */
5634       assert(s.stage == MESA_SHADER_FRAGMENT);
5635       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), elk_imm_d(s.dispatch_width));
5636       break;
5637 
5638    case nir_intrinsic_load_subgroup_invocation:
5639       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
5640               ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
5641       break;
5642 
5643    case nir_intrinsic_load_subgroup_eq_mask:
5644    case nir_intrinsic_load_subgroup_ge_mask:
5645    case nir_intrinsic_load_subgroup_gt_mask:
5646    case nir_intrinsic_load_subgroup_le_mask:
5647    case nir_intrinsic_load_subgroup_lt_mask:
5648       unreachable("not reached");
5649 
5650    case nir_intrinsic_vote_any: {
5651       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5652 
5653       /* The any/all predicates do not consider channel enables. To prevent
5654        * dead channels from affecting the result, we initialize the flag
5655        * with the identity value for the logical operation.
5656        */
5657       if (s.dispatch_width == 32) {
5658          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5659          ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5660                    elk_imm_ud(0));
5661       } else {
5662          ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0));
5663       }
5664       bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5665 
5666       /* For some reason, the any/all predicates don't work properly with
5667        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5668        * doesn't read the correct subset of the flag register and you end up
5669        * getting garbage in the second half.  Work around this by using a pair
5670        * of 1-wide MOVs and scattering the result.
5671        */
5672       const fs_builder ubld = ubld1;
5673       elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5674       ubld.MOV(res1, elk_imm_d(0));
5675       set_predicate(s.dispatch_width == 8  ? ELK_PREDICATE_ALIGN1_ANY8H :
5676                     s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ANY16H :
5677                                              ELK_PREDICATE_ALIGN1_ANY32H,
5678                     ubld.MOV(res1, elk_imm_d(-1)));
5679 
5680       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5681       break;
5682    }
5683    case nir_intrinsic_vote_all: {
5684       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5685 
5686       /* The any/all predicates do not consider channel enables. To prevent
5687        * dead channels from affecting the result, we initialize the flag
5688        * with the identity value for the logical operation.
5689        */
5690       if (s.dispatch_width == 32) {
5691          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5692          ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5693                    elk_imm_ud(0xffffffff));
5694       } else {
5695          ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5696       }
5697       bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5698 
5699       /* For some reason, the any/all predicates don't work properly with
5700        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5701        * doesn't read the correct subset of the flag register and you end up
5702        * getting garbage in the second half.  Work around this by using a pair
5703        * of 1-wide MOVs and scattering the result.
5704        */
5705       const fs_builder ubld = ubld1;
5706       elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5707       ubld.MOV(res1, elk_imm_d(0));
5708       set_predicate(s.dispatch_width == 8  ? ELK_PREDICATE_ALIGN1_ALL8H :
5709                     s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5710                                              ELK_PREDICATE_ALIGN1_ALL32H,
5711                     ubld.MOV(res1, elk_imm_d(-1)));
5712 
5713       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5714       break;
5715    }
5716    case nir_intrinsic_vote_feq:
5717    case nir_intrinsic_vote_ieq: {
5718       elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5719       if (instr->intrinsic == nir_intrinsic_vote_feq) {
5720          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5721          value.type = bit_size == 8 ? ELK_REGISTER_TYPE_B :
5722             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_F);
5723       }
5724 
5725       elk_fs_reg uniformized = bld.emit_uniformize(value);
5726       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5727 
5728       /* The any/all predicates do not consider channel enables. To prevent
5729        * dead channels from affecting the result, we initialize the flag
5730        * with the identity value for the logical operation.
5731        */
5732       if (s.dispatch_width == 32) {
5733          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5734          ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5735                          elk_imm_ud(0xffffffff));
5736       } else {
5737          ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5738       }
5739       bld.CMP(bld.null_reg_d(), value, uniformized, ELK_CONDITIONAL_Z);
5740 
5741       /* For some reason, the any/all predicates don't work properly with
5742        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5743        * doesn't read the correct subset of the flag register and you end up
5744        * getting garbage in the second half.  Work around this by using a pair
5745        * of 1-wide MOVs and scattering the result.
5746        */
5747       const fs_builder ubld = ubld1;
5748       elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5749       ubld.MOV(res1, elk_imm_d(0));
5750       set_predicate(s.dispatch_width == 8  ? ELK_PREDICATE_ALIGN1_ALL8H :
5751                     s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5752                                              ELK_PREDICATE_ALIGN1_ALL32H,
5753                     ubld.MOV(res1, elk_imm_d(-1)));
5754 
5755       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5756       break;
5757    }
5758 
5759    case nir_intrinsic_ballot: {
5760       const elk_fs_reg value = retype(get_nir_src(ntb, instr->src[0]),
5761                                   ELK_REGISTER_TYPE_UD);
5762       struct elk_reg flag = elk_flag_reg(0, 0);
5763       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
5764        * as f0.0.  This is a problem for fragment programs as we currently use
5765        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
5766        * programs yet so this isn't a problem.  When we do, something will
5767        * have to change.
5768        */
5769       if (s.dispatch_width == 32)
5770          flag.type = ELK_REGISTER_TYPE_UD;
5771 
5772       bld.exec_all().group(1, 0).MOV(flag, elk_imm_ud(0u));
5773       bld.CMP(bld.null_reg_ud(), value, elk_imm_ud(0u), ELK_CONDITIONAL_NZ);
5774 
5775       if (instr->def.bit_size > 32) {
5776          dest.type = ELK_REGISTER_TYPE_UQ;
5777       } else {
5778          dest.type = ELK_REGISTER_TYPE_UD;
5779       }
5780       bld.MOV(dest, flag);
5781       break;
5782    }
5783 
5784    case nir_intrinsic_read_invocation: {
5785       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5786       const elk_fs_reg invocation = get_nir_src(ntb, instr->src[1]);
5787 
5788       elk_fs_reg tmp = bld.vgrf(value.type);
5789 
5790       /* If the subgroup_size picked by NIR is larger than the dispatch size
5791        * picked by the backend (this can happen in RT and FS), bound the
5792        * invocation to the dispatch size.
5793        */
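      /* For example, with a SIMD16 dispatch and an API subgroup size of 32,
       * an invocation index of 20 is wrapped to 20 & 15 = 4.
       */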
5794       elk_fs_reg bound_invocation;
5795       if (s.api_subgroup_size == 0 ||
5796           bld.dispatch_width() < s.api_subgroup_size) {
5797          bound_invocation = bld.vgrf(ELK_REGISTER_TYPE_UD);
5798          bld.AND(bound_invocation, invocation, elk_imm_ud(s.dispatch_width - 1));
5799       } else {
5800          bound_invocation = invocation;
5801       }
5802       bld.exec_all().emit(ELK_SHADER_OPCODE_BROADCAST, tmp, value,
5803                           bld.emit_uniformize(bound_invocation));
5804 
5805       bld.MOV(retype(dest, value.type), elk_fs_reg(component(tmp, 0)));
5806       break;
5807    }
5808 
5809    case nir_intrinsic_read_first_invocation: {
5810       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5811       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
5812       break;
5813    }
5814 
5815    case nir_intrinsic_shuffle: {
5816       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5817       const elk_fs_reg index = get_nir_src(ntb, instr->src[1]);
5818 
5819       bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
5820       break;
5821    }
5822 
5823    case nir_intrinsic_first_invocation: {
5824       elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5825       bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
5826       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5827               elk_fs_reg(component(tmp, 0)));
5828       break;
5829    }
5830 
5831    case nir_intrinsic_last_invocation: {
5832       elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5833       bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
5834       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5835               elk_fs_reg(component(tmp, 0)));
5836       break;
5837    }
5838 
5839    case nir_intrinsic_quad_broadcast: {
5840       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5841       const unsigned index = nir_src_as_uint(instr->src[1]);
5842 
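      /* CLUSTER_BROADCAST with a cluster size of 4 copies channel 'index' of
       * each quad to all four channels of that quad.
       */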
5843       bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
5844                value, elk_imm_ud(index), elk_imm_ud(4));
5845       break;
5846    }
5847 
5848    case nir_intrinsic_quad_swap_horizontal: {
5849       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5850       const elk_fs_reg tmp = bld.vgrf(value.type);
5851       if (devinfo->ver <= 7) {
5852          /* The hardware doesn't seem to support these crazy regions with
5853           * compressed instructions on gfx7 and earlier so we fall back to
5854           * using quad swizzles.  Fortunately, we don't support 64-bit
5855           * anything in Vulkan on gfx7.
5856           */
5857          assert(nir_src_bit_size(instr->src[0]) == 32);
5858          const fs_builder ubld = bld.exec_all();
5859          ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5860                    elk_imm_ud(ELK_SWIZZLE4(1,0,3,2)));
5861          bld.MOV(retype(dest, value.type), tmp);
5862       } else {
5863          const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);
5864 
5865          const elk_fs_reg src_left = horiz_stride(value, 2);
5866          const elk_fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
5867          const elk_fs_reg tmp_left = horiz_stride(tmp, 2);
5868          const elk_fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
5869 
5870          ubld.MOV(tmp_left, src_right);
5871          ubld.MOV(tmp_right, src_left);
5872 
5873       }
5874       bld.MOV(retype(dest, value.type), tmp);
5875       break;
5876    }
5877 
5878    case nir_intrinsic_quad_swap_vertical: {
5879       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5880       if (nir_src_bit_size(instr->src[0]) == 32) {
5881          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5882          const elk_fs_reg tmp = bld.vgrf(value.type);
5883          const fs_builder ubld = bld.exec_all();
5884          ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5885                    elk_imm_ud(ELK_SWIZZLE4(2,3,0,1)));
5886          bld.MOV(retype(dest, value.type), tmp);
5887       } else {
5888          /* For larger data types, we have to either emit dispatch_width many
5889           * MOVs or else fall back to doing indirects.
5890           */
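         /* XORing the subgroup invocation with 2 maps lanes (0,1,2,3) of each
          * quad to (2,3,0,1), i.e. it swaps the top and bottom rows, and the
          * SHUFFLE pulls each channel's value from that swapped lane.
          */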
5891          elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5892          bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5893                       elk_imm_w(0x2));
5894          bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5895       }
5896       break;
5897    }
5898 
5899    case nir_intrinsic_quad_swap_diagonal: {
5900       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5901       if (nir_src_bit_size(instr->src[0]) == 32) {
5902          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5903          const elk_fs_reg tmp = bld.vgrf(value.type);
5904          const fs_builder ubld = bld.exec_all();
5905          ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5906                    elk_imm_ud(ELK_SWIZZLE4(3,2,1,0)));
5907          bld.MOV(retype(dest, value.type), tmp);
5908       } else {
5909          /* For larger data types, we have to either emit dispatch_width many
5910           * MOVs or else fall back to doing indirects.
5911           */
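         /* XOR with 3 maps lanes (0,1,2,3) to (3,2,1,0), pairing each channel
          * with its diagonal neighbour in the quad.
          */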
5912          elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5913          bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5914                       elk_imm_w(0x3));
5915          bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5916       }
5917       break;
5918    }
5919 
5920    case nir_intrinsic_ddx_fine:
5921       bld.emit(ELK_FS_OPCODE_DDX_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5922                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5923       break;
5924    case nir_intrinsic_ddx:
5925    case nir_intrinsic_ddx_coarse:
5926       bld.emit(ELK_FS_OPCODE_DDX_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5927                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5928       break;
5929    case nir_intrinsic_ddy_fine:
5930       bld.emit(ELK_FS_OPCODE_DDY_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5931                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5932       break;
5933    case nir_intrinsic_ddy:
5934    case nir_intrinsic_ddy_coarse:
5935       bld.emit(ELK_FS_OPCODE_DDY_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5936                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5937       break;
5938 
5939    case nir_intrinsic_reduce: {
5940       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5941       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5942       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
5943       if (cluster_size == 0 || cluster_size > s.dispatch_width)
5944          cluster_size = s.dispatch_width;
5945 
5946       /* Figure out the source type */
5947       src.type = elk_type_for_nir_type(devinfo,
5948          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5949                         nir_src_bit_size(instr->src[0])));
5950 
5951       elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
5952       elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
5953       elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
5954 
5955       /* Set up a register for all of our scratching around and initialize it
5956        * to the reduction operation's identity value.
5957        */
5958       elk_fs_reg scan = bld.vgrf(src.type);
5959       bld.exec_all().emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5960 
5961       bld.emit_scan(elk_op, scan, cluster_size, cond_mod);
5962 
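      /* After the scan, the last channel of each cluster holds that cluster's
       * reduction; the code below copies it back to every channel of the
       * cluster, either with plain MOVs or a CLUSTER_BROADCAST.
       */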
5963       dest.type = src.type;
5964       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
5965          /* In this case, CLUSTER_BROADCAST instruction isn't needed because
5966           * the distance between clusters is at least 2 GRFs.  In this case,
5967           * we don't need the weird striding of the CLUSTER_BROADCAST
5968           * instruction and can just do regular MOVs.
5969           */
5970          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
5971          const unsigned groups =
5972             (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
5973          const unsigned group_size = s.dispatch_width / groups;
5974          for (unsigned i = 0; i < groups; i++) {
5975             const unsigned cluster = (i * group_size) / cluster_size;
5976             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
5977             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
5978                                          component(scan, comp));
5979          }
5980       } else {
5981          bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
5982                   elk_imm_ud(cluster_size - 1), elk_imm_ud(cluster_size));
5983       }
5984       break;
5985    }
5986 
5987    case nir_intrinsic_inclusive_scan:
5988    case nir_intrinsic_exclusive_scan: {
5989       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5990       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5991 
5992       /* Figure out the source type */
5993       src.type = elk_type_for_nir_type(devinfo,
5994          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5995                         nir_src_bit_size(instr->src[0])));
5996 
5997       elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
5998       elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
5999       elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
6000 
6001       /* Set up a register for all of our scratching around and initialize it
6002        * to the reduction operation's identity value.
6003        */
6004       elk_fs_reg scan = bld.vgrf(src.type);
6005       const fs_builder allbld = bld.exec_all();
6006       allbld.emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
6007 
6008       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
6009          /* Exclusive scan is a bit harder because we have to do an annoying
6010           * shift of the contents before we can begin.  To make things worse,
6011           * we can't do this with a normal stride; we have to use indirects.
6012           */
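         /* Shuffling with idx = invocation - 1 shifts every channel's value up
          * by one lane, and channel 0 is then seeded with the identity so each
          * channel's scan excludes its own value.
          */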
6013          elk_fs_reg shifted = bld.vgrf(src.type);
6014          elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
6015          allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
6016                          elk_imm_w(-1));
6017          allbld.emit(ELK_SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
6018          allbld.group(1, 0).MOV(component(shifted, 0), identity);
6019          scan = shifted;
6020       }
6021 
6022       bld.emit_scan(elk_op, scan, s.dispatch_width, cond_mod);
6023 
6024       bld.MOV(retype(dest, src.type), scan);
6025       break;
6026    }
6027 
6028    case nir_intrinsic_load_global_block_intel: {
6029       assert(instr->def.bit_size == 32);
6030 
6031       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
6032 
6033       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6034       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6035       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6036 
6037       const unsigned total = instr->num_components * s.dispatch_width;
6038       unsigned loaded = 0;
6039 
6040       while (loaded < total) {
6041          const unsigned block =
6042             choose_oword_block_size_dwords(devinfo, total - loaded);
6043          const unsigned block_bytes = block * 4;
6044 
6045          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6046 
6047          elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6048          srcs[A64_LOGICAL_ADDRESS] = address;
6049          srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
6050          srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6051          srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(1);
6052          ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6053                    retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6054                    srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6055 
6056          increment_a64_address(ubld1, address, block_bytes);
6057          loaded += block;
6058       }
6059 
6060       assert(loaded == total);
6061       break;
6062    }
6063 
6064    case nir_intrinsic_store_global_block_intel: {
6065       assert(nir_src_bit_size(instr->src[0]) == 32);
6066 
6067       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
6068       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6069 
6070       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6071       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6072       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6073 
6074       const unsigned total = instr->num_components * s.dispatch_width;
6075       unsigned written = 0;
6076 
6077       while (written < total) {
6078          const unsigned block =
6079             choose_oword_block_size_dwords(devinfo, total - written);
6080 
6081          elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6082          srcs[A64_LOGICAL_ADDRESS] = address;
6083          srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
6084                                         ELK_REGISTER_TYPE_UD);
6085          srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6086          srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6087 
6088          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6089          ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, elk_fs_reg(),
6090                    srcs, A64_LOGICAL_NUM_SRCS);
6091 
6092          const unsigned block_bytes = block * 4;
6093          increment_a64_address(ubld1, address, block_bytes);
6094          written += block;
6095       }
6096 
6097       assert(written == total);
6098       break;
6099    }
6100 
6101    case nir_intrinsic_load_shared_block_intel:
6102    case nir_intrinsic_load_ssbo_block_intel: {
6103       assert(instr->def.bit_size == 32);
6104 
6105       const bool is_ssbo =
6106          instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
6107       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0]));
6108 
6109       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6110       srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6111          get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6112          elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6113       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6114 
6115       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6116       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6117       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6118 
6119       const unsigned total = instr->num_components * s.dispatch_width;
6120       unsigned loaded = 0;
6121 
6122       while (loaded < total) {
6123          const unsigned block =
6124             choose_oword_block_size_dwords(devinfo, total - loaded);
6125          const unsigned block_bytes = block * 4;
6126 
6127          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6128 
6129          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6130          ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6131                    retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6132                    srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6133 
6134          ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6135          loaded += block;
6136       }
6137 
6138       assert(loaded == total);
6139       break;
6140    }
6141 
6142    case nir_intrinsic_store_shared_block_intel:
6143    case nir_intrinsic_store_ssbo_block_intel: {
6144       assert(nir_src_bit_size(instr->src[0]) == 32);
6145 
6146       const bool is_ssbo =
6147          instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
6148 
6149       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 2 : 1]));
6150       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6151 
6152       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6153       srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6154          get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6155          elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6156       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6157 
6158       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6159       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6160       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6161 
6162       const unsigned total = instr->num_components * s.dispatch_width;
6163       unsigned written = 0;
6164 
6165       while (written < total) {
6166          const unsigned block =
6167             choose_oword_block_size_dwords(devinfo, total - written);
6168 
6169          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6170          srcs[SURFACE_LOGICAL_SRC_DATA] =
6171             retype(byte_offset(src, written * 4), ELK_REGISTER_TYPE_UD);
6172 
6173          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6174          ubld.emit(ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
6175                    elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6176 
6177          const unsigned block_bytes = block * 4;
6178          ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6179          written += block;
6180       }
6181 
6182       assert(written == total);
6183       break;
6184    }
6185 
6186    default:
6187 #ifndef NDEBUG
6188       assert(instr->intrinsic < nir_num_intrinsics);
6189       fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
6190 #endif
6191       unreachable("unknown intrinsic");
6192    }
6193 }
6194 
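/* Helper for the atomic emitters below: 16-bit operands are widened into the
 * low word of a fresh 32-bit register, since the data payloads are built out
 * of 32-bit values.
 */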
6195 static elk_fs_reg
6196 expand_to_32bit(const fs_builder &bld, const elk_fs_reg &src)
6197 {
6198    if (type_sz(src.type) == 2) {
6199       elk_fs_reg src32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6200       bld.MOV(src32, retype(src, ELK_REGISTER_TYPE_UW));
6201       return src32;
6202    } else {
6203       return src;
6204    }
6205 }
6206 
6207 static void
6208 fs_nir_emit_surface_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6209                            nir_intrinsic_instr *instr,
6210                            elk_fs_reg surface,
6211                            bool bindless)
6212 {
6213    const intel_device_info *devinfo = ntb.devinfo;
6214    elk_fs_visitor &s = ntb.s;
6215 
6216    enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6217    int num_data = lsc_op_num_data_values(op);
6218 
6219    bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM;
6220 
6221    /* The BTI untyped atomic messages only support 32-bit atomics.  If you
6222     * just look at the big table of messages in Vol 7 of the SKL PRM, the Qword
6223     * variants appear to exist.  However, if you look at Vol 2a, there are no
6224     * message descriptors provided for Qword atomic ops except for A64 messages.
6225     *
6226     * 16-bit float atomics are supported, however.
6227     */
6228    assert(instr->def.bit_size == 32 ||
6229           (instr->def.bit_size == 64 && devinfo->has_lsc) ||
6230           (instr->def.bit_size == 16 &&
6231            (devinfo->has_lsc || elk_lsc_opcode_is_atomic_float(op))));
6232 
6233    elk_fs_reg dest = get_nir_def(ntb, instr->def);
6234 
6235    elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6236    srcs[bindless ?
6237         SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6238         SURFACE_LOGICAL_SRC_SURFACE] = surface;
6239    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
6240    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
6241    srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
6242 
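   /* For SLM atomics the byte offset comes from src[0] (plus the intrinsic
    * base) and the data operands start at src[1]; for SSBO atomics src[0] is
    * the surface, src[1] the offset, and the data operands start at src[2].
    */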
6243    if (shared) {
6244       /* SLM - Get the offset */
6245       if (nir_src_is_const(instr->src[0])) {
6246          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6247             elk_imm_ud(nir_intrinsic_base(instr) +
6248                        nir_src_as_uint(instr->src[0]));
6249       } else {
6250          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type());
6251          bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6252                  retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD),
6253                  elk_imm_ud(nir_intrinsic_base(instr)));
6254       }
6255    } else {
6256       /* SSBOs */
6257       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6258    }
6259 
6260    elk_fs_reg data;
6261    if (num_data >= 1)
6262       data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2]));
6263 
6264    if (num_data >= 2) {
6265       elk_fs_reg tmp = bld.vgrf(data.type, 2);
6266       elk_fs_reg sources[2] = {
6267          data,
6268          expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 2 : 3]))
6269       };
6270       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6271       data = tmp;
6272    }
6273    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6274 
6275    /* Emit the actual atomic operation */
6276 
6277    switch (instr->def.bit_size) {
6278       case 16: {
6279          elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6280          bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6281                   retype(dest32, dest.type),
6282                   srcs, SURFACE_LOGICAL_NUM_SRCS);
6283          bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW),
6284                  retype(dest32, ELK_REGISTER_TYPE_UD));
6285          break;
6286       }
6287 
6288       case 32:
6289       case 64:
6290          bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6291                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6292          break;
6293       default:
6294          unreachable("Unsupported bit size");
6295    }
6296 }
6297 
6298 static void
6299 fs_nir_emit_global_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6300                           nir_intrinsic_instr *instr)
6301 {
6302    enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6303    int num_data = lsc_op_num_data_values(op);
6304 
6305    elk_fs_reg dest = get_nir_def(ntb, instr->def);
6306 
6307    elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
6308 
6309    elk_fs_reg data;
6310    if (num_data >= 1)
6311       data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1]));
6312 
6313    if (num_data >= 2) {
6314       elk_fs_reg tmp = bld.vgrf(data.type, 2);
6315       elk_fs_reg sources[2] = {
6316          data,
6317          expand_to_32bit(bld, get_nir_src(ntb, instr->src[2]))
6318       };
6319       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6320       data = tmp;
6321    }
6322 
6323    elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6324    srcs[A64_LOGICAL_ADDRESS] = addr;
6325    srcs[A64_LOGICAL_SRC] = data;
6326    srcs[A64_LOGICAL_ARG] = elk_imm_ud(op);
6327    srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6328 
6329    switch (instr->def.bit_size) {
6330    case 16: {
6331       elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6332       bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
6333                retype(dest32, dest.type),
6334                srcs, A64_LOGICAL_NUM_SRCS);
6335       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW), dest32);
6336       break;
6337    }
6338    case 32:
6339    case 64:
6340       bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
6341                srcs, A64_LOGICAL_NUM_SRCS);
6342       break;
6343    default:
6344       unreachable("Unsupported bit size");
6345    }
6346 }
6347 
6348 static void
6349 fs_nir_emit_texture(nir_to_elk_state &ntb,
6350                     nir_tex_instr *instr)
6351 {
6352    const intel_device_info *devinfo = ntb.devinfo;
6353    const fs_builder &bld = ntb.bld;
6354    elk_fs_visitor &s = ntb.s;
6355 
6356    elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
6357 
6358    /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
6359     *
6360     *    "The Pixel Null Mask field, when enabled via the Pixel Null Mask
6361     *     Enable will be incorrect for sample_c when applied to a surface with
6362     *     64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
6363     *     Enable may incorrectly report pixels as referencing a Null surface."
6364     *
6365     * We'll take care of this in NIR.
6366     */
6367    assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
6368 
6369    srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(instr->is_sparse);
6370 
6371    int lod_components = 0;
6372 
6373    /* The hardware requires a LOD for buffer textures */
6374    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6375       srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_d(0);
6376 
6377    ASSERTED bool got_lod = false;
6378    ASSERTED bool got_bias = false;
6379    uint32_t header_bits = 0;
6380    for (unsigned i = 0; i < instr->num_srcs; i++) {
6381       nir_src nir_src = instr->src[i].src;
6382       elk_fs_reg src = get_nir_src(ntb, nir_src);
6383       switch (instr->src[i].src_type) {
6384       case nir_tex_src_bias:
6385          assert(!got_lod);
6386          got_bias = true;
6387 
6388          srcs[TEX_LOGICAL_SRC_LOD] =
6389             retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6390          break;
6391       case nir_tex_src_comparator:
6392          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, ELK_REGISTER_TYPE_F);
6393          break;
6394       case nir_tex_src_coord:
6395          switch (instr->op) {
6396          case nir_texop_txf:
6397          case nir_texop_txf_ms:
6398          case nir_texop_txf_ms_mcs_intel:
6399          case nir_texop_samples_identical:
6400             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_D);
6401             break;
6402          default:
6403             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_F);
6404             break;
6405          }
6406          break;
6407       case nir_tex_src_ddx:
6408          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, ELK_REGISTER_TYPE_F);
6409          lod_components = nir_tex_instr_src_size(instr, i);
6410          break;
6411       case nir_tex_src_ddy:
6412          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, ELK_REGISTER_TYPE_F);
6413          break;
6414       case nir_tex_src_lod:
6415          assert(!got_bias);
6416          got_lod = true;
6417 
6418          switch (instr->op) {
6419          case nir_texop_txs:
6420             srcs[TEX_LOGICAL_SRC_LOD] =
6421                retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_UD);
6422             break;
6423          case nir_texop_txf:
6424             srcs[TEX_LOGICAL_SRC_LOD] =
6425                retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_D);
6426             break;
6427          default:
6428             srcs[TEX_LOGICAL_SRC_LOD] =
6429                retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6430             break;
6431          }
6432          break;
6433       case nir_tex_src_min_lod:
6434          srcs[TEX_LOGICAL_SRC_MIN_LOD] =
6435             retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6436          break;
6437       case nir_tex_src_ms_index:
6438          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, ELK_REGISTER_TYPE_UD);
6439          break;
6440 
6441       case nir_tex_src_offset: {
6442          uint32_t offset_bits = 0;
6443          if (elk_texture_offset(instr, i, &offset_bits)) {
6444             header_bits |= offset_bits;
6445          } else {
6446             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
6447                retype(src, ELK_REGISTER_TYPE_D);
6448          }
6449          break;
6450       }
6451 
6452       case nir_tex_src_projector:
6453          unreachable("should be lowered");
6454 
6455       case nir_tex_src_texture_offset: {
6456          assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
6457          /* Emit code to evaluate the actual indexing expression */
6458          if (instr->texture_index == 0 && is_resource_src(nir_src))
6459             srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src);
6460          if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) {
6461             elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6462             bld.ADD(tmp, src, elk_imm_ud(instr->texture_index));
6463             srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
6464          }
6465          assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
6466          break;
6467       }
6468 
6469       case nir_tex_src_sampler_offset: {
6470          /* Emit code to evaluate the actual indexing expression */
6471          if (instr->sampler_index == 0 && is_resource_src(nir_src))
6472             srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src);
6473          if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) {
6474             elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6475             bld.ADD(tmp, src, elk_imm_ud(instr->sampler_index));
6476             srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
6477          }
6478          break;
6479       }
6480 
6481       case nir_tex_src_texture_handle:
6482          assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
6483          srcs[TEX_LOGICAL_SRC_SURFACE] = elk_fs_reg();
6484          if (is_resource_src(nir_src))
6485             srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src);
6486          if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6487             srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
6488          break;
6489 
6490       case nir_tex_src_sampler_handle:
6491          assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
6492          srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_fs_reg();
6493          if (is_resource_src(nir_src))
6494             srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src);
6495          if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6496             srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
6497          break;
6498 
6499       case nir_tex_src_ms_mcs_intel:
6500          assert(instr->op == nir_texop_txf_ms);
6501          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, ELK_REGISTER_TYPE_D);
6502          break;
6503 
6504       /* If this parameter is present, we are packing either the explicit LOD
6505        * or LOD bias and the array index into a single (32-bit) value when
6506        * 32-bit texture coordinates are used.
6507        */
6508       case nir_tex_src_backend1:
6509          assert(!got_lod && !got_bias);
6510          got_lod = true;
6511 
6512          assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
6513          srcs[TEX_LOGICAL_SRC_LOD] =
6514             retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6515          break;
6516 
6517       default:
6518          unreachable("unknown texture source");
6519       }
6520    }
6521 
6522    /* If the surface or sampler were not specified through sources, use the
6523     * instruction index.
6524     */
6525    if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
6526        srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6527       srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(instr->texture_index);
6528    if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
6529        srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6530       srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(instr->sampler_index);
6531 
6532    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
6533        (instr->op == nir_texop_txf_ms ||
6534         instr->op == nir_texop_samples_identical)) {
6535       if (devinfo->ver >= 7) {
6536          srcs[TEX_LOGICAL_SRC_MCS] =
6537             emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
6538                            instr->coord_components,
6539                            srcs[TEX_LOGICAL_SRC_SURFACE],
6540                            srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
6541       } else {
6542          srcs[TEX_LOGICAL_SRC_MCS] = elk_imm_ud(0u);
6543       }
6544    }
6545 
6546    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(instr->coord_components);
6547    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(lod_components);
6548 
6549    enum elk_opcode opcode;
6550    switch (instr->op) {
6551    case nir_texop_tex:
6552       opcode = ELK_SHADER_OPCODE_TEX_LOGICAL;
6553       break;
6554    case nir_texop_txb:
6555       opcode = ELK_FS_OPCODE_TXB_LOGICAL;
6556       break;
6557    case nir_texop_txl:
6558       opcode = ELK_SHADER_OPCODE_TXL_LOGICAL;
6559       break;
6560    case nir_texop_txd:
6561       opcode = ELK_SHADER_OPCODE_TXD_LOGICAL;
6562       break;
6563    case nir_texop_txf:
6564       opcode = ELK_SHADER_OPCODE_TXF_LOGICAL;
6565       break;
6566    case nir_texop_txf_ms:
6567       opcode = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
6568       break;
6569    case nir_texop_txf_ms_mcs_intel:
6570       opcode = ELK_SHADER_OPCODE_TXF_MCS_LOGICAL;
6571       break;
6572    case nir_texop_query_levels:
6573    case nir_texop_txs:
6574       opcode = ELK_SHADER_OPCODE_TXS_LOGICAL;
6575       break;
6576    case nir_texop_lod:
6577       opcode = ELK_SHADER_OPCODE_LOD_LOGICAL;
6578       break;
6579    case nir_texop_tg4:
6580       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
6581          opcode = ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL;
6582       else
6583          opcode = ELK_SHADER_OPCODE_TG4_LOGICAL;
6584       break;
6585    case nir_texop_texture_samples:
6586       opcode = ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL;
6587       break;
6588    case nir_texop_samples_identical: {
6589       elk_fs_reg dst = retype(get_nir_def(ntb, instr->def), ELK_REGISTER_TYPE_D);
6590 
6591       /* If mcs is an immediate value, it means there is no MCS.  In that case
6592        * just return false.
6593        */
6594       if (srcs[TEX_LOGICAL_SRC_MCS].file == ELK_IMMEDIATE_VALUE) {
6595          bld.MOV(dst, elk_imm_ud(0u));
6596       } else {
6597          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], elk_imm_ud(0u),
6598                  ELK_CONDITIONAL_EQ);
6599       }
6600       return;
6601    }
6602    default:
6603       unreachable("unknown texture opcode");
6604    }
6605 
6606    if (instr->op == nir_texop_tg4) {
6607       if (instr->component == 1 &&
6608           s.key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) {
6609          /* gather4 sampler is broken for green channel on RG32F --
6610           * we must ask for blue instead.
6611           */
6612          header_bits |= 2 << 16;
6613       } else {
6614          header_bits |= instr->component << 16;
6615       }
6616    }
6617 
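   /* The sampler returns up to four components, plus one extra register of
    * residency information when sparse feedback is requested.
    */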
6618    elk_fs_reg dst = bld.vgrf(elk_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
6619    elk_fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
6620    inst->offset = header_bits;
6621 
6622    const unsigned dest_size = nir_tex_instr_dest_size(instr);
6623    inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
6624                         (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
6625 
6626    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
6627       inst->shadow_compare = true;
6628 
6629    /* Wa_14012688258:
6630     *
6631     * Don't trim zeros at the end of payload for sample operations
6632     * in cube and cube arrays.
6633     */
6634    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6635        intel_needs_workaround(devinfo, 14012688258)) {
6636 
6637       /* Compiler should send U,V,R parameters even if V,R are 0. */
6638       if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
6639          assert(instr->coord_components >= 3u);
6640 
6641       /* See opt_zero_samples(). */
6642       inst->keep_payload_trailing_zeros = true;
6643    }
6644 
6645    elk_fs_reg nir_dest[5];
6646    for (unsigned i = 0; i < dest_size; i++)
6647       nir_dest[i] = offset(dst, bld, i);
6648 
6649    if (instr->op == nir_texop_query_levels) {
6650       /* # levels is in .w */
6651       /**
6652        * Wa_1940217:
6653        *
6654        * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
6655        * MIPCount returned is undefined instead of 0.
6656        */
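      /* Descriptive note: the MOV with a NZ conditional mod tests the resinfo
       * result, and the predicated SEL below then picks either the MIPCount
       * in .w or 0, so a NULL surface reports 0 levels.
       */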
6657       elk_fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
6658       mov->conditional_mod = ELK_CONDITIONAL_NZ;
6659       nir_dest[0] = bld.vgrf(ELK_REGISTER_TYPE_D);
6660       elk_fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), elk_imm_d(0));
6661       sel->predicate = ELK_PREDICATE_NORMAL;
6662    } else if (instr->op == nir_texop_txs &&
6663               dest_size >= 3 && devinfo->ver < 7) {
6664       /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
6665       elk_fs_reg depth = offset(dst, bld, 2);
6666       nir_dest[2] = s.vgrf(glsl_int_type());
6667       bld.emit_minmax(nir_dest[2], depth, elk_imm_d(1), ELK_CONDITIONAL_GE);
6668    }
6669 
6670    /* The residency bits are only in the first component. */
6671    if (instr->is_sparse)
6672       nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
6673 
6674    bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0);
6675 }
6676 
6677 static void
6678 fs_nir_emit_jump(nir_to_elk_state &ntb, nir_jump_instr *instr)
6679 {
6680    switch (instr->type) {
6681    case nir_jump_break:
6682       ntb.bld.emit(ELK_OPCODE_BREAK);
6683       break;
6684    case nir_jump_continue:
6685       ntb.bld.emit(ELK_OPCODE_CONTINUE);
6686       break;
6687    case nir_jump_halt:
6688       ntb.bld.emit(ELK_OPCODE_HALT);
6689       break;
6690    case nir_jump_return:
6691    default:
6692       unreachable("unknown jump");
6693    }
6694 }
6695 
6696 /*
6697  * This helper takes a source register and un/shuffles it into the destination
6698  * register.
6699  *
6700  * If the source type size is smaller than the destination type size, the
6701  * operation needed is a component shuffle; the opposite case is an unshuffle.
6702  * If the source and destination type sizes are equal, the shuffle is
6703  * equivalent to a simple MOV.
6704  *
6705  * For example, if the source is a 16-bit type and the destination is
6706  * 32-bit, a 3-component .xyz 16-bit vector on SIMD8 would be:
6707  *
6708  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
6709  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
6710  *
6711  * This helper will return the following 2 32-bit components with the 16-bit
6712  * values shuffled:
6713  *
6714  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
6715  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
6716  *
6717  * For an unshuffle, the example would be the opposite: a 64-bit source type
6718  * and a 32-bit destination. A 2-component .xy 64-bit vector on SIMD8
6719  * would be:
6720  *
6721  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
6722  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
6723  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
6724  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
6725  *
6726  * The returned result would be the following 4 32-bit components unshuffled:
6727  *
6728  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
6729  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
6730  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
6731  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
6732  *
6733  * - Source and destination registers must not overlap.
6734  * - Component units are measured in terms of the smaller type between the
6735  *   source and destination because we are un/shuffling the smaller
6736  *   components from/into the bigger ones.
6737  * - The first_component parameter allows skipping source components.
6738  */
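/* Illustrative usage sketch (hypothetical registers, not taken from this
 * file), assuming a SIMD8 builder `bld`, a 16-bit-typed `src16` and a
 * 32-bit-typed `dst32`:
 *
 *    // size_ratio == 2, so src16.x lands in the low word and src16.y in the
 *    // high word of dst32 component 0:
 *    elk_shuffle_src_to_dst(bld, dst32, src16, 0, 2);
 */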
6739 void
6740 elk_shuffle_src_to_dst(const fs_builder &bld,
6741                    const elk_fs_reg &dst,
6742                    const elk_fs_reg &src,
6743                    uint32_t first_component,
6744                    uint32_t components)
6745 {
6746    if (type_sz(src.type) == type_sz(dst.type)) {
6747       assert(!regions_overlap(dst,
6748          type_sz(dst.type) * bld.dispatch_width() * components,
6749          offset(src, bld, first_component),
6750          type_sz(src.type) * bld.dispatch_width() * components));
6751       for (unsigned i = 0; i < components; i++) {
6752          bld.MOV(retype(offset(dst, bld, i), src.type),
6753                  offset(src, bld, i + first_component));
6754       }
6755    } else if (type_sz(src.type) < type_sz(dst.type)) {
6756       /* Source is shuffled into destination */
6757       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
6758       assert(!regions_overlap(dst,
6759          type_sz(dst.type) * bld.dispatch_width() *
6760          DIV_ROUND_UP(components, size_ratio),
6761          offset(src, bld, first_component),
6762          type_sz(src.type) * bld.dispatch_width() * components));
6763 
6764       elk_reg_type shuffle_type =
6765          elk_reg_type_from_bit_size(8 * type_sz(src.type),
6766                                     ELK_REGISTER_TYPE_D);
6767       for (unsigned i = 0; i < components; i++) {
6768          elk_fs_reg shuffle_component_i =
6769             subscript(offset(dst, bld, i / size_ratio),
6770                       shuffle_type, i % size_ratio);
6771          bld.MOV(shuffle_component_i,
6772                  retype(offset(src, bld, i + first_component), shuffle_type));
6773       }
6774    } else {
6775       /* Source is unshuffled into destination */
6776       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
6777       assert(!regions_overlap(dst,
6778          type_sz(dst.type) * bld.dispatch_width() * components,
6779          offset(src, bld, first_component / size_ratio),
6780          type_sz(src.type) * bld.dispatch_width() *
6781          DIV_ROUND_UP(components + (first_component % size_ratio),
6782                       size_ratio)));
6783 
6784       elk_reg_type shuffle_type =
6785          elk_reg_type_from_bit_size(8 * type_sz(dst.type),
6786                                     ELK_REGISTER_TYPE_D);
6787       for (unsigned i = 0; i < components; i++) {
6788          elk_fs_reg shuffle_component_i =
6789             subscript(offset(src, bld, (first_component + i) / size_ratio),
6790                       shuffle_type, (first_component + i) % size_ratio);
6791          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
6792                  shuffle_component_i);
6793       }
6794    }
6795 }
6796 
6797 void
6798 elk_shuffle_from_32bit_read(const fs_builder &bld,
6799                         const elk_fs_reg &dst,
6800                         const elk_fs_reg &src,
6801                         uint32_t first_component,
6802                         uint32_t components)
6803 {
6804    assert(type_sz(src.type) == 4);
6805 
6806    /* This function takes components in units of the destination type, while
6807     * elk_shuffle_src_to_dst takes components in units of the smaller type.
6808     */
6809    if (type_sz(dst.type) > 4) {
6810       assert(type_sz(dst.type) == 8);
6811       first_component *= 2;
6812       components *= 2;
6813    }
6814 
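   /* Illustrative example: reading 2 components into a 64-bit dst starting at
    * component 1 becomes elk_shuffle_src_to_dst(bld, dst, src, 2, 4), i.e.
    * four 32-bit components starting at dword 2.
    */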
6815    elk_shuffle_src_to_dst(bld, dst, src, first_component, components);
6816 }
6817 
6818 elk_fs_reg
6819 elk_setup_imm_df(const fs_builder &bld, double v)
6820 {
6821    const struct intel_device_info *devinfo = bld.shader->devinfo;
6822    assert(devinfo->ver >= 7);
6823 
6824    if (devinfo->ver >= 8)
6825       return elk_imm_df(v);
6826 
6827    /* gfx7.5 does not support DF immediates directly, but the DIM
6828     * instruction allows setting a 64-bit immediate value.
6829     */
6830    if (devinfo->platform == INTEL_PLATFORM_HSW) {
6831       const fs_builder ubld = bld.exec_all().group(1, 0);
6832       elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_DF, 1);
6833       ubld.DIM(dst, elk_imm_df(v));
6834       return component(dst, 0);
6835    }
6836 
6837    /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
6838     * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
6839     * the high 32 bits to suboffset 4, then applying a stride of 0.
6840     *
6841     * Alternatively, we could produce a normal VGRF (without stride 0) by
6842     * writing to all the channels in the VGRF; however, that would hit the
6843     * gfx7 bug where writes that span more than one register have to be
6844     * split into instructions with a width of 4 (otherwise the write to the
6845     * second register written runs into an execmask hardware bug), which
6846     * isn't very nice.
6847     */
6848    union {
6849       double d;
6850       struct {
6851          uint32_t i1;
6852          uint32_t i2;
6853       };
6854    } di;
6855 
6856    di.d = v;
6857 
6858    const fs_builder ubld = bld.exec_all().group(1, 0);
6859    const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
6860    ubld.MOV(tmp, elk_imm_ud(di.i1));
6861    ubld.MOV(horiz_offset(tmp, 1), elk_imm_ud(di.i2));
6862 
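   /* Reinterpreting the two dwords as DF and taking component 0 yields a
    * stride-0 (uniform) 64-bit source holding the requested constant.
    */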
6863    return component(retype(tmp, ELK_REGISTER_TYPE_DF), 0);
6864 }
6865 
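/* Descriptive note: the EU has no byte-sized immediate encoding, so the
 * helpers below materialize 8-bit constants by MOV'ing a word immediate into
 * a B/UB-typed VGRF.
 */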
6866 elk_fs_reg
6867 elk_setup_imm_b(const fs_builder &bld, int8_t v)
6868 {
6869    const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_B);
6870    bld.MOV(tmp, elk_imm_w(v));
6871    return tmp;
6872 }
6873 
6874 elk_fs_reg
6875 elk_setup_imm_ub(const fs_builder &bld, uint8_t v)
6876 {
6877    const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UB);
6878    bld.MOV(tmp, elk_imm_uw(v));
6879    return tmp;
6880 }
6881 
6882 static void
6883 fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr)
6884 {
6885    ntb.bld = ntb.bld.annotate(NULL, instr);
6886 
6887    switch (instr->type) {
6888    case nir_instr_type_alu:
6889       fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
6890       break;
6891 
6892    case nir_instr_type_deref:
6893       unreachable("All derefs should've been lowered");
6894       break;
6895 
6896    case nir_instr_type_intrinsic:
6897       switch (ntb.s.stage) {
6898       case MESA_SHADER_VERTEX:
6899          fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6900          break;
6901       case MESA_SHADER_TESS_CTRL:
6902          fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6903          break;
6904       case MESA_SHADER_TESS_EVAL:
6905          fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6906          break;
6907       case MESA_SHADER_GEOMETRY:
6908          fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6909          break;
6910       case MESA_SHADER_FRAGMENT:
6911          fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6912          break;
6913       case MESA_SHADER_COMPUTE:
6914          fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6915          break;
6916       default:
6917          unreachable("unsupported shader stage");
6918       }
6919       break;
6920 
6921    case nir_instr_type_tex:
6922       fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
6923       break;
6924 
6925    case nir_instr_type_load_const:
6926       fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
6927       break;
6928 
6929    case nir_instr_type_undef:
6930       /* We create a new VGRF for undefs on every use (by handling
6931        * them in get_nir_src()), rather than for each definition.
6932        * This helps register coalescing eliminate MOVs from undef.
6933        */
6934       break;
6935 
6936    case nir_instr_type_jump:
6937       fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
6938       break;
6939 
6940    default:
6941       unreachable("unknown instruction type");
6942    }
6943 }
6944 
6945 static unsigned
6946 elk_rnd_mode_from_nir(unsigned mode, unsigned *mask)
6947 {
6948    unsigned elk_mode = 0;
6949    *mask = 0;
6950 
6951    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
6952         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
6953         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
6954        mode) {
6955       elk_mode |= ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT;
6956       *mask |= ELK_CR0_RND_MODE_MASK;
6957    }
6958    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
6959         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
6960         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
6961        mode) {
6962       elk_mode |= ELK_RND_MODE_RTNE << ELK_CR0_RND_MODE_SHIFT;
6963       *mask |= ELK_CR0_RND_MODE_MASK;
6964    }
6965    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
6966       elk_mode |= ELK_CR0_FP16_DENORM_PRESERVE;
6967       *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6968    }
6969    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
6970       elk_mode |= ELK_CR0_FP32_DENORM_PRESERVE;
6971       *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
6972    }
6973    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
6974       elk_mode |= ELK_CR0_FP64_DENORM_PRESERVE;
6975       *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
6976    }
6977    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
6978       *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6979    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
6980       *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
6981    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
6982       *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
6983    if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
6984       *mask |= ELK_CR0_FP_MODE_MASK;
6985 
6986    if (*mask != 0)
6987       assert((*mask & elk_mode) == elk_mode);
6988 
6989    return elk_mode;
6990 }
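
/* Worked example (descriptive only): for
 * mode == FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32, elk_rnd_mode_from_nir()
 * returns ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT and sets
 * *mask = ELK_CR0_RND_MODE_MASK; emit_shader_float_controls_execution_mode()
 * below passes that (mode, mask) pair as immediates to
 * ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE.
 */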
6991 
6992 static void
6993 emit_shader_float_controls_execution_mode(nir_to_elk_state &ntb)
6994 {
6995    const fs_builder &bld = ntb.bld;
6996    elk_fs_visitor &s = ntb.s;
6997 
6998    unsigned execution_mode = s.nir->info.float_controls_execution_mode;
6999    if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7000       return;
7001 
7002    fs_builder ubld = bld.exec_all().group(1, 0);
7003    fs_builder abld = ubld.annotate("shader floats control execution mode");
7004    unsigned mask, mode = elk_rnd_mode_from_nir(execution_mode, &mask);
7005 
7006    if (mask == 0)
7007       return;
7008 
7009    abld.emit(ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
7010              elk_imm_d(mode), elk_imm_d(mask));
7011 }
7012 
7013 void
7014 nir_to_elk(elk_fs_visitor *s)
7015 {
7016    nir_to_elk_state ntb = {
7017       .s       = *s,
7018       .nir     = s->nir,
7019       .devinfo = s->devinfo,
7020       .mem_ctx = ralloc_context(NULL),
7021       .bld     = fs_builder(s).at_end(),
7022    };
7023 
7024    emit_shader_float_controls_execution_mode(ntb);
7025 
7026    /* emit the arrays used for inputs and outputs - load/store intrinsics will
7027     * be converted to reads/writes of these arrays
7028     */
7029    fs_nir_setup_outputs(ntb);
7030    fs_nir_setup_uniforms(ntb.s);
7031    fs_nir_emit_system_values(ntb);
7032    ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
7033 
7034    fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
7035 
7036    ntb.bld.emit(ELK_SHADER_OPCODE_HALT_TARGET);
7037 
7038    ralloc_free(ntb.mem_ctx);
7039 }
7040 
7041