1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_nir.h"
25 #include "elk_nir_private.h"
26 #include "elk_vec4.h"
27 #include "elk_vec4_builder.h"
28 #include "elk_vec4_surface_builder.h"
29 #include "elk_eu.h"
30 #include "nir.h"
31 #include "nir_intrinsics.h"
32 #include "nir_intrinsics_indices.h"
33 
34 using namespace elk;
35 using namespace elk::surface_access;
36 
37 namespace elk {
38 
39 void
40 vec4_visitor::emit_nir_code()
41 {
42    /* Globally set the rounding mode based on the float controls.  gen7 doesn't
43     * support 16-bit floats, and gen8 switches to a scalar VS.  So we don't need
44     * the per-instruction mode switching that the scalar FS backend does.
45     */
46    emit_shader_float_controls_execution_mode();
47    if (nir->num_uniforms > 0)
48       nir_setup_uniforms();
49 
50    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
51 }
52 
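/* nir->num_uniforms is counted in bytes; the vec4 backend tracks uniforms
 * in 16-byte (vec4) slots, so convert the count here.
 */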
53 void
54 vec4_visitor::nir_setup_uniforms()
55 {
56    uniforms = nir->num_uniforms / 16;
57 }
58 
59 void
60 vec4_visitor::nir_emit_impl(nir_function_impl *impl)
61 {
62    nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
63 
64    nir_emit_cf_list(&impl->body);
65 }
66 
67 void
68 vec4_visitor::nir_emit_cf_list(exec_list *list)
69 {
70    exec_list_validate(list);
71    foreach_list_typed(nir_cf_node, node, node, list) {
72       switch (node->type) {
73       case nir_cf_node_if:
74          nir_emit_if(nir_cf_node_as_if(node));
75          break;
76 
77       case nir_cf_node_loop:
78          nir_emit_loop(nir_cf_node_as_loop(node));
79          break;
80 
81       case nir_cf_node_block:
82          nir_emit_block(nir_cf_node_as_block(node));
83          break;
84 
85       default:
86          unreachable("Invalid CFG node block");
87       }
88    }
89 }
90 
91 void
92 vec4_visitor::nir_emit_if(nir_if *if_stmt)
93 {
94    /* First, put the condition in f0 */
95    src_reg condition = get_nir_src(if_stmt->condition, ELK_REGISTER_TYPE_D, 1);
96    vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
97    inst->conditional_mod = ELK_CONDITIONAL_NZ;
98 
99    /* We can just predicate based on the X channel, as the condition only
100     * goes on its own line */
101    emit(IF(ELK_PREDICATE_ALIGN16_REPLICATE_X));
102 
103    nir_emit_cf_list(&if_stmt->then_list);
104 
105    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
106       emit(ELK_OPCODE_ELSE);
107       nir_emit_cf_list(&if_stmt->else_list);
108    }
109 
110    emit(ELK_OPCODE_ENDIF);
111 }
112 
113 void
114 vec4_visitor::nir_emit_loop(nir_loop *loop)
115 {
116    assert(!nir_loop_has_continue_construct(loop));
117    emit(ELK_OPCODE_DO);
118 
119    nir_emit_cf_list(&loop->body);
120 
121    emit(ELK_OPCODE_WHILE);
122 }
123 
124 void
125 vec4_visitor::nir_emit_block(nir_block *block)
126 {
127    nir_foreach_instr(instr, block) {
128       nir_emit_instr(instr);
129    }
130 }
131 
132 void
133 vec4_visitor::nir_emit_instr(nir_instr *instr)
134 {
135    base_ir = instr;
136 
137    switch (instr->type) {
138    case nir_instr_type_load_const:
139       nir_emit_load_const(nir_instr_as_load_const(instr));
140       break;
141 
142    case nir_instr_type_intrinsic:
143       nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
144       break;
145 
146    case nir_instr_type_alu:
147       nir_emit_alu(nir_instr_as_alu(instr));
148       break;
149 
150    case nir_instr_type_jump:
151       nir_emit_jump(nir_instr_as_jump(instr));
152       break;
153 
154    case nir_instr_type_tex:
155       nir_emit_texture(nir_instr_as_tex(instr));
156       break;
157 
158    case nir_instr_type_undef:
159       nir_emit_undef(nir_instr_as_undef(instr));
160       break;
161 
162    default:
163       unreachable("VS instruction not yet implemented by NIR->vec4");
164    }
165 }
166 
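/* Resolve a NIR register (the handle produced by decl_reg) to a vec4
 * dst_reg: apply the constant base offset, attach a reladdr source for
 * indirect access, and retype 64-bit registers to DF.
 */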
167 static dst_reg
168 dst_reg_for_nir_reg(vec4_visitor *v, nir_def *handle,
169                     unsigned base_offset, nir_src *indirect)
170 {
171    nir_intrinsic_instr *decl = nir_reg_get_decl(handle);
172    dst_reg reg = v->nir_ssa_values[handle->index];
173    if (nir_intrinsic_bit_size(decl) == 64)
174       reg.type = ELK_REGISTER_TYPE_DF;
175 
176    reg = offset(reg, 8, base_offset);
177    if (indirect) {
178       reg.reladdr =
179          new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
180                                                 ELK_REGISTER_TYPE_D,
181                                                 1));
182    }
183    return reg;
184 }
185 
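/* Get the destination register for a NIR def.  Defs that are written to a
 * NIR register are resolved through their store_reg intrinsic (honoring its
 * write mask and optional indirect); otherwise a fresh VGRF is allocated
 * and remembered in nir_ssa_values.
 */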
186 dst_reg
187 vec4_visitor::get_nir_def(const nir_def &def)
188 {
189    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
190    if (!store_reg) {
191       dst_reg dst =
192          dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(def.bit_size, 32)));
193       if (def.bit_size == 64)
194          dst.type = ELK_REGISTER_TYPE_DF;
195       nir_ssa_values[def.index] = dst;
196       return dst;
197    } else {
198       nir_src *indirect =
199          (store_reg->intrinsic == nir_intrinsic_store_reg_indirect) ?
200          &store_reg->src[2] : NULL;
201 
202       dst_reg dst = dst_reg_for_nir_reg(this, store_reg->src[1].ssa,
203                                         nir_intrinsic_base(store_reg),
204                                         indirect);
205       dst.writemask = nir_intrinsic_write_mask(store_reg);
206       return dst;
207    }
208 }
209 
210 dst_reg
211 vec4_visitor::get_nir_def(const nir_def &def, enum elk_reg_type type)
212 {
213    return retype(get_nir_def(def), type);
214 }
215 
216 dst_reg
217 vec4_visitor::get_nir_def(const nir_def &def, nir_alu_type type)
218 {
219    return get_nir_def(def, elk_type_for_nir_type(devinfo, type));
220 }
221 
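/* Get a source register for a NIR source, retyped and swizzled to cover
 * num_components.  Sources that come from a NIR register are resolved
 * through the matching load_reg intrinsic.
 */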
222 src_reg
223 vec4_visitor::get_nir_src(const nir_src &src, enum elk_reg_type type,
224                           unsigned num_components)
225 {
226    nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
227 
228    dst_reg reg;
229    if (load_reg) {
230       nir_src *indirect =
231          (load_reg->intrinsic == nir_intrinsic_load_reg_indirect) ?
232          &load_reg->src[1] : NULL;
233 
234       reg = dst_reg_for_nir_reg(this, load_reg->src[0].ssa,
235                                       nir_intrinsic_base(load_reg),
236                                       indirect);
237    } else {
238       reg = nir_ssa_values[src.ssa->index];
239    }
240 
241    reg = retype(reg, type);
242 
243    src_reg reg_as_src = src_reg(reg);
244    reg_as_src.swizzle = elk_swizzle_for_size(num_components);
245    return reg_as_src;
246 }
247 
248 src_reg
249 vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
250                           unsigned num_components)
251 {
252    return get_nir_src(src, elk_type_for_nir_type(devinfo, type),
253                       num_components);
254 }
255 
256 src_reg
257 vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
258 {
259    /* if type is not specified, default to signed int */
260    return get_nir_src(src, nir_type_int32, num_components);
261 }
262 
263 src_reg
264 vec4_visitor::get_nir_src_imm(const nir_src &src)
265 {
266    assert(nir_src_num_components(src) == 1);
267    assert(nir_src_bit_size(src) == 32);
268    return nir_src_is_const(src) ? src_reg(elk_imm_d(nir_src_as_int(src))) :
269                                   get_nir_src(src, 1);
270 }
271 
272 src_reg
273 vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
274 {
275    nir_src *offset_src = nir_get_io_offset_src(instr);
276 
277    if (nir_src_is_const(*offset_src)) {
278       /* The only constant offset we should find is 0.  elk_nir.c's
279        * add_const_offset_to_base() will fold other constant offsets
280        * into the base index.
281        */
282       assert(nir_src_as_uint(*offset_src) == 0);
283       return src_reg();
284    }
285 
286    return get_nir_src(*offset_src, ELK_REGISTER_TYPE_UD, 1);
287 }
288 
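/* Materialize a double-precision immediate in a VGRF and return it with an
 * XXXX swizzle, since Align16 vec4 code on gfx7 cannot encode DF immediates
 * in ordinary instructions.
 */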
289 static src_reg
290 elk_setup_imm_df(const vec4_builder &bld, double v)
291 {
292    const intel_device_info *devinfo = bld.shader->devinfo;
293    assert(devinfo->ver == 7);
294 
295    /* gfx7.5 does not support DF immediates directly, but the DIM
296     * instruction allows setting a 64-bit immediate value.
297     */
298    if (devinfo->verx10 == 75) {
299       const vec4_builder ubld = bld.exec_all();
300       const dst_reg dst = bld.vgrf(ELK_REGISTER_TYPE_DF);
301       ubld.DIM(dst, elk_imm_df(v));
302       return swizzle(src_reg(dst), ELK_SWIZZLE_XXXX);
303    }
304 
305    /* gfx7 does not support DF immediates */
306    union {
307       double d;
308       struct {
309          uint32_t i1;
310          uint32_t i2;
311       };
312    } di;
313 
314    di.d = v;
315 
316    /* Write the low 32 bits of the constant to the X:UD channel and the
317     * high 32 bits to the Y:UD channel to build the constant in a VGRF.
318     * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
319     * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle
320     * XXXX so any access to the VGRF only reads the constant data in these
321     * channels.
322     */
323    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD, 2);
324    for (unsigned n = 0; n < 2; n++) {
325       const vec4_builder ubld = bld.exec_all().group(4, n);
326       ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), elk_imm_ud(di.i1));
327       ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), elk_imm_ud(di.i2));
328    }
329 
330    return swizzle(src_reg(retype(tmp, ELK_REGISTER_TYPE_DF)), ELK_SWIZZLE_XXXX);
331 }
332 
333 void
334 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
335 {
336    dst_reg reg;
337 
338    if (instr->def.bit_size == 64) {
339       reg = dst_reg(VGRF, alloc.allocate(2));
340       reg.type = ELK_REGISTER_TYPE_DF;
341    } else {
342       reg = dst_reg(VGRF, alloc.allocate(1));
343       reg.type = ELK_REGISTER_TYPE_D;
344    }
345 
346    const vec4_builder ibld = vec4_builder(this).at_end();
347    unsigned remaining = elk_writemask_for_size(instr->def.num_components);
348 
349    /* @FIXME: consider emitting vector operations to save some MOVs in
350     * cases where the components are representable in 8 bits.
351     * For now, we emit a MOV for each distinct value.
352     */
353    for (unsigned i = 0; i < instr->def.num_components; i++) {
354       unsigned writemask = 1 << i;
355 
356       if ((remaining & writemask) == 0)
357          continue;
358 
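      /* Gather any later components that have the same value as component i
       * so the whole group can be written with a single MOV below.
       */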
359       for (unsigned j = i; j < instr->def.num_components; j++) {
360          if ((instr->def.bit_size == 32 &&
361               instr->value[i].u32 == instr->value[j].u32) ||
362              (instr->def.bit_size == 64 &&
363               instr->value[i].f64 == instr->value[j].f64)) {
364             writemask |= 1 << j;
365          }
366       }
367 
368       reg.writemask = writemask;
369       if (instr->def.bit_size == 64) {
370          emit(MOV(reg, elk_setup_imm_df(ibld, instr->value[i].f64)));
371       } else {
372          emit(MOV(reg, elk_imm_d(instr->value[i].i32)));
373       }
374 
375       remaining &= ~writemask;
376    }
377 
378    /* Set final writemask */
379    reg.writemask = elk_writemask_for_size(instr->def.num_components);
380 
381    nir_ssa_values[instr->def.index] = reg;
382 }
383 
384 src_reg
385 vec4_visitor::get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr)
386 {
387    /* SSBO stores are weird in that their index is in src[1] */
388    const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
389 
390    if (nir_src_is_const(instr->src[src])) {
391       return elk_imm_ud(nir_src_as_uint(instr->src[src]));
392    } else {
393       return emit_uniformize(get_nir_src(instr->src[src]));
394    }
395 }
396 
397 void
398 vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
399 {
400    dst_reg dest;
401    src_reg src;
402 
403    switch (instr->intrinsic) {
404    case nir_intrinsic_decl_reg: {
405       unsigned bit_size = nir_intrinsic_bit_size(instr);
406       unsigned array_elems = nir_intrinsic_num_array_elems(instr);
407       if (array_elems == 0)
408          array_elems = 1;
409 
410       const unsigned num_regs = array_elems * DIV_ROUND_UP(bit_size, 32);
411       dst_reg reg(VGRF, alloc.allocate(num_regs));
412       if (bit_size == 64)
413          reg.type = ELK_REGISTER_TYPE_DF;
414 
415       nir_ssa_values[instr->def.index] = reg;
416       break;
417    }
418 
419    case nir_intrinsic_load_reg:
420    case nir_intrinsic_load_reg_indirect:
421    case nir_intrinsic_store_reg:
422    case nir_intrinsic_store_reg_indirect:
423       /* Nothing to do with these. */
424       break;
425 
426    case nir_intrinsic_load_input:
427    case nir_intrinsic_load_per_primitive_input: {
428       assert(instr->def.bit_size == 32);
429       /* We set EmitNoIndirectInput for VS */
430       unsigned load_offset = nir_src_as_uint(instr->src[0]);
431 
432       dest = get_nir_def(instr->def);
433 
434       src = src_reg(ATTR, nir_intrinsic_base(instr) + load_offset,
435                     glsl_uvec4_type());
436       src = retype(src, dest.type);
437 
438       /* Swizzle source based on component layout qualifier */
439       src.swizzle = ELK_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
440       emit(MOV(dest, src));
441       break;
442    }
443 
444    case nir_intrinsic_store_output: {
445       assert(nir_src_bit_size(instr->src[0]) == 32);
446       unsigned store_offset = nir_src_as_uint(instr->src[1]);
447       int varying = nir_intrinsic_base(instr) + store_offset;
448       src = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_F,
449                         instr->num_components);
450 
451       unsigned c = nir_intrinsic_component(instr);
452       output_reg[varying][c] = dst_reg(src);
453       output_num_components[varying][c] = instr->num_components;
454       break;
455    }
456 
457    case nir_intrinsic_get_ssbo_size: {
458       assert(nir_src_num_components(instr->src[0]) == 1);
459       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
460                             nir_src_as_uint(instr->src[0]) : 0;
461 
462       dst_reg result_dst = get_nir_def(instr->def);
463       vec4_instruction *inst = new(mem_ctx)
464          vec4_instruction(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, result_dst);
465 
466       inst->base_mrf = 2;
467       inst->mlen = 1; /* always at least one */
468       inst->src[1] = elk_imm_ud(ssbo_index);
469 
470       /* MRF for the first parameter */
471       src_reg lod = elk_imm_d(0);
472       int param_base = inst->base_mrf;
473       int writemask = WRITEMASK_X;
474       emit(MOV(dst_reg(MRF, param_base, glsl_int_type(), writemask), lod));
475 
476       emit(inst);
477       break;
478    }
479 
480    case nir_intrinsic_store_ssbo: {
481       assert(devinfo->ver == 7);
482 
483       /* elk_nir_lower_mem_access_bit_sizes takes care of this */
484       assert(nir_src_bit_size(instr->src[0]) == 32);
485       assert(nir_intrinsic_write_mask(instr) ==
486              (1u << instr->num_components) - 1);
487 
488       src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
489       src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]),
490                                   ELK_REGISTER_TYPE_UD);
491 
492       /* Value */
493       src_reg val_reg = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_F, 4);
494 
495       /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
496        * writes will use SIMD8 mode. In order to hide this and keep symmetry across
497        * typed and untyped messages and across hardware platforms, the
498        * current implementation of the untyped messages will transparently convert
499        * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it
500        * and enabling only channel X on the SEND instruction.
501        *
502        * The above works well for full vector writes, but not for partial writes
503        * where we want to write some channels and not others, like when we have
504        * code such as v.xyw = vec3(1,2,4). Because the untyped write messages are
505        * quite restrictive with regards to the channel enables we can configure in
506        * the message descriptor (not all combinations are allowed) we cannot simply
507        * implement these scenarios with a single message while keeping the
508        * aforementioned symmetry in the implementation. For now we de decided that
509        * aforementioned symmetry in the implementation. For now we have decided that
510        * such as the one described we end up emitting two untyped write messages
511        * (one for xy and another for w).
512        *
513        * The code below packs consecutive channels into a single write message,
514        * detects gaps in the vector write and if needed, sends a second message
515        * with the remaining channels. If in the future we decide that we want to
516        * emit a single message at the expense of losing the symmetry in the
517        * implementation we can:
518        *
519        * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8
520        *    message payload. In this mode we can write up to 8 offsets and dwords
521        *    to the red channel only (for the two vec4s in the SIMD4x2 execution)
522        *    and select which of the 8 channels carry data to write by setting the
523        *    appropriate writemask in the dst register of the SEND instruction.
524        *    It would require writing a new generator opcode specifically for
525        *    IvyBridge since we would need to prepare a SIMD8 payload that could
526        *    use any channel, not just X.
527        *
528        * 2) For Haswell+: Simply send a single write message but set the writemask
529        *    on the dst of the SEND instruction to select the channels we want to
530        *    write. It would require modifying the current messages to receive
531        *    and honor the writemask provided.
532        */
533       const vec4_builder bld = vec4_builder(this).at_end()
534                                .annotate(current_annotation, base_ir);
535 
536       emit_untyped_write(bld, surf_index, offset_reg, val_reg,
537                          1 /* dims */, instr->num_components /* size */,
538                          ELK_PREDICATE_NONE);
539       break;
540    }
541 
542    case nir_intrinsic_load_ssbo: {
543       assert(devinfo->ver == 7);
544 
545       /* elk_nir_lower_mem_access_bit_sizes takes care of this */
546       assert(instr->def.bit_size == 32);
547 
548       src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
549       src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]),
550                                   ELK_REGISTER_TYPE_UD);
551 
552       /* Read the vector */
553       const vec4_builder bld = vec4_builder(this).at_end()
554          .annotate(current_annotation, base_ir);
555 
556       src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
557                                               1 /* dims */, 4 /* size*/,
558                                               ELK_PREDICATE_NONE);
559       dst_reg dest = get_nir_def(instr->def);
560       read_result.type = dest.type;
561       read_result.swizzle = elk_swizzle_for_size(instr->num_components);
562       emit(MOV(dest, read_result));
563       break;
564    }
565 
566    case nir_intrinsic_ssbo_atomic:
567    case nir_intrinsic_ssbo_atomic_swap:
568       nir_emit_ssbo_atomic(lsc_op_to_legacy_atomic(elk_lsc_aop_for_nir_intrinsic(instr)), instr);
569       break;
570 
571    case nir_intrinsic_load_vertex_id:
572       unreachable("should be lowered by vertex_id_zero_based");
573 
574    case nir_intrinsic_load_vertex_id_zero_base:
575    case nir_intrinsic_load_base_vertex:
576    case nir_intrinsic_load_instance_id:
577    case nir_intrinsic_load_base_instance:
578    case nir_intrinsic_load_draw_id:
579    case nir_intrinsic_load_invocation_id:
580       unreachable("should be lowered by elk_nir_lower_vs_inputs()");
581 
582    case nir_intrinsic_load_uniform: {
583       /* Offsets are in bytes but they should always be multiples of 4 */
584       assert(nir_intrinsic_base(instr) % 4 == 0);
585 
586       dest = get_nir_def(instr->def);
587 
588       src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16));
589       src.type = dest.type;
590 
591       /* Uniforms don't actually have to be vec4 aligned.  When one isn't,
592        * we have to use a swizzle to shift things around.  They
593        * do still have the std140 alignment requirement that vec2's have to
594        * be vec2-aligned and vec3's and vec4's have to be vec4-aligned.
595        *
596        * The swizzle also works in the indirect case as the generator adds
597        * the swizzle to the offset for us.
598        */
599       const int type_size = type_sz(src.type);
600       unsigned shift = (nir_intrinsic_base(instr) % 16) / type_size;
601       assert(shift + instr->num_components <= 4);
602 
603       if (nir_src_is_const(instr->src[0])) {
604          const unsigned load_offset = nir_src_as_uint(instr->src[0]);
605          /* Offsets are in bytes but they should always be multiples of 4 */
606          assert(load_offset % 4 == 0);
607 
608          src.swizzle = elk_swizzle_for_size(instr->num_components);
609          dest.writemask = elk_writemask_for_size(instr->num_components);
610          unsigned offset = load_offset + shift * type_size;
611          src.offset = ROUND_DOWN_TO(offset, 16);
612          shift = (offset % 16) / type_size;
613          assert(shift + instr->num_components <= 4);
614          src.swizzle += ELK_SWIZZLE4(shift, shift, shift, shift);
615 
616          emit(MOV(dest, src));
617       } else {
618          /* Uniform arrays are vec4 aligned, because of std140 alignment
619           * rules.
620           */
621          assert(shift == 0);
622 
623          src_reg indirect = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_UD, 1);
624 
625          /* MOV_INDIRECT is going to stomp the whole thing anyway */
626          dest.writemask = WRITEMASK_XYZW;
627 
628          emit(ELK_SHADER_OPCODE_MOV_INDIRECT, dest, src,
629               indirect, elk_imm_ud(nir_intrinsic_range(instr)));
630       }
631       break;
632    }
633 
634    case nir_intrinsic_load_ubo: {
635       src_reg surf_index;
636 
637       dest = get_nir_def(instr->def);
638 
639       if (nir_src_is_const(instr->src[0])) {
640          /* The block index is a constant, so just emit the binding table entry
641           * as an immediate.
642           */
643          const unsigned index = nir_src_as_uint(instr->src[0]);
644          surf_index = elk_imm_ud(index);
645       } else {
646          /* The block index is not a constant. Evaluate the index expression
647           * per-channel and add the base UBO index; we have to select a value
648           * from any live channel.
649           */
650          surf_index = src_reg(this, glsl_uint_type());
651          emit(MOV(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32,
652                                                    instr->num_components)));
653          surf_index = emit_uniformize(surf_index);
654       }
655 
656       src_reg push_reg;
657       src_reg offset_reg;
658       if (nir_src_is_const(instr->src[1])) {
659          unsigned load_offset = nir_src_as_uint(instr->src[1]);
660          unsigned aligned_offset = load_offset & ~15;
661          offset_reg = elk_imm_ud(aligned_offset);
662 
663          /* See if we've selected this as a push constant candidate */
664          if (nir_src_is_const(instr->src[0])) {
665             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
666             const unsigned offset_256b = aligned_offset / 32;
667 
668             for (int i = 0; i < 4; i++) {
669                const struct elk_ubo_range *range = &prog_data->base.ubo_ranges[i];
670                if (range->block == ubo_block &&
671                    offset_256b >= range->start &&
672                    offset_256b < range->start + range->length) {
673 
674                   push_reg = src_reg(dst_reg(UNIFORM, UBO_START + i));
675                   push_reg.type = dest.type;
676                   push_reg.offset = aligned_offset - 32 * range->start;
677                   break;
678                }
679             }
680          }
681       } else {
682          offset_reg = src_reg(this, glsl_uint_type());
683          emit(MOV(dst_reg(offset_reg),
684                   get_nir_src(instr->src[1], nir_type_uint32, 1)));
685       }
686 
687       src_reg packed_consts;
688       if (push_reg.file != BAD_FILE) {
689          packed_consts = push_reg;
690       } else if (instr->def.bit_size == 32) {
691          packed_consts = src_reg(this, glsl_vec4_type());
692          emit_pull_constant_load_reg(dst_reg(packed_consts),
693                                      surf_index,
694                                      offset_reg,
695                                      NULL, NULL /* before_block/inst */);
696          prog_data->base.has_ubo_pull = true;
697       } else {
698          src_reg temp = src_reg(this, glsl_dvec4_type());
699          src_reg temp_float = retype(temp, ELK_REGISTER_TYPE_F);
700 
701          emit_pull_constant_load_reg(dst_reg(temp_float),
702                                      surf_index, offset_reg, NULL, NULL);
703          if (offset_reg.file == IMM)
704             offset_reg.ud += 16;
705          else
706             emit(ADD(dst_reg(offset_reg), offset_reg, elk_imm_ud(16u)));
707          emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
708                                      surf_index, offset_reg, NULL, NULL);
709          prog_data->base.has_ubo_pull = true;
710 
711          packed_consts = src_reg(this, glsl_dvec4_type());
712          shuffle_64bit_data(dst_reg(packed_consts), temp, false);
713       }
714 
715       packed_consts.swizzle = elk_swizzle_for_size(instr->num_components);
716       if (nir_src_is_const(instr->src[1])) {
717          unsigned load_offset = nir_src_as_uint(instr->src[1]);
718          unsigned type_size = type_sz(dest.type);
719          packed_consts.swizzle +=
720             ELK_SWIZZLE4(load_offset % 16 / type_size,
721                          load_offset % 16 / type_size,
722                          load_offset % 16 / type_size,
723                          load_offset % 16 / type_size);
724       }
725 
726       emit(MOV(dest, retype(packed_consts, dest.type)));
727 
728       break;
729    }
730 
731    case nir_intrinsic_barrier: {
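      /* Barriers with no memory scope need no code here; otherwise emit a
       * memory fence routed through the data cache SFID.
       */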
732       if (nir_intrinsic_memory_scope(instr) == SCOPE_NONE)
733          break;
734       const vec4_builder bld =
735          vec4_builder(this).at_end().annotate(current_annotation, base_ir);
736       const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
737       vec4_instruction *fence =
738          bld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE, tmp, elk_vec8_grf(0, 0));
739       fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
740       break;
741    }
742 
743    case nir_intrinsic_shader_clock: {
744       /* We cannot do anything if there is an event, so ignore it for now */
745       const src_reg shader_clock = get_timestamp();
746       const enum elk_reg_type type = elk_type_for_base_type(glsl_uvec2_type());
747 
748       dest = get_nir_def(instr->def, type);
749       emit(MOV(dest, shader_clock));
750       break;
751    }
752 
753    default:
754       unreachable("Unknown intrinsic");
755    }
756 }
757 
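/* Emit an untyped atomic for an SSBO atomic intrinsic: src[0] is the
 * surface index, src[1] the offset, src[2] the data operand (for operations
 * that take one) and src[3] the second operand of compare-and-swap.
 */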
758 void
759 vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
760 {
761    dst_reg dest;
762    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
763       dest = get_nir_def(instr->def);
764 
765    src_reg surface = get_nir_ssbo_intrinsic_index(instr);
766    src_reg offset = get_nir_src(instr->src[1], 1);
767    src_reg data1;
768    if (op != ELK_AOP_INC && op != ELK_AOP_DEC && op != ELK_AOP_PREDEC)
769       data1 = get_nir_src(instr->src[2], 1);
770    src_reg data2;
771    if (op == ELK_AOP_CMPWR)
772       data2 = get_nir_src(instr->src[3], 1);
773 
774    /* Emit the actual atomic operation */
775    const vec4_builder bld =
776       vec4_builder(this).at_end().annotate(current_annotation, base_ir);
777 
778    src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
779                                                data1, data2,
780                                                1 /* dims */, 1 /* rsize */,
781                                                op,
782                                                ELK_PREDICATE_NONE);
783    dest.type = atomic_result.type;
784    bld.MOV(dest, atomic_result);
785 }
786 
787 static unsigned
788 elk_swizzle_for_nir_swizzle(uint8_t swizzle[4])
789 {
790    return ELK_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
791 }
792 
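/* If the condition source of this instruction comes directly from an
 * all/any vector comparison, emit that comparison here and return the
 * matching ALL4H/ANY4H predicate so the caller can predicate on the flag
 * register instead of a materialized boolean.
 */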
793 bool
794 vec4_visitor::optimize_predicate(nir_alu_instr *instr,
795                                  enum elk_predicate *predicate)
796 {
797    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
798       return false;
799 
800    nir_alu_instr *cmp_instr =
801       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
802 
803    switch (cmp_instr->op) {
804    case nir_op_b32any_fnequal2:
805    case nir_op_b32any_inequal2:
806    case nir_op_b32any_fnequal3:
807    case nir_op_b32any_inequal3:
808    case nir_op_b32any_fnequal4:
809    case nir_op_b32any_inequal4:
810       *predicate = ELK_PREDICATE_ALIGN16_ANY4H;
811       break;
812    case nir_op_b32all_fequal2:
813    case nir_op_b32all_iequal2:
814    case nir_op_b32all_fequal3:
815    case nir_op_b32all_iequal3:
816    case nir_op_b32all_fequal4:
817    case nir_op_b32all_iequal4:
818       *predicate = ELK_PREDICATE_ALIGN16_ALL4H;
819       break;
820    default:
821       return false;
822    }
823 
824    unsigned size_swizzle =
825       elk_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);
826 
827    src_reg op[2];
828    assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
829    for (unsigned i = 0; i < 2; i++) {
830       nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i];
831       unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src);
832       type = (nir_alu_type) (((unsigned) type) | bit_size);
833       op[i] = get_nir_src(cmp_instr->src[i].src, type, 4);
834       unsigned base_swizzle =
835          elk_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
836       op[i].swizzle = elk_compose_swizzle(size_swizzle, base_swizzle);
837    }
838 
839    emit(CMP(dst_null_d(), op[0], op[1],
840             elk_cmod_for_nir_comparison(cmp_instr->op)));
841 
842    return true;
843 }
844 
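/* Convert a DF source to a 32-bit destination type: run the conversion into
 * a 64-bit temporary, then pick the low 32 bits of each channel before the
 * final move into the destination.
 */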
845 void
846 vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src)
847 {
848    enum elk_opcode op;
849    switch (dst.type) {
850    case ELK_REGISTER_TYPE_D:
851       op = ELK_VEC4_OPCODE_DOUBLE_TO_D32;
852       break;
853    case ELK_REGISTER_TYPE_UD:
854       op = ELK_VEC4_OPCODE_DOUBLE_TO_U32;
855       break;
856    case ELK_REGISTER_TYPE_F:
857       op = ELK_VEC4_OPCODE_DOUBLE_TO_F32;
858       break;
859    default:
860       unreachable("Unknown conversion");
861    }
862 
863    dst_reg temp = dst_reg(this, glsl_dvec4_type());
864    emit(MOV(temp, src));
865    dst_reg temp2 = dst_reg(this, glsl_dvec4_type());
866    emit(op, temp2, src_reg(temp));
867 
868    emit(ELK_VEC4_OPCODE_PICK_LOW_32BIT, retype(temp2, dst.type), src_reg(temp2));
869    emit(MOV(dst, src_reg(retype(temp2, dst.type))));
870 }
871 
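/* Convert a 32-bit source to a DF destination: copy the source into a
 * temporary of its own type first, then widen it with
 * ELK_VEC4_OPCODE_TO_DOUBLE.
 */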
872 void
873 vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src)
874 {
875    dst_reg tmp_dst = dst_reg(src_reg(this, glsl_dvec4_type()));
876    src_reg tmp_src = retype(src_reg(this, glsl_vec4_type()), src.type);
877    emit(MOV(dst_reg(tmp_src), src));
878    emit(ELK_VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src);
879    emit(MOV(dst, src_reg(tmp_dst)));
880 }
881 
882 /**
883  * Try to use an immediate value for a source
884  *
885  * In cases of flow control, constant propagation is sometimes unable to
886  * determine that a register contains a constant value.  To work around this,
887  * try to emit a literal as one of the sources.  If \c try_src0_also is set,
888  * \c op[0] will also be tried for an immediate value.
889  *
890  * If \c op[0] is modified, the operands will be exchanged so that \c op[1]
891  * will always be the immediate value.
892  *
893  * \return The index of the source that was modified, 0 or 1, if successful.
894  * Otherwise, -1.
895  *
896  * \param op - Operands to the instruction
897  * \param try_src0_also - True if \c op[0] should also be a candidate for
898  *                        getting an immediate value.  This should only be set
899  *                        for commutative operations.
900  */
901 static int
902 try_immediate_source(const nir_alu_instr *instr, src_reg *op,
903                      bool try_src0_also)
904 {
905    unsigned idx;
906 
907    /* MOV should be the only single-source instruction passed to this
908     * function.  Any other unary instruction with a constant source should
909     * have been constant-folded away!
910     */
911    assert(nir_op_infos[instr->op].num_inputs > 1 ||
912           instr->op == nir_op_mov);
913 
914    if (instr->op != nir_op_mov &&
915        nir_src_bit_size(instr->src[1].src) == 32 &&
916        nir_src_is_const(instr->src[1].src)) {
917       idx = 1;
918    } else if (try_src0_also &&
919          nir_src_bit_size(instr->src[0].src) == 32 &&
920          nir_src_is_const(instr->src[0].src)) {
921       idx = 0;
922    } else {
923       return -1;
924    }
925 
926    const enum elk_reg_type old_type = op[idx].type;
927 
928    switch (old_type) {
929    case ELK_REGISTER_TYPE_D:
930    case ELK_REGISTER_TYPE_UD: {
931       int first_comp = -1;
932       int d = 0;
933 
934       for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
935          if (nir_alu_instr_channel_used(instr, idx, i)) {
936             if (first_comp < 0) {
937                first_comp = i;
938                d = nir_src_comp_as_int(instr->src[idx].src,
939                                        instr->src[idx].swizzle[i]);
940             } else if (d != nir_src_comp_as_int(instr->src[idx].src,
941                                                 instr->src[idx].swizzle[i])) {
942                return -1;
943             }
944          }
945       }
946 
947       assert(first_comp >= 0);
948 
949       if (op[idx].abs)
950          d = MAX2(-d, d);
951 
952       if (op[idx].negate)
953          d = -d;
954 
955       op[idx] = retype(src_reg(elk_imm_d(d)), old_type);
956       break;
957    }
958 
959    case ELK_REGISTER_TYPE_F: {
960       int first_comp = -1;
961       float f[NIR_MAX_VEC_COMPONENTS] = { 0.0f };
962       bool is_scalar = true;
963 
964       for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
965          if (nir_alu_instr_channel_used(instr, idx, i)) {
966             f[i] = nir_src_comp_as_float(instr->src[idx].src,
967                                          instr->src[idx].swizzle[i]);
968             if (first_comp < 0) {
969                first_comp = i;
970             } else if (f[first_comp] != f[i]) {
971                is_scalar = false;
972             }
973          }
974       }
975 
976       if (is_scalar) {
977          if (op[idx].abs)
978             f[first_comp] = fabs(f[first_comp]);
979 
980          if (op[idx].negate)
981             f[first_comp] = -f[first_comp];
982 
983          op[idx] = src_reg(elk_imm_f(f[first_comp]));
984          assert(op[idx].type == old_type);
985       } else {
986          uint8_t vf_values[4] = { 0, 0, 0, 0 };
987 
988          for (unsigned i = 0; i < ARRAY_SIZE(vf_values); i++) {
989 
990             if (op[idx].abs)
991                f[i] = fabs(f[i]);
992 
993             if (op[idx].negate)
994                f[i] = -f[i];
995 
996             const int vf = elk_float_to_vf(f[i]);
997             if (vf == -1)
998                return -1;
999 
1000             vf_values[i] = vf;
1001          }
1002 
1003          op[idx] = src_reg(elk_imm_vf4(vf_values[0], vf_values[1],
1004                                        vf_values[2], vf_values[3]));
1005       }
1006       break;
1007    }
1008 
1009    default:
1010       unreachable("Non-32bit type.");
1011    }
1012 
1013    /* If the instruction has more than one source, the instruction format only
1014     * allows source 1 to be an immediate value.  If the immediate value was
1015     * source 0, then the sources must be exchanged.
1016     */
1017    if (idx == 0 && instr->op != nir_op_mov) {
1018       src_reg tmp = op[0];
1019       op[0] = op[1];
1020       op[1] = tmp;
1021    }
1022 
1023    return idx;
1024 }
1025 
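/* Three-source instructions cannot take immediate operands, so constant
 * sources have to be fixed up into registers.  When two constant sources
 * are equal, or negations of each other, share a single fixed-up register
 * between them instead of fixing each source separately.
 */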
1026 void
1027 vec4_visitor::fix_float_operands(src_reg op[3], nir_alu_instr *instr)
1028 {
1029    bool fixed[3] = { false, false, false };
1030 
1031    for (unsigned i = 0; i < 2; i++) {
1032       if (!nir_src_is_const(instr->src[i].src))
1033          continue;
1034 
1035       for (unsigned j = i + 1; j < 3; j++) {
1036          if (fixed[j])
1037             continue;
1038 
1039          if (!nir_src_is_const(instr->src[j].src))
1040             continue;
1041 
1042          if (nir_alu_srcs_equal(instr, instr, i, j)) {
1043             if (!fixed[i])
1044                op[i] = fix_3src_operand(op[i]);
1045 
1046             op[j] = op[i];
1047 
1048             fixed[i] = true;
1049             fixed[j] = true;
1050          } else if (nir_alu_srcs_negative_equal(instr, instr, i, j)) {
1051             if (!fixed[i])
1052                op[i] = fix_3src_operand(op[i]);
1053 
1054             op[j] = op[i];
1055             op[j].negate = !op[j].negate;
1056 
1057             fixed[i] = true;
1058             fixed[j] = true;
1059          }
1060       }
1061    }
1062 
1063    for (unsigned i = 0; i < 3; i++) {
1064       if (!fixed[i])
1065          op[i] = fix_3src_operand(op[i]);
1066    }
1067 }
1068 
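/* Check whether a constant scalar source fits in 16 bits of the given type;
 * used by the imul handling below to decide whether the constant can go
 * through the MUL's 16-bit source path.
 */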
1069 static bool
1070 const_src_fits_in_16_bits(const nir_src &src, elk_reg_type type)
1071 {
1072    assert(nir_src_is_const(src));
1073    if (elk_reg_type_is_unsigned_integer(type)) {
1074       return nir_src_comp_as_uint(src, 0) <= UINT16_MAX;
1075    } else {
1076       const int64_t c = nir_src_comp_as_int(src, 0);
1077       return c <= INT16_MAX && c >= INT16_MIN;
1078    }
1079 }
1080 
1081 void
1082 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
1083 {
1084    vec4_instruction *inst;
1085 
1086    nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type |
1087                                            instr->def.bit_size);
1088    dst_reg dst = get_nir_def(instr->def, dst_type);
1089    dst.writemask &= nir_component_mask(instr->def.num_components);
1090 
1091    src_reg op[4];
1092    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1093       nir_alu_type src_type = (nir_alu_type)
1094          (nir_op_infos[instr->op].input_types[i] |
1095           nir_src_bit_size(instr->src[i].src));
1096       op[i] = get_nir_src(instr->src[i].src, src_type, 4);
1097       op[i].swizzle = elk_swizzle_for_nir_swizzle(instr->src[i].swizzle);
1098    }
1099 
1100 #ifndef NDEBUG
1101    /* On Gen7 and earlier, no functionality is exposed that should allow 8-bit
1102     * integer types to ever exist.
1103     */
1104    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1105       assert(type_sz(op[i].type) > 1);
1106 #endif
1107 
1108    switch (instr->op) {
1109    case nir_op_mov:
1110       try_immediate_source(instr, &op[0], true);
1111       inst = emit(MOV(dst, op[0]));
1112       break;
1113 
1114    case nir_op_vec2:
1115    case nir_op_vec3:
1116    case nir_op_vec4:
1117       unreachable("not reached: should be handled by lower_vec_to_movs()");
1118 
1119    case nir_op_i2f32:
1120    case nir_op_u2f32:
1121       inst = emit(MOV(dst, op[0]));
1122       break;
1123 
1124    case nir_op_f2f32:
1125    case nir_op_f2i32:
1126    case nir_op_f2u32:
1127       if (nir_src_bit_size(instr->src[0].src) == 64)
1128          emit_conversion_from_double(dst, op[0]);
1129       else
1130          inst = emit(MOV(dst, op[0]));
1131       break;
1132 
1133    case nir_op_f2f64:
1134    case nir_op_i2f64:
1135    case nir_op_u2f64:
1136       emit_conversion_to_double(dst, op[0]);
1137       break;
1138 
1139    case nir_op_fsat:
1140       inst = emit(MOV(dst, op[0]));
1141       inst->saturate = true;
1142       break;
1143 
1144    case nir_op_fneg:
1145    case nir_op_ineg:
1146       op[0].negate = true;
1147       inst = emit(MOV(dst, op[0]));
1148       break;
1149 
1150    case nir_op_fabs:
1151    case nir_op_iabs:
1152       op[0].negate = false;
1153       op[0].abs = true;
1154       inst = emit(MOV(dst, op[0]));
1155       break;
1156 
1157    case nir_op_iadd:
1158       assert(instr->def.bit_size < 64);
1159       FALLTHROUGH;
1160    case nir_op_fadd:
1161       try_immediate_source(instr, op, true);
1162       inst = emit(ADD(dst, op[0], op[1]));
1163       break;
1164 
1165    case nir_op_uadd_sat:
1166       assert(instr->def.bit_size < 64);
1167       inst = emit(ADD(dst, op[0], op[1]));
1168       inst->saturate = true;
1169       break;
1170 
1171    case nir_op_fmul:
1172       try_immediate_source(instr, op, true);
1173       inst = emit(MUL(dst, op[0], op[1]));
1174       break;
1175 
1176    case nir_op_imul: {
1177       assert(instr->def.bit_size < 64);
1178 
1179       /* For integer multiplication, the MUL uses the low 16 bits of one of
1180        * the operands (src0 through SNB, src1 on IVB and later). The MACH
1181        * accumulates the contribution of the upper 16 bits of that
1182        * operand. If we can determine that one of the args is in the low
1183        * 16 bits, though, we can just emit a single MUL.
1184        */
1185       if (nir_src_is_const(instr->src[0].src) &&
1186           nir_alu_instr_src_read_mask(instr, 0) == 1 &&
1187           const_src_fits_in_16_bits(instr->src[0].src, op[0].type)) {
1188          if (devinfo->ver < 7)
1189             emit(MUL(dst, op[0], op[1]));
1190          else
1191             emit(MUL(dst, op[1], op[0]));
1192       } else if (nir_src_is_const(instr->src[1].src) &&
1193                  nir_alu_instr_src_read_mask(instr, 1) == 1 &&
1194                  const_src_fits_in_16_bits(instr->src[1].src, op[1].type)) {
1195          if (devinfo->ver < 7)
1196             emit(MUL(dst, op[1], op[0]));
1197          else
1198             emit(MUL(dst, op[0], op[1]));
1199       } else {
1200          struct elk_reg acc = retype(elk_acc_reg(8), dst.type);
1201 
1202          emit(MUL(acc, op[0], op[1]));
1203          emit(MACH(dst_null_d(), op[0], op[1]));
1204          emit(MOV(dst, src_reg(acc)));
1205       }
1206       break;
1207    }
1208 
1209    case nir_op_imul_high:
1210    case nir_op_umul_high: {
1211       assert(instr->def.bit_size < 64);
1212       struct elk_reg acc = retype(elk_acc_reg(8), dst.type);
1213 
1214       emit(MUL(acc, op[0], op[1]));
1215       emit(MACH(dst, op[0], op[1]));
1216       break;
1217    }
1218 
1219    case nir_op_frcp:
1220       inst = emit_math(ELK_SHADER_OPCODE_RCP, dst, op[0]);
1221       break;
1222 
1223    case nir_op_fexp2:
1224       inst = emit_math(ELK_SHADER_OPCODE_EXP2, dst, op[0]);
1225       break;
1226 
1227    case nir_op_flog2:
1228       inst = emit_math(ELK_SHADER_OPCODE_LOG2, dst, op[0]);
1229       break;
1230 
1231    case nir_op_fsin:
1232       inst = emit_math(ELK_SHADER_OPCODE_SIN, dst, op[0]);
1233       break;
1234 
1235    case nir_op_fcos:
1236       inst = emit_math(ELK_SHADER_OPCODE_COS, dst, op[0]);
1237       break;
1238 
1239    case nir_op_idiv:
1240    case nir_op_udiv:
1241       assert(instr->def.bit_size < 64);
1242       emit_math(ELK_SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
1243       break;
1244 
1245    case nir_op_umod:
1246    case nir_op_irem:
1247       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1248        * appears that our hardware just does the right thing for signed
1249        * remainder.
1250        */
1251       assert(instr->def.bit_size < 64);
1252       emit_math(ELK_SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1253       break;
1254 
1255    case nir_op_imod: {
1256       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1257       inst = emit_math(ELK_SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1258 
1259       /* Math instructions don't support conditional mod */
1260       inst = emit(MOV(dst_null_d(), src_reg(dst)));
1261       inst->conditional_mod = ELK_CONDITIONAL_NZ;
1262 
1263       /* Now, we need to determine if signs of the sources are different.
1264        * When we XOR the sources, the top bit is 0 if they are the same and 1
1265        * if they are different.  We can then use a conditional modifier to
1266        * turn that into a predicate.  This leads us to an XOR.l instruction.
1267        *
1268        * Technically, according to the PRM, you're not allowed to use .l on a
1269        * XOR instruction.  However, empirical experiments and Curro's reading
1270        * of the simulator source both indicate that it's safe.
1271        */
1272       src_reg tmp = src_reg(this, glsl_ivec4_type());
1273       inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
1274       inst->predicate = ELK_PREDICATE_NORMAL;
1275       inst->conditional_mod = ELK_CONDITIONAL_L;
1276 
1277       /* If the result of the initial remainder operation is non-zero and the
1278        * two sources have different signs, add in a copy of op[1] to get the
1279        * final integer modulus value.
1280        */
1281       inst = emit(ADD(dst, src_reg(dst), op[1]));
1282       inst->predicate = ELK_PREDICATE_NORMAL;
1283       break;
1284    }
1285 
1286    case nir_op_ldexp:
1287       unreachable("not reached: should be handled by ldexp_to_arith()");
1288 
1289    case nir_op_fsqrt:
1290       inst = emit_math(ELK_SHADER_OPCODE_SQRT, dst, op[0]);
1291       break;
1292 
1293    case nir_op_frsq:
1294       inst = emit_math(ELK_SHADER_OPCODE_RSQ, dst, op[0]);
1295       break;
1296 
1297    case nir_op_fpow:
1298       inst = emit_math(ELK_SHADER_OPCODE_POW, dst, op[0], op[1]);
1299       break;
1300 
1301    case nir_op_uadd_carry: {
1302       assert(instr->def.bit_size < 64);
1303       struct elk_reg acc = retype(elk_acc_reg(8), ELK_REGISTER_TYPE_UD);
1304 
1305       emit(ADDC(dst_null_ud(), op[0], op[1]));
1306       emit(MOV(dst, src_reg(acc)));
1307       break;
1308    }
1309 
1310    case nir_op_usub_borrow: {
1311       assert(instr->def.bit_size < 64);
1312       struct elk_reg acc = retype(elk_acc_reg(8), ELK_REGISTER_TYPE_UD);
1313 
1314       emit(SUBB(dst_null_ud(), op[0], op[1]));
1315       emit(MOV(dst, src_reg(acc)));
1316       break;
1317    }
1318 
1319    case nir_op_ftrunc:
1320       inst = emit(RNDZ(dst, op[0]));
1321       if (devinfo->ver < 6) {
1322          inst->conditional_mod = ELK_CONDITIONAL_R;
1323          inst = emit(ADD(dst, src_reg(dst), elk_imm_f(1.0f)));
1324          inst->predicate = ELK_PREDICATE_NORMAL;
1325          inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
1326       }
1327       break;
1328 
1329    case nir_op_fceil: {
1330       src_reg tmp = src_reg(this, glsl_float_type());
1331       tmp.swizzle = elk_swizzle_for_size(nir_src_num_components(instr->src[0].src));
1332 
1333       op[0].negate = !op[0].negate;
1334       emit(RNDD(dst_reg(tmp), op[0]));
1335       tmp.negate = true;
1336       inst = emit(MOV(dst, tmp));
1337       break;
1338    }
1339 
1340    case nir_op_ffloor:
1341       inst = emit(RNDD(dst, op[0]));
1342       break;
1343 
1344    case nir_op_ffract:
1345       inst = emit(FRC(dst, op[0]));
1346       break;
1347 
1348    case nir_op_fround_even:
1349       inst = emit(RNDE(dst, op[0]));
1350       if (devinfo->ver < 6) {
1351          inst->conditional_mod = ELK_CONDITIONAL_R;
1352          inst = emit(ADD(dst, src_reg(dst), elk_imm_f(1.0f)));
1353          inst->predicate = ELK_PREDICATE_NORMAL;
1354          inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
1355       }
1356       break;
1357 
1358    case nir_op_fquantize2f16: {
1359       /* See also vec4_visitor::emit_pack_half_2x16() */
1360       src_reg tmp16 = src_reg(this, glsl_uvec4_type());
1361       src_reg tmp32 = src_reg(this, glsl_vec4_type());
1362       src_reg zero = src_reg(this, glsl_vec4_type());
1363 
1364       /* Check for denormal */
1365       src_reg abs_src0 = op[0];
1366       abs_src0.abs = true;
1367       emit(CMP(dst_null_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1368                ELK_CONDITIONAL_L));
1369       /* Get the appropriately signed zero */
1370       emit(AND(retype(dst_reg(zero), ELK_REGISTER_TYPE_UD),
1371                retype(op[0], ELK_REGISTER_TYPE_UD),
1372                elk_imm_ud(0x80000000)));
1373       /* Do the actual F32 -> F16 -> F32 conversion */
1374       emit(F32TO16(dst_reg(tmp16), op[0]));
1375       emit(F16TO32(dst_reg(tmp32), tmp16));
1376       /* Select that or zero based on normal status */
1377       inst = emit(ELK_OPCODE_SEL, dst, zero, tmp32);
1378       inst->predicate = ELK_PREDICATE_NORMAL;
1379       break;
1380    }
1381 
1382    case nir_op_imin:
1383    case nir_op_umin:
1384       assert(instr->def.bit_size < 64);
1385       FALLTHROUGH;
1386    case nir_op_fmin:
1387       try_immediate_source(instr, op, true);
1388       inst = emit_minmax(ELK_CONDITIONAL_L, dst, op[0], op[1]);
1389       break;
1390 
1391    case nir_op_imax:
1392    case nir_op_umax:
1393       assert(instr->def.bit_size < 64);
1394       FALLTHROUGH;
1395    case nir_op_fmax:
1396       try_immediate_source(instr, op, true);
1397       inst = emit_minmax(ELK_CONDITIONAL_GE, dst, op[0], op[1]);
1398       break;
1399 
1400    case nir_op_fddx:
1401    case nir_op_fddx_coarse:
1402    case nir_op_fddx_fine:
1403    case nir_op_fddy:
1404    case nir_op_fddy_coarse:
1405    case nir_op_fddy_fine:
1406       unreachable("derivatives are not valid in vertex shaders");
1407 
1408    case nir_op_ilt32:
1409    case nir_op_ult32:
1410    case nir_op_ige32:
1411    case nir_op_uge32:
1412    case nir_op_ieq32:
1413    case nir_op_ine32:
1414       assert(instr->def.bit_size < 64);
1415       FALLTHROUGH;
1416    case nir_op_flt32:
1417    case nir_op_fge32:
1418    case nir_op_feq32:
1419    case nir_op_fneu32: {
1420       enum elk_conditional_mod conditional_mod =
1421          elk_cmod_for_nir_comparison(instr->op);
1422 
1423       if (nir_src_bit_size(instr->src[0].src) < 64) {
1424          /* If the order of the sources is changed due to an immediate value,
1425           * then the condition must also be changed.
1426           */
1427          if (try_immediate_source(instr, op, true) == 0)
1428             conditional_mod = elk_swap_cmod(conditional_mod);
1429 
1430          emit(CMP(dst, op[0], op[1], conditional_mod));
1431       } else {
1432          /* Produce a 32-bit boolean result from the DF comparison by selecting
1433           * only the low 32-bit in each DF produced. Do this in a temporary
1434           * so we can then move from there to the result using align16 again
1435           * to honor the original writemask.
1436           */
1437          dst_reg temp = dst_reg(this, glsl_dvec4_type());
1438          emit(CMP(temp, op[0], op[1], conditional_mod));
1439          dst_reg result = dst_reg(this, glsl_bvec4_type());
1440          emit(ELK_VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp));
1441          emit(MOV(dst, src_reg(result)));
1442       }
1443       break;
1444    }
1445 
1446    case nir_op_b32all_iequal2:
1447    case nir_op_b32all_iequal3:
1448    case nir_op_b32all_iequal4:
1449       assert(instr->def.bit_size < 64);
1450       FALLTHROUGH;
1451    case nir_op_b32all_fequal2:
1452    case nir_op_b32all_fequal3:
1453    case nir_op_b32all_fequal4: {
1454       unsigned swiz =
1455          elk_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1456 
1457       emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1458                elk_cmod_for_nir_comparison(instr->op)));
1459       emit(MOV(dst, elk_imm_d(0)));
1460       inst = emit(MOV(dst, elk_imm_d(~0)));
1461       inst->predicate = ELK_PREDICATE_ALIGN16_ALL4H;
1462       break;
1463    }
1464 
1465    case nir_op_b32any_inequal2:
1466    case nir_op_b32any_inequal3:
1467    case nir_op_b32any_inequal4:
1468       assert(instr->def.bit_size < 64);
1469       FALLTHROUGH;
1470    case nir_op_b32any_fnequal2:
1471    case nir_op_b32any_fnequal3:
1472    case nir_op_b32any_fnequal4: {
1473       unsigned swiz =
1474          elk_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1475 
1476       emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1477                elk_cmod_for_nir_comparison(instr->op)));
1478 
1479       emit(MOV(dst, elk_imm_d(0)));
1480       inst = emit(MOV(dst, elk_imm_d(~0)));
1481       inst->predicate = ELK_PREDICATE_ALIGN16_ANY4H;
1482       break;
1483    }
1484 
1485    case nir_op_inot:
1486       assert(instr->def.bit_size < 64);
1487       emit(NOT(dst, op[0]));
1488       break;
1489 
1490    case nir_op_ixor:
1491       assert(instr->def.bit_size < 64);
1492       try_immediate_source(instr, op, true);
1493       emit(XOR(dst, op[0], op[1]));
1494       break;
1495 
1496    case nir_op_ior:
1497       assert(instr->def.bit_size < 64);
1498       try_immediate_source(instr, op, true);
1499       emit(OR(dst, op[0], op[1]));
1500       break;
1501 
1502    case nir_op_iand:
1503       assert(instr->def.bit_size < 64);
1504       try_immediate_source(instr, op, true);
1505       emit(AND(dst, op[0], op[1]));
1506       break;
1507 
1508    case nir_op_b2i32:
1509    case nir_op_b2f32:
1510    case nir_op_b2f64:
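      /* NIR booleans are 0/~0 (all bits set).  Negating ~0 as a signed
       * integer yields 1, so the negated MOV (or double conversion) below
       * produces the expected 0/1 integer or 0.0/1.0 float result.
       */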
1511       if (instr->def.bit_size > 32) {
1512          assert(dst.type == ELK_REGISTER_TYPE_DF);
1513          emit_conversion_to_double(dst, negate(op[0]));
1514       } else {
1515          emit(MOV(dst, negate(op[0])));
1516       }
1517       break;
1518 
1519    case nir_op_unpack_half_2x16_split_x:
1520    case nir_op_unpack_half_2x16_split_y:
1521    case nir_op_pack_half_2x16_split:
1522       unreachable("not reached: should not occur in vertex shader");
1523 
1524    case nir_op_unpack_snorm_2x16:
1525    case nir_op_unpack_unorm_2x16:
1526    case nir_op_pack_snorm_2x16:
1527    case nir_op_pack_unorm_2x16:
1528       unreachable("not reached: should be handled by lower_packing_builtins");
1529 
1530    case nir_op_pack_uvec4_to_uint:
1531       unreachable("not reached");
1532 
1533    case nir_op_pack_uvec2_to_uint: {
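      /* Pack the two 16-bit values of a uvec2 into one 32-bit result:
       * tmp1 = src.y << 16, tmp2 = src.x & 0xffff, dst = tmp1 | tmp2.
       */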
1534       dst_reg tmp1 = dst_reg(this, glsl_uint_type());
1535       tmp1.writemask = WRITEMASK_X;
1536       op[0].swizzle = ELK_SWIZZLE_YYYY;
1537       emit(SHL(tmp1, op[0], src_reg(elk_imm_ud(16u))));
1538 
1539       dst_reg tmp2 = dst_reg(this, glsl_uint_type());
1540       tmp2.writemask = WRITEMASK_X;
1541       op[0].swizzle = ELK_SWIZZLE_XXXX;
1542       emit(AND(tmp2, op[0], src_reg(elk_imm_ud(0xffffu))));
1543 
1544       emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
1545       break;
1546    }
1547 
1548    case nir_op_pack_64_2x32_split: {
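      /* Each 64-bit channel is built as (op[1] << 32) | op[0]:
       * SET_LOW_32BIT writes op[0] into the low dword of every DF channel,
       * SET_HIGH_32BIT writes op[1] into the high dword, and the assembled
       * value is then moved to the destination.
       */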
1549       dst_reg result = dst_reg(this, glsl_dvec4_type());
1550       dst_reg tmp = dst_reg(this, glsl_uvec4_type());
1551       emit(MOV(tmp, retype(op[0], ELK_REGISTER_TYPE_UD)));
1552       emit(ELK_VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp));
1553       emit(MOV(tmp, retype(op[1], ELK_REGISTER_TYPE_UD)));
1554       emit(ELK_VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp));
1555       emit(MOV(dst, src_reg(result)));
1556       break;
1557    }
1558 
1559    case nir_op_unpack_64_2x32_split_x:
1560    case nir_op_unpack_64_2x32_split_y: {
1561       enum elk_opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ?
1562          ELK_VEC4_OPCODE_PICK_LOW_32BIT : ELK_VEC4_OPCODE_PICK_HIGH_32BIT;
1563       dst_reg tmp = dst_reg(this, glsl_dvec4_type());
1564       emit(MOV(tmp, op[0]));
1565       dst_reg tmp2 = dst_reg(this, glsl_uvec4_type());
1566       emit(oper, tmp2, src_reg(tmp));
1567       emit(MOV(dst, src_reg(tmp2)));
1568       break;
1569    }
1570 
1571    case nir_op_unpack_half_2x16:
1572       /* NIR does not guarantee a correct swizzle outside the boundaries of a
1573        * vector, and emit_unpack_half_2x16 uses the source operand in an
1574        * operation with WRITEMASK_Y while our source operand has only size 1,
1575        * so it accessed incorrect data and caused regressions in Piglit.
1576        * Replicate the swizzle of the first component across the remaining
1577        * components to avoid this. In the vec4_visitor IR code path this is not
1578        * needed because the operand already has the correct swizzle.
1579        */
1580       op[0].swizzle = elk_compose_swizzle(ELK_SWIZZLE_XXXX, op[0].swizzle);
1581       emit_unpack_half_2x16(dst, op[0]);
1582       break;
1583 
1584    case nir_op_pack_half_2x16:
1585       emit_pack_half_2x16(dst, op[0]);
1586       break;
1587 
1588    case nir_op_unpack_unorm_4x8:
1589       assert(instr->def.bit_size < 64);
1590       emit_unpack_unorm_4x8(dst, op[0]);
1591       break;
1592 
1593    case nir_op_pack_unorm_4x8:
1594       assert(instr->def.bit_size < 64);
1595       emit_pack_unorm_4x8(dst, op[0]);
1596       break;
1597 
1598    case nir_op_unpack_snorm_4x8:
1599       assert(instr->def.bit_size < 64);
1600       emit_unpack_snorm_4x8(dst, op[0]);
1601       break;
1602 
1603    case nir_op_pack_snorm_4x8:
1604       assert(instr->def.bit_size < 64);
1605       emit_pack_snorm_4x8(dst, op[0]);
1606       break;
1607 
1608    case nir_op_bitfield_reverse:
1609       assert(instr->def.bit_size == 32);
1610       assert(nir_src_bit_size(instr->src[0].src) == 32);
1611       emit(BFREV(dst, op[0]));
1612       break;
1613 
1614    case nir_op_bit_count:
1615       assert(instr->def.bit_size == 32);
1616       assert(nir_src_bit_size(instr->src[0].src) < 64);
1617       emit(CBIT(dst, op[0]));
1618       break;
1619 
1620    case nir_op_ifind_msb: {
1621       assert(instr->def.bit_size == 32);
1622       assert(nir_src_bit_size(instr->src[0].src) == 32);
1623       assert(devinfo->ver >= 7);
1624 
1625       vec4_builder bld = vec4_builder(this).at_end();
1626       src_reg src(dst);
1627 
1628       emit(FBH(retype(dst, ELK_REGISTER_TYPE_UD), op[0]));
1629 
1630       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1631        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1632        * subtract the result from 31 to convert the MSB count into an LSB
1633        * count.
1634        */
1635       bld.CMP(dst_null_d(), src, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1636 
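      /* Setting src[0].negate below turns the ADD into 31 - src, and the NZ
       * predicate from the CMP above leaves channels where FBH returned -1
       * (no bit found) untouched.
       */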
1637       inst = bld.ADD(dst, src, elk_imm_d(31));
1638       inst->predicate = ELK_PREDICATE_NORMAL;
1639       inst->src[0].negate = true;
1640       break;
1641    }
1642 
1643    case nir_op_uclz:
1644       assert(instr->def.bit_size == 32);
1645       assert(nir_src_bit_size(instr->src[0].src) == 32);
1646       emit(LZD(dst, op[0]));
1647       break;
1648 
1649    case nir_op_find_lsb:
1650       assert(instr->def.bit_size == 32);
1651       assert(nir_src_bit_size(instr->src[0].src) == 32);
1652       assert(devinfo->ver >= 7);
1653       emit(FBL(dst, op[0]));
1654       break;
1655 
1656    case nir_op_ubitfield_extract:
1657    case nir_op_ibitfield_extract:
1658       unreachable("should have been lowered");
1659    case nir_op_ubfe:
1660    case nir_op_ibfe:
1661       assert(instr->def.bit_size < 64);
1662       op[0] = fix_3src_operand(op[0]);
1663       op[1] = fix_3src_operand(op[1]);
1664       op[2] = fix_3src_operand(op[2]);
1665 
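      /* NIR's bfe sources are (value, offset, bits); BFE is emitted with them
       * in the reverse order, i.e. as (bits, offset, value).
       */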
1666       emit(BFE(dst, op[2], op[1], op[0]));
1667       break;
1668 
1669    case nir_op_bfm:
1670       assert(instr->def.bit_size < 64);
1671       emit(BFI1(dst, op[0], op[1]));
1672       break;
1673 
1674    case nir_op_bfi:
1675       assert(instr->def.bit_size < 64);
1676       op[0] = fix_3src_operand(op[0]);
1677       op[1] = fix_3src_operand(op[1]);
1678       op[2] = fix_3src_operand(op[2]);
1679 
1680       emit(BFI2(dst, op[0], op[1], op[2]));
1681       break;
1682 
1683    case nir_op_bitfield_insert:
1684       unreachable("not reached: should have been lowered");
1685 
1686    case nir_op_fsign:
1687        if (type_sz(op[0].type) < 8) {
1688          /* AND(val, 0x80000000) gives the sign bit.
1689           *
1690           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1691           * zero.
1692           */
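         /* For example, -2.5f (0xc0200000) ANDs down to 0x80000000, and the
          * predicated OR then yields 0xbf800000 = -1.0f, while an input of
          * 0.0f leaves the destination at 0x00000000 = 0.0f because the OR
          * is predicated off.
          */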
1693          emit(CMP(dst_null_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ));
1694 
1695          op[0].type = ELK_REGISTER_TYPE_UD;
1696          dst.type = ELK_REGISTER_TYPE_UD;
1697          emit(AND(dst, op[0], elk_imm_ud(0x80000000u)));
1698 
1699          inst = emit(OR(dst, src_reg(dst), elk_imm_ud(0x3f800000u)));
1700          inst->predicate = ELK_PREDICATE_NORMAL;
1701          dst.type = ELK_REGISTER_TYPE_F;
1702       } else {
1703           unreachable("Should have been lowered by nir_opt_algebraic.");
1704       }
1705       break;
1706 
1707    case nir_op_ishl:
1708       assert(instr->def.bit_size < 64);
1709       try_immediate_source(instr, op, false);
1710       emit(SHL(dst, op[0], op[1]));
1711       break;
1712 
1713    case nir_op_ishr:
1714       assert(instr->def.bit_size < 64);
1715       try_immediate_source(instr, op, false);
1716       emit(ASR(dst, op[0], op[1]));
1717       break;
1718 
1719    case nir_op_ushr:
1720       assert(instr->def.bit_size < 64);
1721       try_immediate_source(instr, op, false);
1722       emit(SHR(dst, op[0], op[1]));
1723       break;
1724 
1725    case nir_op_ffma:
1726       if (type_sz(dst.type) == 8) {
1727          dst_reg mul_dst = dst_reg(this, glsl_dvec4_type());
1728          emit(MUL(mul_dst, op[1], op[0]));
1729          inst = emit(ADD(dst, src_reg(mul_dst), op[2]));
1730       } else {
1731          fix_float_operands(op, instr);
1732          inst = emit(MAD(dst, op[2], op[1], op[0]));
1733       }
1734       break;
1735 
1736    case nir_op_flrp:
1737       fix_float_operands(op, instr);
1738       inst = emit(LRP(dst, op[2], op[1], op[0]));
1739       break;
1740 
1741    case nir_op_b32csel:
1742       enum elk_predicate predicate;
1743       if (!optimize_predicate(instr, &predicate)) {
1744          emit(CMP(dst_null_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ));
1745          switch (dst.writemask) {
1746          case WRITEMASK_X:
1747             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_X;
1748             break;
1749          case WRITEMASK_Y:
1750             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_Y;
1751             break;
1752          case WRITEMASK_Z:
1753             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_Z;
1754             break;
1755          case WRITEMASK_W:
1756             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_W;
1757             break;
1758          default:
1759             predicate = ELK_PREDICATE_NORMAL;
1760             break;
1761          }
1762       }
1763       inst = emit(ELK_OPCODE_SEL, dst, op[1], op[2]);
1764       inst->predicate = predicate;
1765       break;
1766 
1767    case nir_op_fdot2_replicated:
1768       try_immediate_source(instr, op, true);
1769       inst = emit(ELK_OPCODE_DP2, dst, op[0], op[1]);
1770       break;
1771 
1772    case nir_op_fdot3_replicated:
1773       try_immediate_source(instr, op, true);
1774       inst = emit(ELK_OPCODE_DP3, dst, op[0], op[1]);
1775       break;
1776 
1777    case nir_op_fdot4_replicated:
1778       try_immediate_source(instr, op, true);
1779       inst = emit(ELK_OPCODE_DP4, dst, op[0], op[1]);
1780       break;
1781 
1782    case nir_op_fdph_replicated:
1783       try_immediate_source(instr, op, false);
1784       inst = emit(ELK_OPCODE_DPH, dst, op[0], op[1]);
1785       break;
1786 
1787    case nir_op_fdiv:
1788       unreachable("not reached: should be lowered by lower_fdiv in the compiler");
1789 
1790    case nir_op_fmod:
1791       unreachable("not reached: should be lowered by lower_fmod in the compiler");
1792 
1793    case nir_op_fsub:
1794    case nir_op_isub:
1795       unreachable("not reached: should be handled by ir_sub_to_add_neg");
1796 
1797    default:
1798       unreachable("Unimplemented ALU operation");
1799    }
1800 
1801    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1802     * to sign extend the low bit to 0/~0
1803     */
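   /* For example, an unresolved boolean of 1 becomes -(1 & 1) = ~0, while 0
    * stays 0.
    */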
1804    if (devinfo->ver <= 5 &&
1805        (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) ==
1806        ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1807       dst_reg masked = dst_reg(this, glsl_int_type());
1808       masked.writemask = dst.writemask;
1809       emit(AND(masked, src_reg(dst), elk_imm_d(1)));
1810       src_reg masked_neg = src_reg(masked);
1811       masked_neg.negate = true;
1812       emit(MOV(retype(dst, ELK_REGISTER_TYPE_D), masked_neg));
1813    }
1814 }
1815 
1816 void
1817 vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
1818 {
1819    switch (instr->type) {
1820    case nir_jump_break:
1821       emit(ELK_OPCODE_BREAK);
1822       break;
1823 
1824    case nir_jump_continue:
1825       emit(ELK_OPCODE_CONTINUE);
1826       break;
1827 
1828    case nir_jump_return:
1829       FALLTHROUGH;
1830    default:
1831       unreachable("unknown jump");
1832    }
1833 }
1834 
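/* On Haswell, sampler indices of 16 and above (or indices that are not
 * compile-time constants and so might be that large) require the sampler
 * state pointer to be adjusted through the message header; such samplers
 * are treated as "high" here.
 */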
1835 static bool
1836 is_high_sampler(const struct intel_device_info *devinfo, src_reg sampler)
1837 {
1838    if (devinfo->verx10 != 75)
1839       return false;
1840 
1841    return sampler.file != IMM || sampler.ud >= 16;
1842 }
1843 
1844 void
1845 vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
1846 {
1847    unsigned texture = instr->texture_index;
1848    unsigned sampler = instr->sampler_index;
1849    src_reg texture_reg = elk_imm_ud(texture);
1850    src_reg sampler_reg = elk_imm_ud(sampler);
1851    src_reg coordinate;
1852    const glsl_type *coord_type = NULL;
1853    src_reg shadow_comparator;
1854    src_reg offset_value;
1855    src_reg lod, lod2;
1856    src_reg sample_index;
1857    src_reg mcs;
1858 
1859    dst_reg dest = get_nir_def(instr->def, instr->dest_type);
1860 
1861    /* The hardware requires a LOD for buffer textures */
1862    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
1863       lod = elk_imm_d(0);
1864 
1865    /* Load the texture operation sources */
1866    uint32_t constant_offset = 0;
1867    for (unsigned i = 0; i < instr->num_srcs; i++) {
1868       switch (instr->src[i].src_type) {
1869       case nir_tex_src_comparator:
1870          shadow_comparator = get_nir_src(instr->src[i].src,
1871                                          ELK_REGISTER_TYPE_F, 1);
1872          break;
1873 
1874       case nir_tex_src_coord: {
1875          unsigned src_size = nir_tex_instr_src_size(instr, i);
1876 
1877          switch (instr->op) {
1878          case nir_texop_txf:
1879          case nir_texop_txf_ms:
1880          case nir_texop_samples_identical:
1881             coordinate = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D,
1882                                      src_size);
1883             coord_type = glsl_ivec_type(src_size);
1884             break;
1885 
1886          default:
1887             coordinate = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1888                                      src_size);
1889             coord_type = glsl_vec_type(src_size);
1890             break;
1891          }
1892          break;
1893       }
1894 
1895       case nir_tex_src_ddx:
1896          lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1897                            nir_tex_instr_src_size(instr, i));
1898          break;
1899 
1900       case nir_tex_src_ddy:
1901          lod2 = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1902                            nir_tex_instr_src_size(instr, i));
1903          break;
1904 
1905       case nir_tex_src_lod:
1906          switch (instr->op) {
1907          case nir_texop_txs:
1908          case nir_texop_txf:
1909             lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 1);
1910             break;
1911 
1912          default:
1913             lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F, 1);
1914             break;
1915          }
1916          break;
1917 
1918       case nir_tex_src_ms_index: {
1919          sample_index = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 1);
1920          break;
1921       }
1922 
1923       case nir_tex_src_offset:
1924          if (!elk_texture_offset(instr, i, &constant_offset)) {
1925             offset_value =
1926                get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 2);
1927          }
1928          break;
1929 
1930       case nir_tex_src_texture_offset: {
1931          assert(texture_reg.is_zero());
1932          texture_reg = emit_uniformize(get_nir_src(instr->src[i].src,
1933                                                    ELK_REGISTER_TYPE_UD, 1));
1934          break;
1935       }
1936 
1937       case nir_tex_src_sampler_offset: {
1938          assert(sampler_reg.is_zero());
1939          sampler_reg = emit_uniformize(get_nir_src(instr->src[i].src,
1940                                                    ELK_REGISTER_TYPE_UD, 1));
1941          break;
1942       }
1943 
1944       case nir_tex_src_projector:
1945          unreachable("Should be lowered by nir_lower_tex");
1946 
1947       case nir_tex_src_bias:
1948          unreachable("LOD bias is not valid for vertex shaders.\n");
1949 
1950       default:
1951          unreachable("unknown texture source");
1952       }
1953    }
1954 
1955    if (instr->op == nir_texop_txf_ms ||
1956        instr->op == nir_texop_samples_identical) {
1957       assert(coord_type != NULL);
1958       if (devinfo->ver >= 7) {
1959          mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
1960       } else {
1961          mcs = elk_imm_ud(0u);
1962       }
1963    }
1964 
1965    /* Stuff the channel select bits in the top of the texture offset */
1966    if (instr->op == nir_texop_tg4) {
1967       if (instr->component == 1 &&
1968           (key_tex->gather_channel_quirk_mask & (1 << texture))) {
1969          /* gather4 sampler is broken for green channel on RG32F --
1970           * we must ask for blue instead.
1971           */
1972          constant_offset |= 2 << 16;
1973       } else {
1974          constant_offset |= instr->component << 16;
1975       }
1976    }
1977 
1978    enum elk_opcode opcode;
1979    switch (instr->op) {
1980    case nir_texop_tex:             opcode = ELK_SHADER_OPCODE_TXL;        break;
1981    case nir_texop_txl:             opcode = ELK_SHADER_OPCODE_TXL;        break;
1982    case nir_texop_txd:             opcode = ELK_SHADER_OPCODE_TXD;        break;
1983    case nir_texop_txf:             opcode = ELK_SHADER_OPCODE_TXF;        break;
1984    case nir_texop_txf_ms:          opcode = ELK_SHADER_OPCODE_TXF_CMS;    break;
1985    case nir_texop_txs:             opcode = ELK_SHADER_OPCODE_TXS;        break;
1986    case nir_texop_query_levels:    opcode = ELK_SHADER_OPCODE_TXS;        break;
1987    case nir_texop_texture_samples: opcode = ELK_SHADER_OPCODE_SAMPLEINFO; break;
1988    case nir_texop_tg4:
1989       opcode = offset_value.file != BAD_FILE ? ELK_SHADER_OPCODE_TG4_OFFSET
1990                                              : ELK_SHADER_OPCODE_TG4;
1991       break;
1992    case nir_texop_samples_identical: {
1993       /* There are some challenges implementing this for vec4, and it seems
1994        * unlikely to be used anyway.  For now, just return false ways.
1995        */
1996       emit(MOV(dest, elk_imm_ud(0u)));
1997       return;
1998    }
1999    case nir_texop_txb:
2000    case nir_texop_lod:
2001       unreachable("Implicit LOD is only valid inside fragment shaders.");
2002    default:
2003       unreachable("Unrecognized tex op");
2004    }
2005 
2006    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
2007 
2008    inst->offset = constant_offset;
2009 
2010    /* The message header is necessary for:
2011     * - Gfx4 (always)
2012     * - Texel offsets
2013     * - Gather channel selection
2014     * - Sampler indices too large to fit in a 4-bit value.
2015     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2016     */
2017    inst->header_size =
2018       (devinfo->ver < 5 ||
2019        inst->offset != 0 ||
2020        opcode == ELK_SHADER_OPCODE_TG4 ||
2021        opcode == ELK_SHADER_OPCODE_TG4_OFFSET ||
2022        opcode == ELK_SHADER_OPCODE_SAMPLEINFO ||
2023        is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2024    inst->base_mrf = 2;
2025    inst->mlen = inst->header_size;
2026    inst->dst.writemask = WRITEMASK_XYZW;
2027    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
2028 
2029    inst->src[1] = texture_reg;
2030    inst->src[2] = sampler_reg;
2031 
2032    /* MRF for the first parameter */
2033    int param_base = inst->base_mrf + inst->header_size;
2034 
2035    if (opcode == ELK_SHADER_OPCODE_TXS) {
2036       int writemask = devinfo->ver == 4 ? WRITEMASK_W : WRITEMASK_X;
2037       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2038       inst->mlen++;
2039    } else if (opcode == ELK_SHADER_OPCODE_SAMPLEINFO) {
2040       inst->dst.writemask = WRITEMASK_X;
2041    } else {
2042       /* Load the coordinate */
2043       /* FINISHME: gl_clamp_mask and saturate */
2044       int coord_mask = (1 << instr->coord_components) - 1;
2045       int zero_mask = 0xf & ~coord_mask;
2046 
2047       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2048                coordinate));
2049       inst->mlen++;
2050 
2051       if (zero_mask != 0) {
2052          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2053                   elk_imm_d(0)));
2054       }
2055       /* Load the shadow comparator */
2056       if (shadow_comparator.file != BAD_FILE &&
2057           opcode != ELK_SHADER_OPCODE_TXD &&
2058           opcode != ELK_SHADER_OPCODE_TG4_OFFSET) {
2059 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
2060 			  WRITEMASK_X),
2061 		  shadow_comparator));
2062 	 inst->mlen++;
2063       }
2064 
2065       /* Load the LOD info */
2066       switch (opcode) {
2067       case ELK_SHADER_OPCODE_TXL: {
2068 	 int mrf, writemask;
2069 	 if (devinfo->ver >= 5) {
2070 	    mrf = param_base + 1;
2071 	    if (shadow_comparator.file != BAD_FILE) {
2072 	       writemask = WRITEMASK_Y;
2073 	       /* mlen already incremented */
2074 	    } else {
2075 	       writemask = WRITEMASK_X;
2076 	       inst->mlen++;
2077 	    }
2078 	 } else /* devinfo->ver == 4 */ {
2079 	    mrf = param_base;
2080 	    writemask = WRITEMASK_W;
2081 	 }
2082 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2083          break;
2084       }
2085 
2086       case ELK_SHADER_OPCODE_TXF:
2087          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2088          break;
2089 
2090       case ELK_SHADER_OPCODE_TXF_CMS:
2091          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2092                   sample_index));
2093          if (devinfo->ver >= 7) {
2094             /* MCS data is in the first channel of `mcs`, but we need to get it into
2095              * the .y channel of the second vec4 of params, so replicate .x across
2096              * the whole vec4 and then mask off everything except .y
2097              */
2098             mcs.swizzle = ELK_SWIZZLE_XXXX;
2099             emit(MOV(dst_reg(MRF, param_base + 1, glsl_uint_type(), WRITEMASK_Y),
2100                      mcs));
2101          }
2102          inst->mlen++;
2103          break;
2104 
2105       case ELK_SHADER_OPCODE_TXD: {
2106          const elk_reg_type type = lod.type;
2107 
2108 	 if (devinfo->ver >= 5) {
2109 	    lod.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_X,ELK_SWIZZLE_X,ELK_SWIZZLE_Y,ELK_SWIZZLE_Y);
2110 	    lod2.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_X,ELK_SWIZZLE_X,ELK_SWIZZLE_Y,ELK_SWIZZLE_Y);
2111 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2112 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2113 	    inst->mlen++;
2114 
2115 	    if (nir_tex_instr_dest_size(instr) == 3 ||
2116                 shadow_comparator.file != BAD_FILE) {
2117 	       lod.swizzle = ELK_SWIZZLE_ZZZZ;
2118 	       lod2.swizzle = ELK_SWIZZLE_ZZZZ;
2119 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2120 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2121 	       inst->mlen++;
2122 
2123                if (shadow_comparator.file != BAD_FILE) {
2124                   emit(MOV(dst_reg(MRF, param_base + 2,
2125                                    shadow_comparator.type, WRITEMASK_Z),
2126                            shadow_comparator));
2127                }
2128 	    }
2129 	 } else /* devinfo->ver == 4 */ {
2130 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2131 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2132 	    inst->mlen += 2;
2133 	 }
2134          break;
2135       }
2136 
2137       case ELK_SHADER_OPCODE_TG4_OFFSET:
2138          if (shadow_comparator.file != BAD_FILE) {
2139             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
2140                      shadow_comparator));
2141          }
2142 
2143          emit(MOV(dst_reg(MRF, param_base + 1, glsl_ivec2_type(), WRITEMASK_XY),
2144                   offset_value));
2145          inst->mlen++;
2146          break;
2147 
2148       default:
2149          break;
2150       }
2151    }
2152 
2153    emit(inst);
2154 
2155    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2156     * spec requires layers.
2157     */
2158    if (instr->op == nir_texop_txs && devinfo->ver < 7) {
2159       /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
2160       emit_minmax(ELK_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
2161                   src_reg(inst->dst), elk_imm_d(1));
2162    }
2163 
2164    if (instr->op == nir_texop_query_levels) {
2165       /* # levels is in .w */
2166       src_reg swizzled(dest);
2167       swizzled.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_W, ELK_SWIZZLE_W,
2168                                       ELK_SWIZZLE_W, ELK_SWIZZLE_W);
2169       emit(MOV(dest, swizzled));
2170    }
2171 }
2172 
2173 src_reg
2174 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2175                              src_reg coordinate, src_reg surface)
2176 {
2177    vec4_instruction *inst =
2178       new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_TXF_MCS,
2179                                     dst_reg(this, glsl_uvec4_type()));
2180    inst->base_mrf = 2;
2181    inst->src[1] = surface;
2182    inst->src[2] = elk_imm_ud(0); /* sampler */
2183    inst->mlen = 1;
2184 
2185    const int param_base = inst->base_mrf;
2186 
2187    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2188    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2189    int zero_mask = 0xf & ~coord_mask;
2190 
2191    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2192             coordinate));
2193 
2194    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2195             elk_imm_d(0)));
2196 
2197    emit(inst);
2198    return src_reg(inst->dst);
2199 }
2200 
2201 void
2202 vec4_visitor::nir_emit_undef(nir_undef_instr *instr)
2203 {
2204    nir_ssa_values[instr->def.index] =
2205       dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32)));
2206 }
2207 
2208 /* SIMD4x2 64bit data is stored in register space like this:
2209  *
2210  * r0.0:DF  x0 y0 z0 w0
2211  * r1.0:DF  x1 y1 z1 w1
2212  *
2213  * When we need to write data such as this to memory using 32-bit write
2214  * messages we need to shuffle it in this fashion:
2215  *
2216  * r0.0:DF  x0 y0 x1 y1 (to be written at base offset)
2217  * r0.0:DF  z0 w0 z1 w1 (to be written at base offset + 16)
2218  *
2219  * We need to do the inverse operation when we read using 32-bit messages,
2220  * which we can do by applying the exact same shuffling to the 64-bit data
2221  * read, except that, because the data for each vertex is positioned
2222  * differently, we need to apply different channel enables.
2223  *
2224  * This function takes 64bit data and shuffles it as explained above.
2225  *
2226  * The @for_write parameter specifies whether the shuffling is being done on
2227  * proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit
2228  * write message (for_write = true), or whether we are doing the inverse
2229  * operation on 64-bit data that was just read using 32-bit messages and
2230  * needs to be shuffled into valid SIMD4x2 64-bit data (for_write = false).
2231  *
2232  * If @block and @ref are non-NULL, then the shuffling is done after @ref,
2233  * otherwise the instructions are emitted normally at the end. The function
2234  * returns the last instruction inserted.
2235  *
2236  * Notice that @src and @dst cannot be the same register.
2237  */
2238 vec4_instruction *
2239 vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
2240                                  bool for_scratch,
2241                                  elk_bblock_t *block, vec4_instruction *ref)
2242 {
2243    assert(type_sz(src.type) == 8);
2244    assert(type_sz(dst.type) == 8);
2245    assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
2246    assert(!ref == !block);
2247 
2248    elk_opcode mov_op = for_scratch ? ELK_VEC4_OPCODE_MOV_FOR_SCRATCH : ELK_OPCODE_MOV;
2249 
2250    const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
2251                                    vec4_builder(this).at(block, ref->next);
2252 
2253    /* Resolve swizzle in src */
2254    if (src.swizzle != ELK_SWIZZLE_XYZW) {
2255       dst_reg data = dst_reg(this, glsl_dvec4_type());
2256       bld.emit(mov_op, data, src);
2257       src = src_reg(data);
2258    }
2259 
2260    /* dst+0.XY = src+0.XY */
2261    bld.group(4, 0).emit(mov_op, writemask(dst, WRITEMASK_XY), src);
2262 
2263    /* dst+0.ZW = src+1.XY */
2264    bld.group(4, for_write ? 1 : 0)
2265             .emit(mov_op, writemask(dst, WRITEMASK_ZW),
2266                   swizzle(byte_offset(src, REG_SIZE), ELK_SWIZZLE_XYXY));
2267 
2268    /* dst+1.XY = src+0.ZW */
2269    bld.group(4, for_write ? 0 : 1)
2270             .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
2271                   swizzle(src, ELK_SWIZZLE_ZWZW));
2272 
2273    /* dst+1.ZW = src+1.ZW */
2274    return bld.group(4, 1)
2275             .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
2276                   byte_offset(src, REG_SIZE));
2277 }
2278 
2279 }
2280