1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "elk_fs.h"
25 #include "elk_fs_builder.h"
26 #include "elk_nir.h"
27 #include "elk_nir_private.h"
28 #include "elk_eu.h"
29 #include "nir.h"
30 #include "nir_intrinsics.h"
31 #include "nir_search_helpers.h"
32 #include "util/u_math.h"
33 #include "util/bitscan.h"
34
35 #include <vector>
36
37 using namespace elk;
38
39 struct elk_fs_bind_info {
40 bool valid;
41 bool bindless;
42 unsigned block;
43 unsigned set;
44 unsigned binding;
45 };
46
47 struct nir_to_elk_state {
48 elk_fs_visitor &s;
49 const nir_shader *nir;
50 const intel_device_info *devinfo;
51 void *mem_ctx;
52
53 /* Points to the end of the program. Annotated with the current NIR
54 * instruction when applicable.
55 */
56 fs_builder bld;
57
58 elk_fs_reg *ssa_values;
59 elk_fs_inst **resource_insts;
60 struct elk_fs_bind_info *ssa_bind_infos;
61 elk_fs_reg *resource_values;
62 elk_fs_reg *system_values;
63 };
64
65 static elk_fs_reg get_nir_src(nir_to_elk_state &ntb, const nir_src &src);
66 static elk_fs_reg get_nir_def(nir_to_elk_state &ntb, const nir_def &def);
67 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
68
69 static void fs_nir_emit_intrinsic(nir_to_elk_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
70 static elk_fs_reg emit_samplepos_setup(nir_to_elk_state &ntb);
71 static elk_fs_reg emit_sampleid_setup(nir_to_elk_state &ntb);
72 static elk_fs_reg emit_samplemaskin_setup(nir_to_elk_state &ntb);
73
74 static void fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl);
75 static void fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list);
76 static void fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt);
77 static void fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop);
78 static void fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block);
79 static void fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr);
80
81 static void fs_nir_emit_surface_atomic(nir_to_elk_state &ntb,
82 const fs_builder &bld,
83 nir_intrinsic_instr *instr,
84 elk_fs_reg surface,
85 bool bindless);
86 static void fs_nir_emit_global_atomic(nir_to_elk_state &ntb,
87 const fs_builder &bld,
88 nir_intrinsic_instr *instr);
89
90 static void
91 fs_nir_setup_outputs(nir_to_elk_state &ntb)
92 {
93 elk_fs_visitor &s = ntb.s;
94
95 if (s.stage == MESA_SHADER_TESS_CTRL ||
96 s.stage == MESA_SHADER_FRAGMENT)
97 return;
98
99 unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
100
101 /* Calculate the size of output registers in a separate pass, before
102 * allocating them. With ARB_enhanced_layouts, multiple output variables
103 * may occupy the same slot, but have different type sizes.
104 */
105 nir_foreach_shader_out_variable(var, s.nir) {
106 const int loc = var->data.driver_location;
107 const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
108 vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
109 }
110
111 for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
112 if (vec4s[loc] == 0) {
113 loc++;
114 continue;
115 }
116
117 unsigned reg_size = vec4s[loc];
118
119 /* Check if there are any ranges that start within this range and extend
120 * past it. If so, include them in this allocation.
121 */
122 for (unsigned i = 1; i < reg_size; i++) {
123 assert(i + loc < ARRAY_SIZE(vec4s));
124 reg_size = MAX2(vec4s[i + loc] + i, reg_size);
125 }
126
127 elk_fs_reg reg = ntb.bld.vgrf(ELK_REGISTER_TYPE_F, 4 * reg_size);
128 for (unsigned i = 0; i < reg_size; i++) {
129 assert(loc + i < ARRAY_SIZE(s.outputs));
130 s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
131 }
132
133 loc += reg_size;
134 }
135 }
136
137 static void
138 fs_nir_setup_uniforms(elk_fs_visitor &s)
139 {
140 /* Only the first compile gets to set up uniforms. */
141 if (s.push_constant_loc)
142 return;
143
144 s.uniforms = s.nir->num_uniforms / 4;
145
146 if (gl_shader_stage_is_compute(s.stage)) {
147 /* Add uniforms for builtins after regular NIR uniforms. */
148 assert(s.uniforms == s.prog_data->nr_params);
149
150 /* Subgroup ID must be the last uniform on the list. This will make it
151 * easier later to split between cross-thread and per-thread
152 * uniforms.
153 */
154 uint32_t *param = elk_stage_prog_data_add_params(s.prog_data, 1);
155 *param = ELK_PARAM_BUILTIN_SUBGROUP_ID;
156 s.uniforms++;
157 }
158 }
159
160 static elk_fs_reg
161 emit_work_group_id_setup(nir_to_elk_state &ntb)
162 {
163 elk_fs_visitor &s = ntb.s;
164 const fs_builder &bld = ntb.bld;
165
166 assert(gl_shader_stage_is_compute(s.stage));
167
168 elk_fs_reg id = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
169
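/* The three workgroup IDs are read straight from the fixed compute thread
 * payload: X from g0.1, Y from g0.6 and Z from g0.7.
 */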
170 struct elk_reg r0_1(retype(elk_vec1_grf(0, 1), ELK_REGISTER_TYPE_UD));
171 bld.MOV(id, r0_1);
172
173 struct elk_reg r0_6(retype(elk_vec1_grf(0, 6), ELK_REGISTER_TYPE_UD));
174 struct elk_reg r0_7(retype(elk_vec1_grf(0, 7), ELK_REGISTER_TYPE_UD));
175 bld.MOV(offset(id, bld, 1), r0_6);
176 bld.MOV(offset(id, bld, 2), r0_7);
177
178 return id;
179 }
180
181 static bool
182 emit_system_values_block(nir_to_elk_state &ntb, nir_block *block)
183 {
184 elk_fs_visitor &s = ntb.s;
185 elk_fs_reg *reg;
186
187 nir_foreach_instr(instr, block) {
188 if (instr->type != nir_instr_type_intrinsic)
189 continue;
190
191 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
192 switch (intrin->intrinsic) {
193 case nir_intrinsic_load_vertex_id:
194 case nir_intrinsic_load_base_vertex:
195 unreachable("should be lowered by nir_lower_system_values().");
196
197 case nir_intrinsic_load_vertex_id_zero_base:
198 case nir_intrinsic_load_is_indexed_draw:
199 case nir_intrinsic_load_first_vertex:
200 case nir_intrinsic_load_instance_id:
201 case nir_intrinsic_load_base_instance:
202 unreachable("should be lowered by elk_nir_lower_vs_inputs().");
203 break;
204
205 case nir_intrinsic_load_draw_id:
206 unreachable("should be lowered by elk_nir_lower_vs_inputs().");
207 break;
208
209 case nir_intrinsic_load_invocation_id:
210 if (s.stage == MESA_SHADER_TESS_CTRL)
211 break;
212 assert(s.stage == MESA_SHADER_GEOMETRY);
213 reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
214 if (reg->file == BAD_FILE) {
215 *reg = s.gs_payload().instance_id;
216 }
217 break;
218
219 case nir_intrinsic_load_sample_pos:
220 case nir_intrinsic_load_sample_pos_or_center:
221 assert(s.stage == MESA_SHADER_FRAGMENT);
222 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
223 if (reg->file == BAD_FILE)
224 *reg = emit_samplepos_setup(ntb);
225 break;
226
227 case nir_intrinsic_load_sample_id:
228 assert(s.stage == MESA_SHADER_FRAGMENT);
229 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
230 if (reg->file == BAD_FILE)
231 *reg = emit_sampleid_setup(ntb);
232 break;
233
234 case nir_intrinsic_load_sample_mask_in:
235 assert(s.stage == MESA_SHADER_FRAGMENT);
236 assert(s.devinfo->ver >= 7);
237 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
238 if (reg->file == BAD_FILE)
239 *reg = emit_samplemaskin_setup(ntb);
240 break;
241
242 case nir_intrinsic_load_workgroup_id:
243 assert(gl_shader_stage_is_compute(s.stage));
244 reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
245 if (reg->file == BAD_FILE)
246 *reg = emit_work_group_id_setup(ntb);
247 break;
248
249 case nir_intrinsic_load_helper_invocation:
250 assert(s.stage == MESA_SHADER_FRAGMENT);
251 reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
252 if (reg->file == BAD_FILE) {
253 const fs_builder abld =
254 ntb.bld.annotate("gl_HelperInvocation", NULL);
255
256 /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
257 * pixel mask is in g1.7 of the thread payload.
258 *
259 * We move the per-channel pixel enable bit to the low bit of each
260 * channel by shifting the byte containing the pixel mask by the
261 * vector immediate 0x76543210UV.
262 *
263 * The region of <1,8,0> reads only 1 byte (the pixel masks for
264 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
265 * masks for 2 and 3) in SIMD16.
266 */
267 elk_fs_reg shifted = abld.vgrf(ELK_REGISTER_TYPE_UW, 1);
268
269 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
270 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
271 /* According to the "PS Thread Payload for Normal
272 * Dispatch" pages on the BSpec, the dispatch mask is
273 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
274 * gfx6+.
275 */
276 const struct elk_reg reg = elk_vec1_grf(i + 1, 7);
277 hbld.SHR(offset(shifted, hbld, i),
278 stride(retype(reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
279 elk_imm_v(0x76543210));
280 }
281
282 /* A set bit in the pixel mask means the channel is enabled, but
283 * that is the opposite of gl_HelperInvocation so we need to invert
284 * the mask.
285 *
286 * The negate source-modifier bit of logical instructions on Gfx8+
287 * performs 1's complement negation, so we can use that instead of
288 * a NOT instruction.
289 */
290 elk_fs_reg inverted = negate(shifted);
291 if (s.devinfo->ver < 8) {
292 inverted = abld.vgrf(ELK_REGISTER_TYPE_UW);
293 abld.NOT(inverted, shifted);
294 }
295
296 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
297 * with 1 and negating.
298 */
299 elk_fs_reg anded = abld.vgrf(ELK_REGISTER_TYPE_UD, 1);
300 abld.AND(anded, inverted, elk_imm_uw(1));
301
302 elk_fs_reg dst = abld.vgrf(ELK_REGISTER_TYPE_D, 1);
303 abld.MOV(dst, negate(retype(anded, ELK_REGISTER_TYPE_D)));
304 *reg = dst;
305 }
306 break;
307
308 default:
309 break;
310 }
311 }
312
313 return true;
314 }
315
316 static void
317 fs_nir_emit_system_values(nir_to_elk_state &ntb)
318 {
319 const fs_builder &bld = ntb.bld;
320 elk_fs_visitor &s = ntb.s;
321
322 ntb.system_values = ralloc_array(ntb.mem_ctx, elk_fs_reg, SYSTEM_VALUE_MAX);
323 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
324 ntb.system_values[i] = elk_fs_reg();
325 }
326
327 /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we
328 * never end up using it.
329 */
330 {
331 const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
332 elk_fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
333 reg = abld.vgrf(ELK_REGISTER_TYPE_UW);
334 abld.UNDEF(reg);
335
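/* Build the per-channel invocation index: the vector immediate 0x76543210
 * fills channels 0-7, and the ADDs below extend the pattern to channels
 * 8-15 and 16-31 for the wider dispatch widths.
 */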
336 const fs_builder allbld8 = abld.group(8, 0).exec_all();
337 allbld8.MOV(reg, elk_imm_v(0x76543210));
338 if (s.dispatch_width > 8)
339 allbld8.ADD(byte_offset(reg, 16), reg, elk_imm_uw(8u));
340 if (s.dispatch_width > 16) {
341 const fs_builder allbld16 = abld.group(16, 0).exec_all();
342 allbld16.ADD(byte_offset(reg, 32), reg, elk_imm_uw(16u));
343 }
344 }
345
346 nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
347 nir_foreach_block(block, impl)
348 emit_system_values_block(ntb, block);
349 }
350
351 static void
352 fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl)
353 {
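/* Per-SSA-def lookaside tables, indexed by nir_def::index. */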
354 ntb.ssa_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
355 ntb.resource_insts = rzalloc_array(ntb.mem_ctx, elk_fs_inst *, impl->ssa_alloc);
356 ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct elk_fs_bind_info, impl->ssa_alloc);
357 ntb.resource_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
358
359 fs_nir_emit_cf_list(ntb, &impl->body);
360 }
361
362 static void
363 fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list)
364 {
365 exec_list_validate(list);
366 foreach_list_typed(nir_cf_node, node, node, list) {
367 switch (node->type) {
368 case nir_cf_node_if:
369 fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
370 break;
371
372 case nir_cf_node_loop:
373 fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
374 break;
375
376 case nir_cf_node_block:
377 fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
378 break;
379
380 default:
381 unreachable("Invalid CFG node block");
382 }
383 }
384 }
385
386 static void
387 fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt)
388 {
389 const intel_device_info *devinfo = ntb.devinfo;
390 const fs_builder &bld = ntb.bld;
391
392 bool invert;
393 elk_fs_reg cond_reg;
394
395 /* If the condition has the form !other_condition, use other_condition as
396 * the source, but invert the predicate on the if instruction.
397 */
398 nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
399 if (cond != NULL && cond->op == nir_op_inot) {
400 invert = true;
401 cond_reg = get_nir_src(ntb, cond->src[0].src);
402 cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
403
404 if (devinfo->ver <= 5 &&
405 (cond->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
406 /* redo boolean resolve on gen5 */
407 elk_fs_reg masked = ntb.s.vgrf(glsl_int_type());
408 bld.AND(masked, cond_reg, elk_imm_d(1));
409 masked.negate = true;
410 elk_fs_reg tmp = bld.vgrf(cond_reg.type);
411 bld.MOV(retype(tmp, ELK_REGISTER_TYPE_D), masked);
412 cond_reg = tmp;
413 }
414 } else {
415 invert = false;
416 cond_reg = get_nir_src(ntb, if_stmt->condition);
417 }
418
419 /* first, put the condition into f0 */
420 elk_fs_inst *inst = bld.MOV(bld.null_reg_d(),
421 retype(cond_reg, ELK_REGISTER_TYPE_D));
422 inst->conditional_mod = ELK_CONDITIONAL_NZ;
423
424 bld.IF(ELK_PREDICATE_NORMAL)->predicate_inverse = invert;
425
426 fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
427
428 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
429 bld.emit(ELK_OPCODE_ELSE);
430 fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
431 }
432
433 bld.emit(ELK_OPCODE_ENDIF);
434
435 if (devinfo->ver < 7)
436 ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
437 "in SIMD32 mode.");
438 }
439
440 static void
441 fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop)
442 {
443 const intel_device_info *devinfo = ntb.devinfo;
444 const fs_builder &bld = ntb.bld;
445
446 assert(!nir_loop_has_continue_construct(loop));
447 bld.emit(ELK_OPCODE_DO);
448
449 fs_nir_emit_cf_list(ntb, &loop->body);
450
451 bld.emit(ELK_OPCODE_WHILE);
452
453 if (devinfo->ver < 7)
454 ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
455 "in SIMD32 mode.");
456 }
457
458 static void
459 fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block)
460 {
461 fs_builder bld = ntb.bld;
462
463 nir_foreach_instr(instr, block) {
464 fs_nir_emit_instr(ntb, instr);
465 }
466
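/* Emitting instructions may have re-annotated ntb.bld; restore the builder
 * captured at block entry so the annotation does not leak past this block.
 */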
467 ntb.bld = bld;
468 }
469
470 /**
471 * Recognizes an i2f32/u2f32 whose source is a nir_op_extract_* and folds the
472 * pair into a single MOV that converts the selected byte or word to float.
473 */
474 static bool
475 optimize_extract_to_float(nir_to_elk_state &ntb, nir_alu_instr *instr,
476 const elk_fs_reg &result)
477 {
478 const intel_device_info *devinfo = ntb.devinfo;
479 const fs_builder &bld = ntb.bld;
480
481 /* No fast path for f16 or f64. */
482 assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
483
484 if (!instr->src[0].src.ssa->parent_instr)
485 return false;
486
487 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
488 return false;
489
490 nir_alu_instr *src0 =
491 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
492
493 unsigned bytes;
494 bool is_signed;
495
496 switch (src0->op) {
497 case nir_op_extract_u8:
498 case nir_op_extract_u16:
499 bytes = src0->op == nir_op_extract_u8 ? 1 : 2;
500
501 /* i2f(extract_u8(a, b)) and u2f(extract_u8(a, b)) produce the same
502 * result. Ditto for extract_u16.
503 */
504 is_signed = false;
505 break;
506
507 case nir_op_extract_i8:
508 case nir_op_extract_i16:
509 bytes = src0->op == nir_op_extract_i8 ? 1 : 2;
510
511 /* The fast path can't handle u2f(extract_i8(a, b)) because the implicit
512 * sign extension of the extract_i8 is lost. For example,
513 * u2f(extract_i8(0x0000ff00, 1)) should produce 4294967295.0, but a
514 * fast path could either give 255.0 (by implementing the fast path as
515 * u2f(extract_u8(x))) or -1.0 (by implementing the fast path as
516 * i2f(extract_i8(x))). At one point in time, we incorrectly implemented
517 * the former.
518 */
519 if (instr->op != nir_op_i2f32)
520 return false;
521
522 is_signed = true;
523 break;
524
525 default:
526 return false;
527 }
528
529 unsigned element = nir_src_as_uint(src0->src[1].src);
530
531 /* Element type to extract. */
532 const elk_reg_type type = elk_int_type(bytes, is_signed);
533
534 elk_fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
535 op0.type = elk_type_for_nir_type(devinfo,
536 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
537 nir_src_bit_size(src0->src[0].src)));
538 op0 = offset(op0, bld, src0->src[0].swizzle[0]);
539
540 bld.MOV(result, subscript(op0, type, element));
541 return true;
542 }
543
544 static bool
545 optimize_frontfacing_ternary(nir_to_elk_state &ntb,
546 nir_alu_instr *instr,
547 const elk_fs_reg &result)
548 {
549 const intel_device_info *devinfo = ntb.devinfo;
550 elk_fs_visitor &s = ntb.s;
551
552 nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
553 if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
554 return false;
555
556 if (!nir_src_is_const(instr->src[1].src) ||
557 !nir_src_is_const(instr->src[2].src))
558 return false;
559
560 const float value1 = nir_src_as_float(instr->src[1].src);
561 const float value2 = nir_src_as_float(instr->src[2].src);
562 if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
563 return false;
564
565 /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
566 assert(value1 == -value2);
567
568 elk_fs_reg tmp = s.vgrf(glsl_int_type());
569
570 if (devinfo->ver >= 6) {
571 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
572 elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
573
574 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
575 *
576 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
577 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
578 *
579 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
580 *
581 * This negation looks like it's safe in practice, because bits 0:4 will
582 * surely be TRIANGLES
583 */
584
585 if (value1 == -1.0f) {
586 g0.negate = true;
587 }
588
589 ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_W, 1),
590 g0, elk_imm_uw(0x3f80));
591 } else {
592 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
593 elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
594
595 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
596 *
597 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
598 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
599 *
600 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
601 *
602 * This negation looks like it's safe in practice, because bits 0:4 will
603 * surely be TRIANGLES
604 */
605
606 if (value1 == -1.0f) {
607 g1_6.negate = true;
608 }
609
610 ntb.bld.OR(tmp, g1_6, elk_imm_d(0x3f800000));
611 }
612 ntb.bld.AND(retype(result, ELK_REGISTER_TYPE_D), tmp, elk_imm_d(0xbf800000));
613
614 return true;
615 }
616
617 static elk_rnd_mode
618 elk_rnd_mode_from_nir_op(const nir_op op) {
619 switch (op) {
620 case nir_op_f2f16_rtz:
621 return ELK_RND_MODE_RTZ;
622 case nir_op_f2f16_rtne:
623 return ELK_RND_MODE_RTNE;
624 default:
625 unreachable("Operation doesn't support rounding mode");
626 }
627 }
628
629 static elk_rnd_mode
630 elk_rnd_mode_from_execution_mode(unsigned execution_mode)
631 {
632 if (nir_has_any_rounding_mode_rtne(execution_mode))
633 return ELK_RND_MODE_RTNE;
634 if (nir_has_any_rounding_mode_rtz(execution_mode))
635 return ELK_RND_MODE_RTZ;
636 return ELK_RND_MODE_UNSPECIFIED;
637 }
638
639 static elk_fs_reg
640 prepare_alu_destination_and_sources(nir_to_elk_state &ntb,
641 const fs_builder &bld,
642 nir_alu_instr *instr,
643 elk_fs_reg *op,
644 bool need_dest)
645 {
646 const intel_device_info *devinfo = ntb.devinfo;
647
648 elk_fs_reg result =
649 need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
650
651 result.type = elk_type_for_nir_type(devinfo,
652 (nir_alu_type)(nir_op_infos[instr->op].output_type |
653 instr->def.bit_size));
654
655 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
656 op[i] = get_nir_src(ntb, instr->src[i].src);
657 op[i].type = elk_type_for_nir_type(devinfo,
658 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
659 nir_src_bit_size(instr->src[i].src)));
660 }
661
662 /* Move and vecN instructions may still be vectorized. Return the raw,
663 * vectorized source and destination so that elk_fs_visitor::nir_emit_alu can
664 * handle it. Other callers should not have to handle these kinds of
665 * instructions.
666 */
667 switch (instr->op) {
668 case nir_op_mov:
669 case nir_op_vec2:
670 case nir_op_vec3:
671 case nir_op_vec4:
672 case nir_op_vec8:
673 case nir_op_vec16:
674 return result;
675 default:
676 break;
677 }
678
679 /* At this point, we have dealt with any instruction that operates on
680 * more than a single channel. Therefore, we can just adjust the source
681 * and destination registers for that channel and emit the instruction.
682 */
683 unsigned channel = 0;
684 if (nir_op_infos[instr->op].output_size == 0) {
685 /* Since NIR is doing the scalarizing for us, we should only ever see
686 * vectorized operations with a single channel.
687 */
688 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
689 assert(util_bitcount(write_mask) == 1);
690 channel = ffs(write_mask) - 1;
691
692 result = offset(result, bld, channel);
693 }
694
695 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
696 assert(nir_op_infos[instr->op].input_sizes[i] < 2);
697 op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
698 }
699
700 return result;
701 }
702
703 static elk_fs_reg
704 resolve_source_modifiers(const fs_builder &bld, const elk_fs_reg &src)
705 {
706 if (!src.abs && !src.negate)
707 return src;
708
709 elk_fs_reg temp = bld.vgrf(src.type);
710 bld.MOV(temp, src);
711
712 return temp;
713 }
714
715 static void
716 resolve_inot_sources(nir_to_elk_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
717 elk_fs_reg *op)
718 {
719 for (unsigned i = 0; i < 2; i++) {
720 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
721
722 if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
723 /* The source of the inot is now the source of instr. */
724 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
725
726 assert(!op[i].negate);
727 op[i].negate = true;
728 } else {
729 op[i] = resolve_source_modifiers(bld, op[i]);
730 }
731 }
732 }
733
734 static bool
735 try_emit_b2fi_of_inot(nir_to_elk_state &ntb, const fs_builder &bld,
736 elk_fs_reg result,
737 nir_alu_instr *instr)
738 {
739 const intel_device_info *devinfo = bld.shader->devinfo;
740
741 if (devinfo->ver < 6)
742 return false;
743
744 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
745
746 if (inot_instr == NULL || inot_instr->op != nir_op_inot)
747 return false;
748
749 /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
750 * of valid size-changing combinations is a bit more complex.
751 *
752 * The source restriction is just because I was lazy about generating the
753 * constant below.
754 */
755 if (instr->def.bit_size != 32 ||
756 nir_src_bit_size(inot_instr->src[0].src) != 32)
757 return false;
758
759 /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
760 * this is float(1 + a).
761 */
762 elk_fs_reg op;
763
764 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
765
766 /* Ignore the saturate modifier, if there is one. The result of the
767 * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
768 */
769 bld.ADD(result, op, elk_imm_d(1));
770
771 return true;
772 }
773
774 /**
775 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
776 *
777 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
778 * the source of \c instr that is a \c nir_op_fsign.
779 */
780 static void
781 emit_fsign(nir_to_elk_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
782 elk_fs_reg result, elk_fs_reg *op, unsigned fsign_src)
783 {
784 const intel_device_info *devinfo = ntb.devinfo;
785
786 elk_fs_inst *inst;
787
788 assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
789 assert(fsign_src < nir_op_infos[instr->op].num_inputs);
790
791 if (instr->op != nir_op_fsign) {
792 const nir_alu_instr *const fsign_instr =
793 nir_src_as_alu_instr(instr->src[fsign_src].src);
794
795 /* op[fsign_src] has the nominal result of the fsign, and op[1 -
796 * fsign_src] has the other multiply source. This must be rearranged so
797 * that op[0] is the source of the fsign and op[1] is the other multiply
798 * source.
799 */
800 if (fsign_src != 0)
801 op[1] = op[0];
802
803 op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
804
805 const nir_alu_type t =
806 (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
807 nir_src_bit_size(fsign_instr->src[0].src));
808
809 op[0].type = elk_type_for_nir_type(devinfo, t);
810
811 unsigned channel = 0;
812 if (nir_op_infos[instr->op].output_size == 0) {
813 /* Since NIR is doing the scalarizing for us, we should only ever see
814 * vectorized operations with a single channel.
815 */
816 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
817 assert(util_bitcount(write_mask) == 1);
818 channel = ffs(write_mask) - 1;
819 }
820
821 op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
822 }
823
824 if (type_sz(op[0].type) == 2) {
825 /* AND(val, 0x8000) gives the sign bit.
826 *
827 * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
828 */
829 elk_fs_reg zero = retype(elk_imm_uw(0), ELK_REGISTER_TYPE_HF);
830 bld.CMP(bld.null_reg_f(), op[0], zero, ELK_CONDITIONAL_NZ);
831
832 op[0].type = ELK_REGISTER_TYPE_UW;
833 result.type = ELK_REGISTER_TYPE_UW;
834 bld.AND(result, op[0], elk_imm_uw(0x8000u));
835
836 if (instr->op == nir_op_fsign)
837 inst = bld.OR(result, result, elk_imm_uw(0x3c00u));
838 else {
839 /* Use XOR here to get the result sign correct. */
840 inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UW));
841 }
842
843 inst->predicate = ELK_PREDICATE_NORMAL;
844 } else if (type_sz(op[0].type) == 4) {
845 /* AND(val, 0x80000000) gives the sign bit.
846 *
847 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
848 * zero.
849 */
850 bld.CMP(bld.null_reg_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ);
851
852 op[0].type = ELK_REGISTER_TYPE_UD;
853 result.type = ELK_REGISTER_TYPE_UD;
854 bld.AND(result, op[0], elk_imm_ud(0x80000000u));
855
856 if (instr->op == nir_op_fsign)
857 inst = bld.OR(result, result, elk_imm_ud(0x3f800000u));
858 else {
859 /* Use XOR here to get the result sign correct. */
860 inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UD));
861 }
862
863 inst->predicate = ELK_PREDICATE_NORMAL;
864 } else {
865 unreachable("Should have been lowered by nir_opt_algebraic.");
866 }
867 }
868
869 /**
870 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
871 *
872 * Checks the operands of a \c nir_op_fmul to determine whether or not
873 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
874 *
875 * \param instr The multiplication instruction
876 *
877 * \param fsign_src The source of \c instr that may or may not be a
878 * \c nir_op_fsign
879 */
880 static bool
881 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
882 {
883 assert(instr->op == nir_op_fmul);
884
885 nir_alu_instr *const fsign_instr =
886 nir_src_as_alu_instr(instr->src[fsign_src].src);
887
888 /* Rules:
889 *
890 * 1. instr->src[fsign_src] must be a nir_op_fsign.
891 * 2. The nir_op_fsign can only be used by this multiplication.
892 * 3. The source that is the nir_op_fsign does not have source modifiers.
893 * \c emit_fsign only examines the source modifiers of the source of the
894 * \c nir_op_fsign.
895 *
896 * The nir_op_fsign must also not have the saturate modifier, but steps
897 * have already been taken (in nir_opt_algebraic) to ensure that.
898 */
899 return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
900 is_used_once(fsign_instr);
901 }
902
903 static bool
904 is_const_zero(const nir_src &src)
905 {
906 return nir_src_is_const(src) && nir_src_as_int(src) == 0;
907 }
908
909 static void
910 fs_nir_emit_alu(nir_to_elk_state &ntb, nir_alu_instr *instr,
911 bool need_dest)
912 {
913 const intel_device_info *devinfo = ntb.devinfo;
914 const fs_builder &bld = ntb.bld;
915 elk_fs_visitor &s = ntb.s;
916
917 elk_fs_inst *inst;
918 unsigned execution_mode =
919 bld.shader->nir->info.float_controls_execution_mode;
920
921 elk_fs_reg op[NIR_MAX_VEC_COMPONENTS];
922 elk_fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
923
924 #ifndef NDEBUG
925 /* Everything except raw moves, some type conversions, iabs, and ineg
926 * should have 8-bit sources lowered by nir_lower_bit_size in
927 * elk_preprocess_nir or by elk_nir_lower_conversions in
928 * elk_postprocess_nir.
929 */
930 switch (instr->op) {
931 case nir_op_mov:
932 case nir_op_vec2:
933 case nir_op_vec3:
934 case nir_op_vec4:
935 case nir_op_vec8:
936 case nir_op_vec16:
937 case nir_op_i2f16:
938 case nir_op_i2f32:
939 case nir_op_i2i16:
940 case nir_op_i2i32:
941 case nir_op_u2f16:
942 case nir_op_u2f32:
943 case nir_op_u2u16:
944 case nir_op_u2u32:
945 case nir_op_iabs:
946 case nir_op_ineg:
947 case nir_op_pack_32_4x8_split:
948 break;
949
950 default:
951 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
952 assert(type_sz(op[i].type) > 1);
953 }
954 }
955 #endif
956
957 switch (instr->op) {
958 case nir_op_mov:
959 case nir_op_vec2:
960 case nir_op_vec3:
961 case nir_op_vec4:
962 case nir_op_vec8:
963 case nir_op_vec16: {
964 elk_fs_reg temp = result;
965 bool need_extra_copy = false;
966
967 nir_intrinsic_instr *store_reg =
968 nir_store_reg_for_def(&instr->def);
969 if (store_reg != NULL) {
970 nir_def *dest_reg = store_reg->src[1].ssa;
971 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
972 nir_intrinsic_instr *load_reg =
973 nir_load_reg_for_def(instr->src[i].src.ssa);
974 if (load_reg == NULL)
975 continue;
976
977 if (load_reg->src[0].ssa == dest_reg) {
978 need_extra_copy = true;
979 temp = bld.vgrf(result.type, 4);
980 break;
981 }
982 }
983 }
984
985 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
986 unsigned last_bit = util_last_bit(write_mask);
987
988 for (unsigned i = 0; i < last_bit; i++) {
989 if (!(write_mask & (1 << i)))
990 continue;
991
992 if (instr->op == nir_op_mov) {
993 bld.MOV(offset(temp, bld, i),
994 offset(op[0], bld, instr->src[0].swizzle[i]));
995 } else {
996 bld.MOV(offset(temp, bld, i),
997 offset(op[i], bld, instr->src[i].swizzle[0]));
998 }
999 }
1000
1001 /* In this case the source and destination registers were the same,
1002 * so we need to insert an extra set of moves in order to deal with
1003 * any swizzling.
1004 */
1005 if (need_extra_copy) {
1006 for (unsigned i = 0; i < last_bit; i++) {
1007 if (!(write_mask & (1 << i)))
1008 continue;
1009
1010 bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1011 }
1012 }
1013 return;
1014 }
1015
1016 case nir_op_i2f32:
1017 case nir_op_u2f32:
1018 if (optimize_extract_to_float(ntb, instr, result))
1019 return;
1020 inst = bld.MOV(result, op[0]);
1021 break;
1022
1023 case nir_op_f2f16_rtne:
1024 case nir_op_f2f16_rtz:
1025 case nir_op_f2f16: {
1026 elk_rnd_mode rnd = ELK_RND_MODE_UNSPECIFIED;
1027
1028 if (nir_op_f2f16 == instr->op)
1029 rnd = elk_rnd_mode_from_execution_mode(execution_mode);
1030 else
1031 rnd = elk_rnd_mode_from_nir_op(instr->op);
1032
1033 if (ELK_RND_MODE_UNSPECIFIED != rnd)
1034 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), elk_imm_d(rnd));
1035
1036 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1037 inst = bld.F32TO16(result, op[0]);
1038 break;
1039 }
1040
1041 case nir_op_b2i8:
1042 case nir_op_b2i16:
1043 case nir_op_b2i32:
1044 case nir_op_b2i64:
1045 case nir_op_b2f16:
1046 case nir_op_b2f32:
1047 case nir_op_b2f64:
1048 if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1049 break;
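/* NIR booleans are 0/~0; treating the source as signed D and negating it
 * yields 0/1, which the MOV below converts to the destination type.
 */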
1050 op[0].type = ELK_REGISTER_TYPE_D;
1051 op[0].negate = !op[0].negate;
1052 FALLTHROUGH;
1053 case nir_op_i2f64:
1054 case nir_op_i2i64:
1055 case nir_op_u2f64:
1056 case nir_op_u2u64:
1057 case nir_op_f2f64:
1058 case nir_op_f2i64:
1059 case nir_op_f2u64:
1060 case nir_op_i2i32:
1061 case nir_op_u2u32:
1062 case nir_op_f2i32:
1063 case nir_op_f2u32:
1064 case nir_op_i2f16:
1065 case nir_op_u2f16:
1066 case nir_op_f2i16:
1067 case nir_op_f2u16:
1068 case nir_op_f2i8:
1069 case nir_op_f2u8:
1070 if (result.type == ELK_REGISTER_TYPE_B ||
1071 result.type == ELK_REGISTER_TYPE_UB ||
1072 result.type == ELK_REGISTER_TYPE_HF)
1073 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1074
1075 if (op[0].type == ELK_REGISTER_TYPE_B ||
1076 op[0].type == ELK_REGISTER_TYPE_UB ||
1077 op[0].type == ELK_REGISTER_TYPE_HF)
1078 assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1079
1080 inst = bld.MOV(result, op[0]);
1081 break;
1082
1083 case nir_op_i2i8:
1084 case nir_op_u2u8:
1085 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1086 FALLTHROUGH;
1087 case nir_op_i2i16:
1088 case nir_op_u2u16: {
1089 /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1090 * Emitting the instructions one by one results in two MOV instructions
1091 * that won't be propagated. By handling both instructions here, a
1092 * single MOV is emitted.
1093 */
1094 nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1095 if (extract_instr != NULL) {
1096 if (extract_instr->op == nir_op_extract_u8 ||
1097 extract_instr->op == nir_op_extract_i8) {
1098 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1099
1100 const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1101 const elk_reg_type type =
1102 elk_int_type(1, extract_instr->op == nir_op_extract_i8);
1103
1104 op[0] = subscript(op[0], type, byte);
1105 } else if (extract_instr->op == nir_op_extract_u16 ||
1106 extract_instr->op == nir_op_extract_i16) {
1107 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1108
1109 const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1110 const elk_reg_type type =
1111 elk_int_type(2, extract_instr->op == nir_op_extract_i16);
1112
1113 op[0] = subscript(op[0], type, word);
1114 }
1115 }
1116
1117 inst = bld.MOV(result, op[0]);
1118 break;
1119 }
1120
1121 case nir_op_fsat:
1122 inst = bld.MOV(result, op[0]);
1123 inst->saturate = true;
1124 break;
1125
1126 case nir_op_fneg:
1127 case nir_op_ineg:
1128 op[0].negate = true;
1129 inst = bld.MOV(result, op[0]);
1130 break;
1131
1132 case nir_op_fabs:
1133 case nir_op_iabs:
1134 op[0].negate = false;
1135 op[0].abs = true;
1136 inst = bld.MOV(result, op[0]);
1137 break;
1138
1139 case nir_op_f2f32:
1140 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1141 elk_rnd_mode rnd =
1142 elk_rnd_mode_from_execution_mode(execution_mode);
1143 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1144 elk_imm_d(rnd));
1145 }
1146
1147 if (op[0].type == ELK_REGISTER_TYPE_HF)
1148 assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1149
1150 inst = bld.MOV(result, op[0]);
1151 break;
1152
1153 case nir_op_fsign:
1154 emit_fsign(ntb, bld, instr, result, op, 0);
1155 break;
1156
1157 case nir_op_frcp:
1158 inst = bld.emit(ELK_SHADER_OPCODE_RCP, result, op[0]);
1159 break;
1160
1161 case nir_op_fexp2:
1162 inst = bld.emit(ELK_SHADER_OPCODE_EXP2, result, op[0]);
1163 break;
1164
1165 case nir_op_flog2:
1166 inst = bld.emit(ELK_SHADER_OPCODE_LOG2, result, op[0]);
1167 break;
1168
1169 case nir_op_fsin:
1170 inst = bld.emit(ELK_SHADER_OPCODE_SIN, result, op[0]);
1171 break;
1172
1173 case nir_op_fcos:
1174 inst = bld.emit(ELK_SHADER_OPCODE_COS, result, op[0]);
1175 break;
1176
1177 case nir_op_fadd:
1178 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1179 elk_rnd_mode rnd =
1180 elk_rnd_mode_from_execution_mode(execution_mode);
1181 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1182 elk_imm_d(rnd));
1183 }
1184 FALLTHROUGH;
1185 case nir_op_iadd:
1186 inst = bld.ADD(result, op[0], op[1]);
1187 break;
1188
1189 case nir_op_iadd_sat:
1190 case nir_op_uadd_sat:
1191 inst = bld.ADD(result, op[0], op[1]);
1192 inst->saturate = true;
1193 break;
1194
1195 case nir_op_isub_sat:
1196 bld.emit(ELK_SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1197 break;
1198
1199 case nir_op_usub_sat:
1200 bld.emit(ELK_SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1201 break;
1202
1203 case nir_op_irhadd:
1204 case nir_op_urhadd:
1205 assert(instr->def.bit_size < 64);
1206 inst = bld.AVG(result, op[0], op[1]);
1207 break;
1208
1209 case nir_op_ihadd:
1210 case nir_op_uhadd: {
1211 assert(instr->def.bit_size < 64);
1212 elk_fs_reg tmp = bld.vgrf(result.type);
1213
1214 if (devinfo->ver >= 8) {
1215 op[0] = resolve_source_modifiers(bld, op[0]);
1216 op[1] = resolve_source_modifiers(bld, op[1]);
1217 }
1218
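/* The AVG instruction computes the rounded-up average (x + y + 1) >> 1;
 * subtracting ((x ^ y) & 1) corrects it down to the flooring average that
 * ihadd/uhadd require.
 */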
1219 /* AVG(x, y) - ((x ^ y) & 1) */
1220 bld.XOR(tmp, op[0], op[1]);
1221 bld.AND(tmp, tmp, retype(elk_imm_ud(1), result.type));
1222 bld.AVG(result, op[0], op[1]);
1223 inst = bld.ADD(result, result, tmp);
1224 inst->src[1].negate = true;
1225 break;
1226 }
1227
1228 case nir_op_fmul:
1229 for (unsigned i = 0; i < 2; i++) {
1230 if (can_fuse_fmul_fsign(instr, i)) {
1231 emit_fsign(ntb, bld, instr, result, op, i);
1232 return;
1233 }
1234 }
1235
1236 /* We emit the rounding mode after the previous fsign optimization since
1237 * it won't result in a MUL, but will try to negate the value by other
1238 * means.
1239 */
1240 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1241 elk_rnd_mode rnd =
1242 elk_rnd_mode_from_execution_mode(execution_mode);
1243 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1244 elk_imm_d(rnd));
1245 }
1246
1247 inst = bld.MUL(result, op[0], op[1]);
1248 break;
1249
1250 case nir_op_imul_2x32_64:
1251 case nir_op_umul_2x32_64:
1252 bld.MUL(result, op[0], op[1]);
1253 break;
1254
1255 case nir_op_imul_32x16:
1256 case nir_op_umul_32x16: {
1257 const bool ud = instr->op == nir_op_umul_32x16;
1258 const enum elk_reg_type word_type =
1259 ud ? ELK_REGISTER_TYPE_UW : ELK_REGISTER_TYPE_W;
1260 const enum elk_reg_type dword_type =
1261 ud ? ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_D;
1262
1263 assert(instr->def.bit_size == 32);
1264
1265 /* Before copy propagation there are no immediate values. */
1266 assert(op[0].file != IMM && op[1].file != IMM);
1267
1268 op[1] = subscript(op[1], word_type, 0);
1269
1270 if (devinfo->ver >= 7)
1271 bld.MUL(result, retype(op[0], dword_type), op[1]);
1272 else
1273 bld.MUL(result, op[1], retype(op[0], dword_type));
1274
1275 break;
1276 }
1277
1278 case nir_op_imul:
1279 assert(instr->def.bit_size < 64);
1280 bld.MUL(result, op[0], op[1]);
1281 break;
1282
1283 case nir_op_imul_high:
1284 case nir_op_umul_high:
1285 assert(instr->def.bit_size < 64);
1286 if (instr->def.bit_size == 32) {
1287 bld.emit(ELK_SHADER_OPCODE_MULH, result, op[0], op[1]);
1288 } else {
1289 elk_fs_reg tmp = bld.vgrf(elk_reg_type_from_bit_size(32, op[0].type));
1290 bld.MUL(tmp, op[0], op[1]);
1291 bld.MOV(result, subscript(tmp, result.type, 1));
1292 }
1293 break;
1294
1295 case nir_op_idiv:
1296 case nir_op_udiv:
1297 assert(instr->def.bit_size < 64);
1298 bld.emit(ELK_SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1299 break;
1300
1301 case nir_op_uadd_carry:
1302 unreachable("Should have been lowered by carry_to_arith().");
1303
1304 case nir_op_usub_borrow:
1305 unreachable("Should have been lowered by borrow_to_arith().");
1306
1307 case nir_op_umod:
1308 case nir_op_irem:
1309 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1310 * appears that our hardware just does the right thing for signed
1311 * remainder.
1312 */
1313 assert(instr->def.bit_size < 64);
1314 bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1315 break;
1316
1317 case nir_op_imod: {
1318 /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
1319 bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1320
1321 /* Math instructions don't support conditional mod */
1322 inst = bld.MOV(bld.null_reg_d(), result);
1323 inst->conditional_mod = ELK_CONDITIONAL_NZ;
1324
1325 /* Now, we need to determine if signs of the sources are different.
1326 * When we XOR the sources, the top bit is 0 if they are the same and 1
1327 * if they are different. We can then use a conditional modifier to
1328 * turn that into a predicate. This leads us to an XOR.l instruction.
1329 *
1330 * Technically, according to the PRM, you're not allowed to use .l on a
1331 * XOR instruction. However, empirical experiments and Curro's reading
1332 * of the simulator source both indicate that it's safe.
1333 */
1334 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_D);
1335 inst = bld.XOR(tmp, op[0], op[1]);
1336 inst->predicate = ELK_PREDICATE_NORMAL;
1337 inst->conditional_mod = ELK_CONDITIONAL_L;
1338
1339 /* If the result of the initial remainder operation is non-zero and the
1340 * two sources have different signs, add in a copy of op[1] to get the
1341 * final integer modulus value.
1342 */
1343 inst = bld.ADD(result, result, op[1]);
1344 inst->predicate = ELK_PREDICATE_NORMAL;
1345 break;
1346 }
1347
1348 case nir_op_flt32:
1349 case nir_op_fge32:
1350 case nir_op_feq32:
1351 case nir_op_fneu32: {
1352 elk_fs_reg dest = result;
1353
1354 const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1355 if (bit_size != 32) {
1356 dest = bld.vgrf(op[0].type, 1);
1357 bld.UNDEF(dest);
1358 }
1359
1360 bld.CMP(dest, op[0], op[1], elk_cmod_for_nir_comparison(instr->op));
1361
1362 if (bit_size > 32) {
1363 bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1364 } else if (bit_size < 32) {
1365 /* When we convert the result to 32-bit we need to be careful and do
1366 * it as a signed conversion to get sign extension (for 32-bit true)
1367 */
1368 const elk_reg_type src_type =
1369 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1370
1371 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1372 }
1373 break;
1374 }
1375
1376 case nir_op_ilt32:
1377 case nir_op_ult32:
1378 case nir_op_ige32:
1379 case nir_op_uge32:
1380 case nir_op_ieq32:
1381 case nir_op_ine32: {
1382 elk_fs_reg dest = result;
1383
1384 const uint32_t bit_size = type_sz(op[0].type) * 8;
1385 if (bit_size != 32) {
1386 dest = bld.vgrf(op[0].type, 1);
1387 bld.UNDEF(dest);
1388 }
1389
1390 bld.CMP(dest, op[0], op[1],
1391 elk_cmod_for_nir_comparison(instr->op));
1392
1393 if (bit_size > 32) {
1394 bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1395 } else if (bit_size < 32) {
1396 /* When we convert the result to 32-bit we need to be careful and do
1397 * it as a signed conversion to get sign extension (for 32-bit true)
1398 */
1399 const elk_reg_type src_type =
1400 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1401
1402 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1403 }
1404 break;
1405 }
1406
1407 case nir_op_inot:
1408 if (devinfo->ver >= 8) {
1409 nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1410
1411 if (inot_src_instr != NULL &&
1412 (inot_src_instr->op == nir_op_ior ||
1413 inot_src_instr->op == nir_op_ixor ||
1414 inot_src_instr->op == nir_op_iand)) {
1415 /* The sources of the source logical instruction are now the
1416 * sources of the instruction that will be generated.
1417 */
1418 prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
1419 resolve_inot_sources(ntb, bld, inot_src_instr, op);
1420
1421 /* Smash all of the sources and destination to be signed. This
1422 * doesn't matter for the operation of the instruction, but cmod
1423 * propagation fails on unsigned sources with negation (due to
1424 * elk_fs_inst::can_do_cmod returning false).
1425 */
1426 result.type =
1427 elk_type_for_nir_type(devinfo,
1428 (nir_alu_type)(nir_type_int |
1429 instr->def.bit_size));
1430 op[0].type =
1431 elk_type_for_nir_type(devinfo,
1432 (nir_alu_type)(nir_type_int |
1433 nir_src_bit_size(inot_src_instr->src[0].src)));
1434 op[1].type =
1435 elk_type_for_nir_type(devinfo,
1436 (nir_alu_type)(nir_type_int |
1437 nir_src_bit_size(inot_src_instr->src[1].src)));
1438
1439 /* For XOR, only invert one of the sources. Arbitrarily choose
1440 * the first source.
1441 */
1442 op[0].negate = !op[0].negate;
1443 if (inot_src_instr->op != nir_op_ixor)
1444 op[1].negate = !op[1].negate;
1445
1446 switch (inot_src_instr->op) {
1447 case nir_op_ior:
1448 bld.AND(result, op[0], op[1]);
1449 return;
1450
1451 case nir_op_iand:
1452 bld.OR(result, op[0], op[1]);
1453 return;
1454
1455 case nir_op_ixor:
1456 bld.XOR(result, op[0], op[1]);
1457 return;
1458
1459 default:
1460 unreachable("impossible opcode");
1461 }
1462 }
1463 op[0] = resolve_source_modifiers(bld, op[0]);
1464 }
1465 bld.NOT(result, op[0]);
1466 break;
1467 case nir_op_ixor:
1468 if (devinfo->ver >= 8) {
1469 resolve_inot_sources(ntb, bld, instr, op);
1470 }
1471 bld.XOR(result, op[0], op[1]);
1472 break;
1473 case nir_op_ior:
1474 if (devinfo->ver >= 8) {
1475 resolve_inot_sources(ntb, bld, instr, op);
1476 }
1477 bld.OR(result, op[0], op[1]);
1478 break;
1479 case nir_op_iand:
1480 if (devinfo->ver >= 8) {
1481 resolve_inot_sources(ntb, bld, instr, op);
1482 }
1483 bld.AND(result, op[0], op[1]);
1484 break;
1485
1486 case nir_op_fdot2:
1487 case nir_op_fdot3:
1488 case nir_op_fdot4:
1489 case nir_op_b32all_fequal2:
1490 case nir_op_b32all_iequal2:
1491 case nir_op_b32all_fequal3:
1492 case nir_op_b32all_iequal3:
1493 case nir_op_b32all_fequal4:
1494 case nir_op_b32all_iequal4:
1495 case nir_op_b32any_fnequal2:
1496 case nir_op_b32any_inequal2:
1497 case nir_op_b32any_fnequal3:
1498 case nir_op_b32any_inequal3:
1499 case nir_op_b32any_fnequal4:
1500 case nir_op_b32any_inequal4:
1501 unreachable("Lowered by nir_lower_alu_reductions");
1502
1503 case nir_op_ldexp:
1504 unreachable("not reached: should be handled by ldexp_to_arith()");
1505
1506 case nir_op_fsqrt:
1507 inst = bld.emit(ELK_SHADER_OPCODE_SQRT, result, op[0]);
1508 break;
1509
1510 case nir_op_frsq:
1511 inst = bld.emit(ELK_SHADER_OPCODE_RSQ, result, op[0]);
1512 break;
1513
1514 case nir_op_ftrunc:
1515 inst = bld.RNDZ(result, op[0]);
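/* Pre-Gfx6 RNDZ doesn't apply the rounding increment itself; the 'R'
 * conditional flags the channels that still need it, and the predicated
 * ADD below applies the +1.0 fixup.
 */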
1516 if (devinfo->ver < 6) {
1517 set_condmod(ELK_CONDITIONAL_R, inst);
1518 set_predicate(ELK_PREDICATE_NORMAL,
1519 bld.ADD(result, result, elk_imm_f(1.0f)));
1520 inst = bld.MOV(result, result); /* for potential saturation */
1521 }
1522 break;
1523
1524 case nir_op_fceil: {
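/* ceil(x) is implemented as -floor(-x): negate the source, round down
 * with RNDD, then negate the result.
 */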
1525 op[0].negate = !op[0].negate;
1526 elk_fs_reg temp = s.vgrf(glsl_float_type());
1527 bld.RNDD(temp, op[0]);
1528 temp.negate = true;
1529 inst = bld.MOV(result, temp);
1530 break;
1531 }
1532 case nir_op_ffloor:
1533 inst = bld.RNDD(result, op[0]);
1534 break;
1535 case nir_op_ffract:
1536 inst = bld.FRC(result, op[0]);
1537 break;
1538 case nir_op_fround_even:
1539 inst = bld.RNDE(result, op[0]);
1540 if (devinfo->ver < 6) {
1541 set_condmod(ELK_CONDITIONAL_R, inst);
1542 set_predicate(ELK_PREDICATE_NORMAL,
1543 bld.ADD(result, result, elk_imm_f(1.0f)));
1544 inst = bld.MOV(result, result); /* for potential saturation */
1545 }
1546 break;
1547
1548 case nir_op_fquantize2f16: {
1549 elk_fs_reg tmp16 = bld.vgrf(ELK_REGISTER_TYPE_D);
1550 elk_fs_reg tmp32 = bld.vgrf(ELK_REGISTER_TYPE_F);
1551 elk_fs_reg zero = bld.vgrf(ELK_REGISTER_TYPE_F);
1552
1553 /* The destination stride must be at least as big as the source stride. */
1554 tmp16 = subscript(tmp16, ELK_REGISTER_TYPE_HF, 0);
1555
1556 /* Check for denormal */
1557 elk_fs_reg abs_src0 = op[0];
1558 abs_src0.abs = true;
1559 bld.CMP(bld.null_reg_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1560 ELK_CONDITIONAL_L);
1561 /* Get the appropriately signed zero */
1562 bld.AND(retype(zero, ELK_REGISTER_TYPE_UD),
1563 retype(op[0], ELK_REGISTER_TYPE_UD),
1564 elk_imm_ud(0x80000000));
1565 /* Do the actual F32 -> F16 -> F32 conversion */
1566 bld.F32TO16(tmp16, op[0]);
1567 bld.F16TO32(tmp32, tmp16);
1568 /* Select that or zero based on normal status */
1569 inst = bld.SEL(result, zero, tmp32);
1570 inst->predicate = ELK_PREDICATE_NORMAL;
1571 break;
1572 }
1573
1574 case nir_op_imin:
1575 case nir_op_umin:
1576 case nir_op_fmin:
1577 inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_L);
1578 break;
1579
1580 case nir_op_imax:
1581 case nir_op_umax:
1582 case nir_op_fmax:
1583 inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_GE);
1584 break;
1585
1586 case nir_op_pack_snorm_2x16:
1587 case nir_op_pack_snorm_4x8:
1588 case nir_op_pack_unorm_2x16:
1589 case nir_op_pack_unorm_4x8:
1590 case nir_op_unpack_snorm_2x16:
1591 case nir_op_unpack_snorm_4x8:
1592 case nir_op_unpack_unorm_2x16:
1593 case nir_op_unpack_unorm_4x8:
1594 case nir_op_unpack_half_2x16:
1595 case nir_op_pack_half_2x16:
1596 unreachable("not reached: should be handled by lower_packing_builtins");
1597
1598 case nir_op_unpack_half_2x16_split_x:
1599 inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 0));
1600 break;
1601
1602 case nir_op_unpack_half_2x16_split_y:
1603 inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 1));
1604 break;
1605
1606 case nir_op_pack_64_2x32_split:
1607 case nir_op_pack_32_2x16_split:
1608 bld.emit(ELK_FS_OPCODE_PACK, result, op[0], op[1]);
1609 break;
1610
1611 case nir_op_pack_32_4x8_split:
1612 bld.emit(ELK_FS_OPCODE_PACK, result, op, 4);
1613 break;
1614
1615 case nir_op_unpack_64_2x32_split_x:
1616 case nir_op_unpack_64_2x32_split_y: {
1617 if (instr->op == nir_op_unpack_64_2x32_split_x)
1618 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 0));
1619 else
1620 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 1));
1621 break;
1622 }
1623
1624 case nir_op_unpack_32_2x16_split_x:
1625 case nir_op_unpack_32_2x16_split_y: {
1626 if (instr->op == nir_op_unpack_32_2x16_split_x)
1627 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 0));
1628 else
1629 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 1));
1630 break;
1631 }
1632
1633 case nir_op_fpow:
1634 inst = bld.emit(ELK_SHADER_OPCODE_POW, result, op[0], op[1]);
1635 break;
1636
1637 case nir_op_bitfield_reverse:
1638 assert(instr->def.bit_size == 32);
1639 assert(nir_src_bit_size(instr->src[0].src) == 32);
1640 bld.BFREV(result, op[0]);
1641 break;
1642
1643 case nir_op_bit_count:
1644 assert(instr->def.bit_size == 32);
1645 assert(nir_src_bit_size(instr->src[0].src) < 64);
1646 bld.CBIT(result, op[0]);
1647 break;
1648
1649 case nir_op_uclz:
1650 assert(instr->def.bit_size == 32);
1651 assert(nir_src_bit_size(instr->src[0].src) == 32);
1652 bld.LZD(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1653 break;
1654
1655 case nir_op_ifind_msb: {
1656 assert(instr->def.bit_size == 32);
1657 assert(nir_src_bit_size(instr->src[0].src) == 32);
1658 assert(devinfo->ver >= 7);
1659
1660 bld.FBH(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1661
1662 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1663 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1664 * subtract the result from 31 to convert the MSB count into an LSB
1665 * count.
1666 */
1667 bld.CMP(bld.null_reg_d(), result, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1668
1669 inst = bld.ADD(result, result, elk_imm_d(31));
1670 inst->predicate = ELK_PREDICATE_NORMAL;
1671 inst->src[0].negate = true;
1672 break;
1673 }
1674
1675 case nir_op_find_lsb:
1676 assert(instr->def.bit_size == 32);
1677 assert(nir_src_bit_size(instr->src[0].src) == 32);
1678 assert(devinfo->ver >= 7);
1679 bld.FBL(result, op[0]);
1680 break;
1681
1682 case nir_op_ubitfield_extract:
1683 case nir_op_ibitfield_extract:
1684 unreachable("should have been lowered");
1685 case nir_op_ubfe:
1686 case nir_op_ibfe:
1687 assert(instr->def.bit_size < 64);
1688 bld.BFE(result, op[2], op[1], op[0]);
1689 break;
1690 case nir_op_bfm:
1691 assert(instr->def.bit_size < 64);
1692 bld.BFI1(result, op[0], op[1]);
1693 break;
1694 case nir_op_bfi:
1695 assert(instr->def.bit_size < 64);
1696
1697 /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1698 * either 0 or src0. Replacing the 0 with another value can eliminate a
1699 * temporary register.
1700 */
1701 if (is_const_zero(instr->src[2].src))
1702 bld.BFI2(result, op[0], op[1], op[0]);
1703 else
1704 bld.BFI2(result, op[0], op[1], op[2]);
1705
1706 break;
1707
1708 case nir_op_bitfield_insert:
1709 unreachable("not reached: should have been lowered");
1710
1711 /* With regards to implicit masking of the shift counts for 8- and 16-bit
1712 * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1713 * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
1714 * src0) are used. The Bspec (backed by data from experimentation) states
1715 * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1716 * types.
1717 *
1718 * To match the behavior expected for the NIR opcodes, explicit masks for
1719 * 8- and 16-bit types must be added.
1720 */
1721 case nir_op_ishl:
1722 if (instr->def.bit_size < 32) {
1723 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1724 bld.SHL(result, op[0], result);
1725 } else {
1726 bld.SHL(result, op[0], op[1]);
1727 }
1728
1729 break;
1730 case nir_op_ishr:
1731 if (instr->def.bit_size < 32) {
1732 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1733 bld.ASR(result, op[0], result);
1734 } else {
1735 bld.ASR(result, op[0], op[1]);
1736 }
1737
1738 break;
1739 case nir_op_ushr:
1740 if (instr->def.bit_size < 32) {
1741 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1742 bld.SHR(result, op[0], result);
1743 } else {
1744 bld.SHR(result, op[0], op[1]);
1745 }
1746
1747 break;
1748
1749 case nir_op_pack_half_2x16_split:
1750 bld.emit(ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1751 break;
1752
1753 case nir_op_ffma:
1754 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1755 elk_rnd_mode rnd =
1756 elk_rnd_mode_from_execution_mode(execution_mode);
1757 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1758 elk_imm_d(rnd));
1759 }
1760
1761 inst = bld.MAD(result, op[2], op[1], op[0]);
1762 break;
1763
1764 case nir_op_flrp:
1765 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1766 elk_rnd_mode rnd =
1767 elk_rnd_mode_from_execution_mode(execution_mode);
1768 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1769 elk_imm_d(rnd));
1770 }
1771
1772 inst = bld.LRP(result, op[0], op[1], op[2]);
1773 break;
1774
1775 case nir_op_b32csel:
1776 if (optimize_frontfacing_ternary(ntb, instr, result))
1777 return;
1778
1779 bld.CMP(bld.null_reg_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ);
1780 inst = bld.SEL(result, op[1], op[2]);
1781 inst->predicate = ELK_PREDICATE_NORMAL;
1782 break;
1783
1784 case nir_op_extract_u8:
1785 case nir_op_extract_i8: {
1786 unsigned byte = nir_src_as_uint(instr->src[1].src);
1787
1788 /* The PRMs say:
1789 *
1790 * BDW+
1791 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1792 * Use two instructions and a word or DWord intermediate integer type.
1793 */
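      /* For illustration of the unsigned 64-bit path below: byte == 3
       * selects UW word 1 (bytes 2-3); being odd, it is isolated with a
       * right shift by 8, while an even byte would be masked with 0xff
       * instead.
       */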
1794 if (instr->def.bit_size == 64) {
1795 const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1796
1797 if (instr->op == nir_op_extract_i8) {
1798 /* If we need to sign extend, extract to a word first */
1799 elk_fs_reg w_temp = bld.vgrf(ELK_REGISTER_TYPE_W);
1800 bld.MOV(w_temp, subscript(op[0], type, byte));
1801 bld.MOV(result, w_temp);
1802 } else if (byte & 1) {
1803 /* Extract the high byte from the word containing the desired byte
1804 * offset.
1805 */
1806 bld.SHR(result,
1807 subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1808 elk_imm_uw(8));
1809 } else {
1810 /* Otherwise use an AND with 0xff and a word type */
1811 bld.AND(result,
1812 subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1813 elk_imm_uw(0xff));
1814 }
1815 } else {
1816 const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1817 bld.MOV(result, subscript(op[0], type, byte));
1818 }
1819 break;
1820 }
1821
1822 case nir_op_extract_u16:
1823 case nir_op_extract_i16: {
1824 const elk_reg_type type = elk_int_type(2, instr->op == nir_op_extract_i16);
1825 unsigned word = nir_src_as_uint(instr->src[1].src);
1826 bld.MOV(result, subscript(op[0], type, word));
1827 break;
1828 }
1829
1830 default:
1831 unreachable("unhandled instruction");
1832 }
1833
1834 /* If we need to do a boolean resolve, replace the result with -(x & 1)
1835 * to sign extend the low bit to 0/~0
1836 */
1837 if (devinfo->ver <= 5 &&
1838 !result.is_null() &&
1839 (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1840 elk_fs_reg masked = s.vgrf(glsl_int_type());
1841 bld.AND(masked, result, elk_imm_d(1));
1842 masked.negate = true;
1843 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), masked);
1844 }
1845 }
1846
1847 static void
1848 fs_nir_emit_load_const(nir_to_elk_state &ntb,
1849 nir_load_const_instr *instr)
1850 {
1851 const intel_device_info *devinfo = ntb.devinfo;
1852 const fs_builder &bld = ntb.bld;
1853
1854 const elk_reg_type reg_type =
1855 elk_reg_type_from_bit_size(instr->def.bit_size, ELK_REGISTER_TYPE_D);
1856 elk_fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1857
1858 switch (instr->def.bit_size) {
1859 case 8:
1860 for (unsigned i = 0; i < instr->def.num_components; i++)
1861 bld.MOV(offset(reg, bld, i), elk_setup_imm_b(bld, instr->value[i].i8));
1862 break;
1863
1864 case 16:
1865 for (unsigned i = 0; i < instr->def.num_components; i++)
1866 bld.MOV(offset(reg, bld, i), elk_imm_w(instr->value[i].i16));
1867 break;
1868
1869 case 32:
1870 for (unsigned i = 0; i < instr->def.num_components; i++)
1871 bld.MOV(offset(reg, bld, i), elk_imm_d(instr->value[i].i32));
1872 break;
1873
1874 case 64:
1875 assert(devinfo->ver >= 7);
1876 if (!devinfo->has_64bit_int) {
1877 for (unsigned i = 0; i < instr->def.num_components; i++) {
1878 bld.MOV(retype(offset(reg, bld, i), ELK_REGISTER_TYPE_DF),
1879 elk_setup_imm_df(bld, instr->value[i].f64));
1880 }
1881 } else {
1882 for (unsigned i = 0; i < instr->def.num_components; i++)
1883 bld.MOV(offset(reg, bld, i), elk_imm_q(instr->value[i].i64));
1884 }
1885 break;
1886
1887 default:
1888 unreachable("Invalid bit size");
1889 }
1890
1891 ntb.ssa_values[instr->def.index] = reg;
1892 }
1893
1894 static bool
1895 get_nir_src_bindless(nir_to_elk_state &ntb, const nir_src &src)
1896 {
1897 return ntb.ssa_bind_infos[src.ssa->index].bindless;
1898 }
1899
1900 static bool
1901 is_resource_src(nir_src src)
1902 {
1903 return src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
1904 nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel;
1905 }
1906
1907 static elk_fs_reg
1908 get_resource_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1909 {
1910 if (!is_resource_src(src))
1911 return elk_fs_reg();
1912 return ntb.resource_values[src.ssa->index];
1913 }
1914
1915 static elk_fs_reg
1916 get_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1917 {
1918 const intel_device_info *devinfo = ntb.devinfo;
1919
1920 nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
1921
1922 elk_fs_reg reg;
1923 if (!load_reg) {
1924 if (nir_src_is_undef(src)) {
1925 const elk_reg_type reg_type =
1926 elk_reg_type_from_bit_size(src.ssa->bit_size,
1927 ELK_REGISTER_TYPE_D);
1928 reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
1929 } else {
1930 reg = ntb.ssa_values[src.ssa->index];
1931 }
1932 } else {
1933 nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
1934 /* We don't handle indirects on locals */
1935 assert(nir_intrinsic_base(load_reg) == 0);
1936 assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
1937 reg = ntb.ssa_values[decl_reg->def.index];
1938 }
1939
1940 if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
1941 /* The only 64-bit type available on gfx7 is DF, so use that. */
1942 reg.type = ELK_REGISTER_TYPE_DF;
1943 } else {
1944 /* To avoid floating-point denorm flushing problems, set the type by
1945 * default to an integer type - instructions that need floating point
1946 * semantics will set this to F if they need to
1947 */
1948 reg.type = elk_reg_type_from_bit_size(nir_src_bit_size(src),
1949 ELK_REGISTER_TYPE_D);
1950 }
1951
1952 return reg;
1953 }
1954
1955 /**
1956 * Return an IMM for constants; otherwise call get_nir_src() as normal.
1957 *
1958 * This function should not be called on any value which may be 64 bits.
1959 * We could theoretically support 64-bit on gfx8+ but we choose not to
1960 * because it wouldn't work in general (no gfx7 support) and there are
1961 * enough restrictions in 64-bit immediates that you can't take the return
1962 * value and treat it the same as the result of get_nir_src().
1963 */
1964 static elk_fs_reg
1965 get_nir_src_imm(nir_to_elk_state &ntb, const nir_src &src)
1966 {
1967 assert(nir_src_bit_size(src) == 32);
1968 return nir_src_is_const(src) ?
1969 elk_fs_reg(elk_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
1970 }
1971
1972 static elk_fs_reg
1973 get_nir_def(nir_to_elk_state &ntb, const nir_def &def)
1974 {
1975 const fs_builder &bld = ntb.bld;
1976
1977 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
1978 if (!store_reg) {
1979 const elk_reg_type reg_type =
1980 elk_reg_type_from_bit_size(def.bit_size,
1981 def.bit_size == 8 ?
1982 ELK_REGISTER_TYPE_D :
1983 ELK_REGISTER_TYPE_F);
1984 ntb.ssa_values[def.index] =
1985 bld.vgrf(reg_type, def.num_components);
1986 bld.UNDEF(ntb.ssa_values[def.index]);
1987 return ntb.ssa_values[def.index];
1988 } else {
1989 nir_intrinsic_instr *decl_reg =
1990 nir_reg_get_decl(store_reg->src[1].ssa);
1991 /* We don't handle indirects on locals */
1992 assert(nir_intrinsic_base(store_reg) == 0);
1993 assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
1994 return ntb.ssa_values[decl_reg->def.index];
1995 }
1996 }
1997
1998 static nir_component_mask_t
1999 get_nir_write_mask(const nir_def &def)
2000 {
2001 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2002 if (!store_reg) {
2003 return nir_component_mask(def.num_components);
2004 } else {
2005 return nir_intrinsic_write_mask(store_reg);
2006 }
2007 }
2008
2009 static elk_fs_inst *
2010 emit_pixel_interpolater_send(const fs_builder &bld,
2011 enum elk_opcode opcode,
2012 const elk_fs_reg &dst,
2013 const elk_fs_reg &src,
2014 const elk_fs_reg &desc,
2015 const elk_fs_reg &flag_reg,
2016 glsl_interp_mode interpolation)
2017 {
2018 struct elk_wm_prog_data *wm_prog_data =
2019 elk_wm_prog_data(bld.shader->stage_prog_data);
2020
2021 elk_fs_reg srcs[INTERP_NUM_SRCS];
2022 srcs[INTERP_SRC_OFFSET] = src;
2023 srcs[INTERP_SRC_MSG_DESC] = desc;
2024 srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2025
2026 elk_fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2027 /* 2 floats per slot returned */
2028 inst->size_written = 2 * dst.component_size(inst->exec_size);
2029 if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2030 inst->pi_noperspective = true;
2031 /* TGL BSpec says:
2032 * This field cannot be set to "Linear Interpolation"
2033 * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
2034 */
2035 wm_prog_data->uses_nonperspective_interp_modes = true;
2036 }
2037
2038 wm_prog_data->pulls_bary = true;
2039
2040 return inst;
2041 }
2042
2043 /**
2044 * Computes 1 << x, given a D/UD register containing some value x.
2045 */
2046 static elk_fs_reg
2047 intexp2(const fs_builder &bld, const elk_fs_reg &x)
2048 {
2049 assert(x.type == ELK_REGISTER_TYPE_UD || x.type == ELK_REGISTER_TYPE_D);
2050
2051 elk_fs_reg result = bld.vgrf(x.type, 1);
2052 elk_fs_reg one = bld.vgrf(x.type, 1);
2053
2054 bld.MOV(one, retype(elk_imm_d(1), one.type));
2055 bld.SHL(result, one, x);
2056 return result;
2057 }
2058
2059 static void
2060 emit_gs_end_primitive(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src)
2061 {
2062 elk_fs_visitor &s = ntb.s;
2063 assert(s.stage == MESA_SHADER_GEOMETRY);
2064
2065 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2066
2067 if (s.gs_compile->control_data_header_size_bits == 0)
2068 return;
2069
2070 /* We can only do EndPrimitive() functionality when the control data
2071 * consists of cut bits. Fortunately, the only time it isn't is when the
2072 * output type is points, in which case EndPrimitive() is a no-op.
2073 */
2074 if (gs_prog_data->control_data_format !=
2075 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2076 return;
2077 }
2078
2079 /* Cut bits use one bit per vertex. */
2080 assert(s.gs_compile->control_data_bits_per_vertex == 1);
2081
2082 elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2083 vertex_count.type = ELK_REGISTER_TYPE_UD;
2084
2085 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2086 * vertex n, 0 otherwise. So all we need to do here is mark bit
2087 * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2088 * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2089 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2090 *
2091 * Note that if EndPrimitive() is called before emitting any vertices, this
2092 * will cause us to set bit 31 of the control_data_bits register to 1.
2093 * That's fine because:
2094 *
2095 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2096 * output, so the hardware will ignore cut bit 31.
2097 *
2098 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2099 * last vertex, so setting cut bit 31 has no effect (since the primitive
2100 * is automatically ended when the GS terminates).
2101 *
2102 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2103 * control_data_bits register to 0 when the first vertex is emitted.
2104 */
2105
2106 const fs_builder abld = ntb.bld.annotate("end primitive");
2107
2108 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2109 elk_fs_reg prev_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2110 abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2111 elk_fs_reg mask = intexp2(abld, prev_count);
2112 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2113 * attention to the lower 5 bits of its second source argument, so on this
2114 * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2115 * ((vertex_count - 1) % 32).
2116 */
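   /* For illustration: with vertex_count == 35, prev_count is 34 and, since
    * only the low 5 bits of the shift count matter, intexp2() yields
    * 1 << (34 % 32) == 1 << 2, i.e. exactly cut bit (vertex_count - 1) % 32.
    */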
2117 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2118 }
2119
2120 void
2121 elk_fs_visitor::emit_gs_control_data_bits(const elk_fs_reg &vertex_count)
2122 {
2123 assert(stage == MESA_SHADER_GEOMETRY);
2124 assert(gs_compile->control_data_bits_per_vertex != 0);
2125
2126 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
2127
2128 const fs_builder bld = fs_builder(this).at_end();
2129 const fs_builder abld = bld.annotate("emit control data bits");
2130 const fs_builder fwa_bld = bld.exec_all();
2131
2132 /* We use a single UD register to accumulate control data bits (32 bits
2133 * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
2134 * at a time.
2135 *
2136 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2137 * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2138 * use the Channel Mask phase to enable/disable which DWord within that
2139 * group to write. (Remember, different SIMD8 channels may have emitted
2140 * different numbers of vertices, so we may need per-slot offsets.)
2141 *
2142 * Channel masking presents an annoying problem: we may have to replicate
2143 * the data up to 4 times:
2144 *
2145 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2146 *
2147 * To avoid penalizing shaders that emit a small number of vertices, we
2148 * can avoid these sometimes: if the size of the control data header is
2149 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
2150 * in the same 128-bit group, so we can skip per-slot offsets.
2151 *
2152 * Similarly, if the control data header is <= 32 bits, there is only one
2153 * DWord, so we can skip channel masks.
2154 */
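   /* For illustration: a 96-bit control data header needs channel masks
    * (> 32 bits) but not per-slot offsets (<= 128 bits), while a 200-bit
    * header needs both.
    */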
2155 elk_fs_reg channel_mask, per_slot_offset;
2156
2157 if (gs_compile->control_data_header_size_bits > 32)
2158 channel_mask = vgrf(glsl_uint_type());
2159
2160 if (gs_compile->control_data_header_size_bits > 128)
2161 per_slot_offset = vgrf(glsl_uint_type());
2162
2163 /* Figure out which DWord we're trying to write to using the formula:
2164 *
2165 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
2166 *
2167 * Since bits_per_vertex is a power of two, and is known at compile
2168 * time, this can be optimized to:
2169 *
2170 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2171 */
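   /* For illustration: with bits_per_vertex == 2, util_last_bit(2) == 2
    * (log2 plus one, hence the 6 below rather than 5), so the shift is
    * 6 - 2 == 4 and (vertex_count - 1) >> 4 == (vertex_count - 1) * 2 / 32.
    */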
2172 if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2173 elk_fs_reg dword_index = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2174 elk_fs_reg prev_count = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2175 abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2176 unsigned log2_bits_per_vertex =
2177 util_last_bit(gs_compile->control_data_bits_per_vertex);
2178 abld.SHR(dword_index, prev_count, elk_imm_ud(6u - log2_bits_per_vertex));
2179
2180 if (per_slot_offset.file != BAD_FILE) {
2181 /* Set the per-slot offset to dword_index / 4, so that we'll write to
2182 * the appropriate OWord within the control data header.
2183 */
2184 abld.SHR(per_slot_offset, dword_index, elk_imm_ud(2u));
2185 }
2186
2187 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2188 * write to the appropriate DWORD within the OWORD.
2189 */
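      /* For illustration: dword_index == 6 gives channel 6 & 3 == 2, so
       * after the shift below the mask becomes (1 << 2) << 16 == 0x40000,
       * enabling DWord 2 of the OWord.
       */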
2190 elk_fs_reg channel = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2191 fwa_bld.AND(channel, dword_index, elk_imm_ud(3u));
2192 channel_mask = intexp2(fwa_bld, channel);
2193 /* Then the channel masks need to be in bits 23:16. */
2194 fwa_bld.SHL(channel_mask, channel_mask, elk_imm_ud(16u));
2195 }
2196
2197 /* If there are channel masks, add 3 extra copies of the data. */
2198 const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2199 elk_fs_reg sources[4];
2200
2201 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
2202 sources[i] = this->control_data_bits;
2203
2204 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2205 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2206 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2207 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2208 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, length);
2209 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
2210 abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2211
2212 elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2213 srcs, ARRAY_SIZE(srcs));
2214
2215 /* We need to increment Global Offset by 256-bits to make room for
2216 * Broadwell's extra "Vertex Count" payload at the beginning of the
2217 * URB entry. Since this is an OWord message, Global Offset is counted
2218 * in 128-bit units, so we must set it to 2.
2219 */
2220 if (gs_prog_data->static_vertex_count == -1)
2221 inst->offset = 2;
2222 }
2223
2224 static void
2225 set_gs_stream_control_data_bits(nir_to_elk_state &ntb, const elk_fs_reg &vertex_count,
2226 unsigned stream_id)
2227 {
2228 elk_fs_visitor &s = ntb.s;
2229
2230 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2231
2232 /* Note: we are calling this *before* increasing vertex_count, so
2233 * the vertex_count register we are passed equals vertex_count - 1 in the formula above.
2234 */
2235
2236 /* Stream mode uses 2 bits per vertex */
2237 assert(s.gs_compile->control_data_bits_per_vertex == 2);
2238
2239 /* Must be a valid stream */
2240 assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2241
2242 /* Control data bits are initialized to 0 so we don't have to set any
2243 * bits when sending vertices to stream 0.
2244 */
2245 if (stream_id == 0)
2246 return;
2247
2248 const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL);
2249
2250 /* reg::sid = stream_id */
2251 elk_fs_reg sid = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2252 abld.MOV(sid, elk_imm_ud(stream_id));
2253
2254 /* reg:shift_count = 2 * (vertex_count - 1) */
2255 elk_fs_reg shift_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2256 abld.SHL(shift_count, vertex_count, elk_imm_ud(1u));
2257
2258 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2259 * attention to the lower 5 bits of its second source argument, so on this
2260 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2261 * stream_id << ((2 * (vertex_count - 1)) % 32).
2262 */
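   /* For illustration: sending the third vertex (register value 2) to
    * stream 2 gives shift_count == 4 and mask == 2 << 4 == 0x20, placing
    * stream ID 2 in control data bits 5:4.
    */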
2263 elk_fs_reg mask = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2264 abld.SHL(mask, sid, shift_count);
2265 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2266 }
2267
2268 static void
2269 emit_gs_vertex(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src,
2270 unsigned stream_id)
2271 {
2272 elk_fs_visitor &s = ntb.s;
2273
2274 assert(s.stage == MESA_SHADER_GEOMETRY);
2275
2276 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2277
2278 elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2279 vertex_count.type = ELK_REGISTER_TYPE_UD;
2280
2281 /* Haswell and later hardware ignores the "Render Stream Select" bits
2282 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2283 * and instead sends all primitives down the pipeline for rasterization.
2284 * If the SOL stage is enabled, "Render Stream Select" is honored and
2285 * primitives bound to non-zero streams are discarded after stream output.
2286 *
2287 * Since the only purpose of primitives sent to non-zero streams is to
2288 * be recorded by transform feedback, we can simply discard all geometry
2289 * bound to these streams when transform feedback is disabled.
2290 */
2291 if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2292 return;
2293
2294 /* If we're outputting 32 control data bits or less, then we can wait
2295 * until the shader is over to output them all. Otherwise we need to
2296 * output them as we go. Now is the time to do it, since we're about to
2297 * output the vertex_count'th vertex, so it's guaranteed that the
2298 * control data bits associated with the (vertex_count - 1)th vertex are
2299 * correct.
2300 */
2301 if (s.gs_compile->control_data_header_size_bits > 32) {
2302 const fs_builder abld =
2303 ntb.bld.annotate("emit vertex: emit control data bits");
2304
2305 /* Only emit control data bits if we've finished accumulating a batch
2306 * of 32 bits. This is the case when:
2307 *
2308 * (vertex_count * bits_per_vertex) % 32 == 0
2309 *
2310 * (in other words, when the last 5 bits of vertex_count *
2311 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
2312 * integer n (which is always the case, since bits_per_vertex is
2313 * always 1 or 2), this is equivalent to requiring that the last 5-n
2314 * bits of vertex_count are 0:
2315 *
2316 * vertex_count & (2^(5-n) - 1) == 0
2317 *
2318 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2319 * equivalent to:
2320 *
2321 * vertex_count & (32 / bits_per_vertex - 1) == 0
2322 *
2323 * TODO: If vertex_count is an immediate, we could do some of this math
2324 * at compile time...
2325 */
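         /* For illustration: with 2 bits per vertex the immediate below is
          * 32 / 2 - 1 == 15, so the accumulated bits are flushed whenever
          * vertex_count is a multiple of 16.
          */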
2326 elk_fs_inst *inst =
2327 abld.AND(ntb.bld.null_reg_d(), vertex_count,
2328 elk_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2329 inst->conditional_mod = ELK_CONDITIONAL_Z;
2330
2331 abld.IF(ELK_PREDICATE_NORMAL);
2332 /* If vertex_count is 0, then no control data bits have been
2333 * accumulated yet, so we can skip emitting them.
2334 */
2335 abld.CMP(ntb.bld.null_reg_d(), vertex_count, elk_imm_ud(0u),
2336 ELK_CONDITIONAL_NEQ);
2337 abld.IF(ELK_PREDICATE_NORMAL);
2338 s.emit_gs_control_data_bits(vertex_count);
2339 abld.emit(ELK_OPCODE_ENDIF);
2340
2341 /* Reset control_data_bits to 0 so we can start accumulating a new
2342 * batch.
2343 *
2344 * Note: in the case where vertex_count == 0, this neutralizes the
2345 * effect of any call to EndPrimitive() that the shader may have
2346 * made before outputting its first vertex.
2347 */
2348 inst = abld.MOV(s.control_data_bits, elk_imm_ud(0u));
2349 inst->force_writemask_all = true;
2350 abld.emit(ELK_OPCODE_ENDIF);
2351 }
2352
2353 s.emit_urb_writes(vertex_count);
2354
2355 /* In stream mode we have to set control data bits for all vertices
2356 * unless we have disabled control data bits completely (which we do
2357 * for MESA_PRIM_POINTS outputs that don't use streams).
2358 */
2359 if (s.gs_compile->control_data_header_size_bits > 0 &&
2360 gs_prog_data->control_data_format ==
2361 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2362 set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2363 }
2364 }
2365
2366 static void
2367 emit_gs_input_load(nir_to_elk_state &ntb, const elk_fs_reg &dst,
2368 const nir_src &vertex_src,
2369 unsigned base_offset,
2370 const nir_src &offset_src,
2371 unsigned num_components,
2372 unsigned first_component)
2373 {
2374 const fs_builder &bld = ntb.bld;
2375 elk_fs_visitor &s = ntb.s;
2376
2377 assert(type_sz(dst.type) == 4);
2378 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2379 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2380
2381 /* TODO: figure out push input layout for invocations == 1 */
2382 if (gs_prog_data->invocations == 1 &&
2383 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2384 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2385 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2386 nir_src_as_uint(vertex_src) * push_reg_count;
2387 const elk_fs_reg attr = elk_fs_reg(ATTR, 0, dst.type);
2388 for (unsigned i = 0; i < num_components; i++) {
2389 ntb.bld.MOV(offset(dst, bld, i),
2390 offset(attr, bld, imm_offset + i + first_component));
2391 }
2392 return;
2393 }
2394
2395 /* Resort to the pull model. Ensure the VUE handles are provided. */
2396 assert(gs_prog_data->base.include_vue_handles);
2397
2398 elk_fs_reg start = s.gs_payload().icp_handle_start;
2399 elk_fs_reg icp_handle = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2400
2401 if (gs_prog_data->invocations == 1) {
2402 if (nir_src_is_const(vertex_src)) {
2403 /* The vertex index is constant; just select the proper URB handle. */
2404 icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src));
2405 } else {
2406 /* The vertex index is non-constant. We need to use indirect
2407 * addressing to fetch the proper URB handle.
2408 *
2409 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2410 * indicating that channel <n> should read the handle from
2411 * DWord <n>. We convert that to bytes by multiplying by 4.
2412 *
2413 * Next, we convert the vertex index to bytes by multiplying
2414 * by 32 (shifting by 5), and add the two together. This is
2415 * the final indirect byte offset.
2416 */
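         /* For illustration (assuming 32-byte handle registers): a vertex
          * index of 3 becomes 3 << 5 == 96 bytes, and adding the per-channel
          * offsets <0, 4, ..., 28> makes each channel read its handle DWord
          * from the fourth handle register.
          */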
2417 elk_fs_reg sequence =
2418 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2419 elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2420 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2421 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2422
2423 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2424 bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2425 /* Convert vertex_index to bytes (multiply by 32) */
2426 bld.SHL(vertex_offset_bytes,
2427 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2428 elk_imm_ud(5u));
2429 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2430
2431 /* Use first_icp_handle as the base offset. There is one register
2432 * of URB handles per vertex, so inform the register allocator that
2433 * we might read up to nir->info.gs.vertices_in registers.
2434 */
2435 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2436 elk_fs_reg(icp_offset_bytes),
2437 elk_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE));
2438 }
2439 } else {
2440 assert(gs_prog_data->invocations > 1);
2441
2442 if (nir_src_is_const(vertex_src)) {
2443 unsigned vertex = nir_src_as_uint(vertex_src);
2444 assert(vertex <= 5);
2445 bld.MOV(icp_handle, component(start, vertex));
2446 } else {
2447 /* The vertex index is non-constant. We need to use indirect
2448 * addressing to fetch the proper URB handle.
2449 *
2450 */
2451 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2452
2453 /* Convert vertex_index to bytes (multiply by 4) */
2454 bld.SHL(icp_offset_bytes,
2455 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2456 elk_imm_ud(2u));
2457
2458 /* Use first_icp_handle as the base offset. There is one DWord
2459 * of URB handles per vertex, so inform the register allocator that
2460 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2461 */
2462 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2463 elk_fs_reg(icp_offset_bytes),
2464 elk_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2465 REG_SIZE));
2466 }
2467 }
2468
2469 elk_fs_inst *inst;
2470 elk_fs_reg indirect_offset = get_nir_src(ntb, offset_src);
2471
2472 if (nir_src_is_const(offset_src)) {
2473 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2474 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2475
2476 /* Constant indexing - use global offset. */
2477 if (first_component != 0) {
2478 unsigned read_components = num_components + first_component;
2479 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2480 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2481 ARRAY_SIZE(srcs));
2482 inst->size_written = read_components *
2483 tmp.component_size(inst->exec_size);
2484 for (unsigned i = 0; i < num_components; i++) {
2485 bld.MOV(offset(dst, bld, i),
2486 offset(tmp, bld, i + first_component));
2487 }
2488 } else {
2489 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2490 ARRAY_SIZE(srcs));
2491 inst->size_written = num_components *
2492 dst.component_size(inst->exec_size);
2493 }
2494 inst->offset = base_offset + nir_src_as_uint(offset_src);
2495 } else {
2496 /* Indirect indexing - use per-slot offsets as well. */
2497 unsigned read_components = num_components + first_component;
2498 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2499
2500 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2501 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2502 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2503
2504 if (first_component != 0) {
2505 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2506 srcs, ARRAY_SIZE(srcs));
2507 inst->size_written = read_components *
2508 tmp.component_size(inst->exec_size);
2509 for (unsigned i = 0; i < num_components; i++) {
2510 bld.MOV(offset(dst, bld, i),
2511 offset(tmp, bld, i + first_component));
2512 }
2513 } else {
2514 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2515 srcs, ARRAY_SIZE(srcs));
2516 inst->size_written = num_components *
2517 dst.component_size(inst->exec_size);
2518 }
2519 inst->offset = base_offset;
2520 }
2521 }
2522
2523 static elk_fs_reg
2524 get_indirect_offset(nir_to_elk_state &ntb, nir_intrinsic_instr *instr)
2525 {
2526 nir_src *offset_src = nir_get_io_offset_src(instr);
2527
2528 if (nir_src_is_const(*offset_src)) {
2529 /* The only constant offset we should find is 0. elk_nir.c's
2530 * add_const_offset_to_base() will fold other constant offsets
2531 * into the "base" index.
2532 */
2533 assert(nir_src_as_uint(*offset_src) == 0);
2534 return elk_fs_reg();
2535 }
2536
2537 return get_nir_src(ntb, *offset_src);
2538 }
2539
2540 static void
2541 fs_nir_emit_vs_intrinsic(nir_to_elk_state &ntb,
2542 nir_intrinsic_instr *instr)
2543 {
2544 const fs_builder &bld = ntb.bld;
2545 elk_fs_visitor &s = ntb.s;
2546 assert(s.stage == MESA_SHADER_VERTEX);
2547
2548 elk_fs_reg dest;
2549 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2550 dest = get_nir_def(ntb, instr->def);
2551
2552 switch (instr->intrinsic) {
2553 case nir_intrinsic_load_vertex_id:
2554 case nir_intrinsic_load_base_vertex:
2555 unreachable("should be lowered by nir_lower_system_values()");
2556
2557 case nir_intrinsic_load_input: {
2558 assert(instr->def.bit_size == 32);
2559 const elk_fs_reg src = offset(elk_fs_reg(ATTR, 0, dest.type), bld,
2560 nir_intrinsic_base(instr) * 4 +
2561 nir_intrinsic_component(instr) +
2562 nir_src_as_uint(instr->src[0]));
2563
2564 for (unsigned i = 0; i < instr->num_components; i++)
2565 bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2566 break;
2567 }
2568
2569 case nir_intrinsic_load_vertex_id_zero_base:
2570 case nir_intrinsic_load_instance_id:
2571 case nir_intrinsic_load_base_instance:
2572 case nir_intrinsic_load_draw_id:
2573 case nir_intrinsic_load_first_vertex:
2574 case nir_intrinsic_load_is_indexed_draw:
2575 unreachable("lowered by elk_nir_lower_vs_inputs");
2576
2577 default:
2578 fs_nir_emit_intrinsic(ntb, bld, instr);
2579 break;
2580 }
2581 }
2582
2583 static elk_fs_reg
2584 get_tcs_single_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2585 nir_intrinsic_instr *instr)
2586 {
2587 elk_fs_visitor &s = ntb.s;
2588
2589 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2590 const nir_src &vertex_src = instr->src[0];
2591 nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2592
2593 const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2594
2595 elk_fs_reg icp_handle;
2596
2597 if (nir_src_is_const(vertex_src)) {
2598 /* Emit a MOV to resolve <0,1,0> regioning. */
2599 icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2600 unsigned vertex = nir_src_as_uint(vertex_src);
2601 bld.MOV(icp_handle, component(start, vertex));
2602 } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2603 vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2604 /* For the common case of only 1 instance, an array index of
2605 * gl_InvocationID means reading the handles from the start. Skip all
2606 * the indirect work.
2607 */
2608 icp_handle = start;
2609 } else {
2610 /* The vertex index is non-constant. We need to use indirect
2611 * addressing to fetch the proper URB handle.
2612 */
2613 icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2614
2615 /* Each ICP handle is a single DWord (4 bytes) */
2616 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2617 bld.SHL(vertex_offset_bytes,
2618 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2619 elk_imm_ud(2u));
2620
2621 /* We might read up to 4 registers. */
2622 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2623 start, vertex_offset_bytes,
2624 elk_imm_ud(4 * REG_SIZE));
2625 }
2626
2627 return icp_handle;
2628 }
2629
2630 static elk_fs_reg
2631 get_tcs_multi_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2632 nir_intrinsic_instr *instr)
2633 {
2634 elk_fs_visitor &s = ntb.s;
2635 const intel_device_info *devinfo = s.devinfo;
2636
2637 struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) s.key;
2638 const nir_src &vertex_src = instr->src[0];
2639 const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2640
2641 const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2642
2643 if (nir_src_is_const(vertex_src))
2644 return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2645
2646 /* The vertex index is non-constant. We need to use indirect
2647 * addressing to fetch the proper URB handle.
2648 *
2649 * First, we start with the sequence indicating that channel <n>
2650 * should read the handle from DWord <n>. We convert that to bytes
2651 * by multiplying by 4.
2652 *
2653 * Next, we convert the vertex index to bytes by multiplying
2654 * by the GRF size (by shifting), and add the two together. This is
2655 * the final indirect byte offset.
2656 */
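   /* For illustration, assuming a 32-byte GRF (reg_unit == 1):
    * ffs(32) - 1 == 5, so the SHL below multiplies the vertex index by 32,
    * and the per-channel offsets 0, 4, ..., 28 select the DWord within that
    * handle register.
    */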
2657 elk_fs_reg icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2658 elk_fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2659 elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2660 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2661 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2662
2663 /* Offsets will be 0, 4, 8, ... */
2664 bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2665 /* Convert vertex_index to bytes (multiply by the GRF size in bytes) */
2666 assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2667 bld.SHL(vertex_offset_bytes,
2668 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2669 elk_imm_ud(ffs(grf_size_bytes) - 1));
2670 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2671
2672 /* Use start of ICP handles as the base offset. There is one register
2673 * of URB handles per vertex, so inform the register allocator that
2674 * we might read up to nir->info.gs.vertices_in registers.
2675 */
2676 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2677 icp_offset_bytes,
2678 elk_imm_ud(elk_tcs_prog_key_input_vertices(tcs_key) *
2679 grf_size_bytes));
2680
2681 return icp_handle;
2682 }
2683
2684 static void
2685 emit_barrier(nir_to_elk_state &ntb)
2686 {
2687 const intel_device_info *devinfo = ntb.devinfo;
2688 const fs_builder &bld = ntb.bld;
2689 elk_fs_visitor &s = ntb.s;
2690
2691 /* We are getting the barrier ID from the compute shader header */
2692 assert(gl_shader_stage_uses_workgroup(s.stage));
2693
2694 elk_fs_reg payload = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
2695
2696 /* Clear the message payload */
2697 bld.exec_all().group(8, 0).MOV(payload, elk_imm_ud(0u));
2698
2699 assert(gl_shader_stage_is_compute(s.stage));
2700
2701 uint32_t barrier_id_mask;
2702 switch (devinfo->ver) {
2703 case 7:
2704 case 8:
2705 barrier_id_mask = 0x0f000000u; break;
2706 default:
2707 unreachable("barrier is only available on gen >= 7");
2708 }
2709
2710 /* Copy the barrier id from r0.2 to the message payload reg.2 */
2711 elk_fs_reg r0_2 = elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD));
2712 bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
2713 elk_imm_ud(barrier_id_mask));
2714
2715 /* Emit a gateway "barrier" message using the payload we set up, followed
2716 * by a wait instruction.
2717 */
2718 bld.exec_all().emit(ELK_SHADER_OPCODE_BARRIER, reg_undef, payload);
2719 }
2720
2721 static void
2722 emit_tcs_barrier(nir_to_elk_state &ntb)
2723 {
2724 const fs_builder &bld = ntb.bld;
2725 elk_fs_visitor &s = ntb.s;
2726
2727 assert(s.stage == MESA_SHADER_TESS_CTRL);
2728 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2729
2730 elk_fs_reg m0 = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2731 elk_fs_reg m0_2 = component(m0, 2);
2732
2733 const fs_builder chanbld = bld.exec_all().group(1, 0);
2734
2735 /* Zero the message header */
2736 bld.exec_all().MOV(m0, elk_imm_ud(0u));
2737
2738 /* Copy "Barrier ID" from r0.2, bits 16:13 */
2739 chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
2740 elk_imm_ud(INTEL_MASK(16, 13)));
2741
2742 /* Shift it up to bits 27:24. */
2743 chanbld.SHL(m0_2, m0_2, elk_imm_ud(11));
2744
2745 /* Set the Barrier Count and the enable bit */
2746 chanbld.OR(m0_2, m0_2,
2747 elk_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
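   /* For illustration: the AND/SHL above move barrier ID bits 16:13 up to
    * bits 27:24 (13 + 11 == 24), and, e.g., with 8 TCS instances the OR adds
    * (8 << 9) | (1 << 15) == 0x9000 for the Barrier Count and enable fields.
    */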
2748
2749 bld.emit(ELK_SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2750 }
2751
2752 static void
2753 fs_nir_emit_tcs_intrinsic(nir_to_elk_state &ntb,
2754 nir_intrinsic_instr *instr)
2755 {
2756 const intel_device_info *devinfo = ntb.devinfo;
2757 const fs_builder &bld = ntb.bld;
2758 elk_fs_visitor &s = ntb.s;
2759
2760 assert(s.stage == MESA_SHADER_TESS_CTRL);
2761 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2762 struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2763
2764 elk_fs_reg dst;
2765 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2766 dst = get_nir_def(ntb, instr->def);
2767
2768 switch (instr->intrinsic) {
2769 case nir_intrinsic_load_primitive_id:
2770 bld.MOV(dst, s.tcs_payload().primitive_id);
2771 break;
2772 case nir_intrinsic_load_invocation_id:
2773 bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
2774 break;
2775
2776 case nir_intrinsic_barrier:
2777 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
2778 fs_nir_emit_intrinsic(ntb, bld, instr);
2779 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
2780 if (tcs_prog_data->instances != 1)
2781 emit_tcs_barrier(ntb);
2782 }
2783 break;
2784
2785 case nir_intrinsic_load_input:
2786 unreachable("nir_lower_io should never give us these.");
2787 break;
2788
2789 case nir_intrinsic_load_per_vertex_input: {
2790 assert(instr->def.bit_size == 32);
2791 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2792 unsigned imm_offset = nir_intrinsic_base(instr);
2793 elk_fs_inst *inst;
2794
2795 const bool multi_patch =
2796 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
2797
2798 elk_fs_reg icp_handle = multi_patch ?
2799 get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
2800 get_tcs_single_patch_icp_handle(ntb, bld, instr);
2801
2802 /* We can only read two double components with each URB read, so
2803 * we send two read messages in that case, each one loading up to
2804 * two double components.
2805 */
2806 unsigned num_components = instr->num_components;
2807 unsigned first_component = nir_intrinsic_component(instr);
2808
2809 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2810 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2811
2812 if (indirect_offset.file == BAD_FILE) {
2813 /* Constant indexing - use global offset. */
2814 if (first_component != 0) {
2815 unsigned read_components = num_components + first_component;
2816 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2817 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2818 ARRAY_SIZE(srcs));
2819 for (unsigned i = 0; i < num_components; i++) {
2820 bld.MOV(offset(dst, bld, i),
2821 offset(tmp, bld, i + first_component));
2822 }
2823 } else {
2824 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2825 ARRAY_SIZE(srcs));
2826 }
2827 inst->offset = imm_offset;
2828 } else {
2829 /* Indirect indexing - use per-slot offsets as well. */
2830 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2831
2832 if (first_component != 0) {
2833 unsigned read_components = num_components + first_component;
2834 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2835 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2836 srcs, ARRAY_SIZE(srcs));
2837 for (unsigned i = 0; i < num_components; i++) {
2838 bld.MOV(offset(dst, bld, i),
2839 offset(tmp, bld, i + first_component));
2840 }
2841 } else {
2842 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2843 srcs, ARRAY_SIZE(srcs));
2844 }
2845 inst->offset = imm_offset;
2846 }
2847 inst->size_written = (num_components + first_component) *
2848 inst->dst.component_size(inst->exec_size);
2849
2850 /* Copy the temporary to the destination to deal with writemasking.
2851 *
2852 * Also attempt to deal with gl_PointSize being in the .w component.
2853 */
2854 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2855 assert(type_sz(dst.type) == 4);
2856 inst->dst = bld.vgrf(dst.type, 4);
2857 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
2858 bld.MOV(dst, offset(inst->dst, bld, 3));
2859 }
2860 break;
2861 }
2862
2863 case nir_intrinsic_load_output:
2864 case nir_intrinsic_load_per_vertex_output: {
2865 assert(instr->def.bit_size == 32);
2866 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2867 unsigned imm_offset = nir_intrinsic_base(instr);
2868 unsigned first_component = nir_intrinsic_component(instr);
2869
2870 elk_fs_inst *inst;
2871 if (indirect_offset.file == BAD_FILE) {
2872 /* This MOV replicates the output handle to all enabled channels
2873 * in SINGLE_PATCH mode.
2874 */
2875 elk_fs_reg patch_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2876 bld.MOV(patch_handle, s.tcs_payload().patch_urb_output);
2877
2878 {
2879 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2880 srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
2881
2882 if (first_component != 0) {
2883 unsigned read_components =
2884 instr->num_components + first_component;
2885 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2886 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2887 srcs, ARRAY_SIZE(srcs));
2888 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2889 for (unsigned i = 0; i < instr->num_components; i++) {
2890 bld.MOV(offset(dst, bld, i),
2891 offset(tmp, bld, i + first_component));
2892 }
2893 } else {
2894 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2895 srcs, ARRAY_SIZE(srcs));
2896 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2897 }
2898 inst->offset = imm_offset;
2899 }
2900 } else {
2901 /* Indirect indexing - use per-slot offsets as well. */
2902 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2903 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2904 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2905
2906 if (first_component != 0) {
2907 unsigned read_components =
2908 instr->num_components + first_component;
2909 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2910 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2911 srcs, ARRAY_SIZE(srcs));
2912 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2913 for (unsigned i = 0; i < instr->num_components; i++) {
2914 bld.MOV(offset(dst, bld, i),
2915 offset(tmp, bld, i + first_component));
2916 }
2917 } else {
2918 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2919 srcs, ARRAY_SIZE(srcs));
2920 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2921 }
2922 inst->offset = imm_offset;
2923 }
2924 break;
2925 }
2926
2927 case nir_intrinsic_store_output:
2928 case nir_intrinsic_store_per_vertex_output: {
2929 assert(nir_src_bit_size(instr->src[0]) == 32);
2930 elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
2931 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2932 unsigned imm_offset = nir_intrinsic_base(instr);
2933 unsigned mask = nir_intrinsic_write_mask(instr);
2934
2935 if (mask == 0)
2936 break;
2937
2938 unsigned num_components = util_last_bit(mask);
2939 unsigned first_component = nir_intrinsic_component(instr);
2940 assert((first_component + num_components) <= 4);
2941
2942 mask = mask << first_component;
2943
2944 elk_fs_reg mask_reg;
2945 if (mask != WRITEMASK_XYZW)
2946 mask_reg = elk_imm_ud(mask << 16);
2947
2948 elk_fs_reg sources[4];
2949
2950 unsigned m = first_component;
2951 for (unsigned i = 0; i < num_components; i++) {
2952 int c = i + first_component;
2953 if (mask & (1 << c)) {
2954 sources[m++] = offset(value, bld, i);
2955 } else {
2956 m++;
2957 }
2958 }
2959
2960 assert(m == (first_component + num_components));
2961
2962 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2963 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2964 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2965 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
2966 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, m);
2967 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(m);
2968 bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
2969
2970 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2971 srcs, ARRAY_SIZE(srcs));
2972 inst->offset = imm_offset;
2973 break;
2974 }
2975
2976 default:
2977 fs_nir_emit_intrinsic(ntb, bld, instr);
2978 break;
2979 }
2980 }
2981
2982 static void
2983 fs_nir_emit_tes_intrinsic(nir_to_elk_state &ntb,
2984 nir_intrinsic_instr *instr)
2985 {
2986 const intel_device_info *devinfo = ntb.devinfo;
2987 const fs_builder &bld = ntb.bld;
2988 elk_fs_visitor &s = ntb.s;
2989
2990 assert(s.stage == MESA_SHADER_TESS_EVAL);
2991 struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(s.prog_data);
2992
2993 elk_fs_reg dest;
2994 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2995 dest = get_nir_def(ntb, instr->def);
2996
2997 switch (instr->intrinsic) {
2998 case nir_intrinsic_load_primitive_id:
2999 bld.MOV(dest, s.tes_payload().primitive_id);
3000 break;
3001
3002 case nir_intrinsic_load_tess_coord:
3003 for (unsigned i = 0; i < 3; i++)
3004 bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3005 break;
3006
3007 case nir_intrinsic_load_input:
3008 case nir_intrinsic_load_per_vertex_input: {
3009 assert(instr->def.bit_size == 32);
3010 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3011 unsigned imm_offset = nir_intrinsic_base(instr);
3012 unsigned first_component = nir_intrinsic_component(instr);
3013
3014 elk_fs_inst *inst;
3015 if (indirect_offset.file == BAD_FILE) {
3016 /* Arbitrarily only push up to 32 vec4 slots worth of data,
3017 * which is 16 registers (since each holds 2 vec4 slots).
3018 */
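         /* For illustration: an imm_offset of 5 falls inside the 32-slot
          * push window below, and the pushed read length becomes at least
          * 5 / 2 + 1 == 3 registers.
          */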
3019 const unsigned max_push_slots = 32;
3020 if (imm_offset < max_push_slots) {
3021 const elk_fs_reg src = horiz_offset(elk_fs_reg(ATTR, 0, dest.type),
3022 4 * imm_offset + first_component);
3023 for (int i = 0; i < instr->num_components; i++)
3024 bld.MOV(offset(dest, bld, i), component(src, i));
3025
3026 tes_prog_data->base.urb_read_length =
3027 MAX2(tes_prog_data->base.urb_read_length,
3028 (imm_offset / 2) + 1);
3029 } else {
3030 /* Replicate the patch handle to all enabled channels */
3031 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3032 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3033
3034 if (first_component != 0) {
3035 unsigned read_components =
3036 instr->num_components + first_component;
3037 elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3038 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3039 srcs, ARRAY_SIZE(srcs));
3040 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3041 for (unsigned i = 0; i < instr->num_components; i++) {
3042 bld.MOV(offset(dest, bld, i),
3043 offset(tmp, bld, i + first_component));
3044 }
3045 } else {
3046 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3047 srcs, ARRAY_SIZE(srcs));
3048 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3049 }
3050 inst->offset = imm_offset;
3051 }
3052 } else {
3053 /* Indirect indexing - use per-slot offsets as well. */
3054
3055 /* We can only read two double components with each URB read, so
3056 * we send two read messages in that case, each one loading up to
3057 * two double components.
3058 */
3059 unsigned num_components = instr->num_components;
3060
3061 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3062 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3063 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3064
3065 if (first_component != 0) {
3066 unsigned read_components =
3067 num_components + first_component;
3068 elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3069 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3070 srcs, ARRAY_SIZE(srcs));
3071 for (unsigned i = 0; i < num_components; i++) {
3072 bld.MOV(offset(dest, bld, i),
3073 offset(tmp, bld, i + first_component));
3074 }
3075 } else {
3076 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3077 srcs, ARRAY_SIZE(srcs));
3078 }
3079 inst->offset = imm_offset;
3080 inst->size_written = (num_components + first_component) *
3081 inst->dst.component_size(inst->exec_size);
3082 }
3083 break;
3084 }
3085 default:
3086 fs_nir_emit_intrinsic(ntb, bld, instr);
3087 break;
3088 }
3089 }
3090
3091 static void
3092 fs_nir_emit_gs_intrinsic(nir_to_elk_state &ntb,
3093 nir_intrinsic_instr *instr)
3094 {
3095 const fs_builder &bld = ntb.bld;
3096 elk_fs_visitor &s = ntb.s;
3097
3098 assert(s.stage == MESA_SHADER_GEOMETRY);
3099 elk_fs_reg indirect_offset;
3100
3101 elk_fs_reg dest;
3102 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3103 dest = get_nir_def(ntb, instr->def);
3104
3105 switch (instr->intrinsic) {
3106 case nir_intrinsic_load_primitive_id:
3107 assert(s.stage == MESA_SHADER_GEOMETRY);
3108 assert(elk_gs_prog_data(s.prog_data)->include_primitive_id);
3109 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
3110 break;
3111
3112 case nir_intrinsic_load_input:
3113 unreachable("load_input intrinsics are invalid for the GS stage");
3114
3115 case nir_intrinsic_load_per_vertex_input:
3116 emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3117 instr->src[1], instr->num_components,
3118 nir_intrinsic_component(instr));
3119 break;
3120
3121 case nir_intrinsic_emit_vertex_with_counter:
3122 emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3123 break;
3124
3125 case nir_intrinsic_end_primitive_with_counter:
3126 emit_gs_end_primitive(ntb, instr->src[0]);
3127 break;
3128
3129 case nir_intrinsic_set_vertex_and_primitive_count:
3130 bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3131 break;
3132
3133 case nir_intrinsic_load_invocation_id: {
3134 elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3135 assert(val.file != BAD_FILE);
3136 dest.type = val.type;
3137 bld.MOV(dest, val);
3138 break;
3139 }
3140
3141 default:
3142 fs_nir_emit_intrinsic(ntb, bld, instr);
3143 break;
3144 }
3145 }
3146
3147 /**
3148 * Fetch the current render target layer index.
3149 */
3150 static elk_fs_reg
3151 fetch_render_target_array_index(const fs_builder &bld)
3152 {
3153 if (bld.shader->devinfo->ver >= 6) {
3154 /* The render target array index is provided in the thread payload as
3155 * bits 26:16 of r0.0.
3156 */
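      /* For illustration: reading the UW at r0.0's high word gives bits
       * 31:16, and masking with 0x7ff keeps the low 11 of those, i.e. bits
       * 26:16 as described above.
       */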
3157 const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3158 bld.AND(idx, elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 0, 1),
3159 elk_imm_uw(0x7ff));
3160 return idx;
3161 } else {
3162 /* Pre-SNB we only ever render into the first layer of the framebuffer
3163 * since layered rendering is not implemented.
3164 */
3165 return elk_imm_ud(0);
3166 }
3167 }
3168
3169 /* Sample from the MCS surface attached to this multisample texture. */
3170 static elk_fs_reg
3171 emit_mcs_fetch(nir_to_elk_state &ntb, const elk_fs_reg &coordinate, unsigned components,
3172 const elk_fs_reg &texture,
3173 const elk_fs_reg &texture_handle)
3174 {
3175 const fs_builder &bld = ntb.bld;
3176
3177 const elk_fs_reg dest = ntb.s.vgrf(glsl_uvec4_type());
3178
3179 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3180 srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3181 srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3182 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3183 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3184 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(components);
3185 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
3186 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
3187
3188 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3189 ARRAY_SIZE(srcs));
3190
3191 /* We only care about one or two regs of response, but the sampler always
3192 * writes 4/8.
3193 */
3194 inst->size_written = 4 * dest.component_size(inst->exec_size);
3195
3196 return dest;
3197 }
3198
3199 /**
3200 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3201 * framebuffer at the current fragment coordinates and sample index.
3202 */
3203 static elk_fs_inst *
3204 emit_non_coherent_fb_read(nir_to_elk_state &ntb, const fs_builder &bld, const elk_fs_reg &dst,
3205 unsigned target)
3206 {
3207 elk_fs_visitor &s = ntb.s;
3208
3209 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3210 const elk_wm_prog_key *wm_key =
3211 reinterpret_cast<const elk_wm_prog_key *>(s.key);
3212 assert(!wm_key->coherent_fb_fetch);
3213
3214 /* Calculate the fragment coordinates. */
3215 const elk_fs_reg coords = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
3216 bld.MOV(offset(coords, bld, 0), s.pixel_x);
3217 bld.MOV(offset(coords, bld, 1), s.pixel_y);
3218 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3219
3220 /* Calculate the sample index and MCS payload when multisampling. Luckily
3221 * the MCS fetch message behaves deterministically for UMS surfaces, so it
3222 * shouldn't be necessary to recompile based on whether the framebuffer is
3223 * CMS or UMS.
3224 */
3225 assert(wm_key->multisample_fbo == ELK_ALWAYS ||
3226 wm_key->multisample_fbo == ELK_NEVER);
3227 if (wm_key->multisample_fbo &&
3228 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3229 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3230
3231 const elk_fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3232 const elk_fs_reg mcs = wm_key->multisample_fbo ?
3233 emit_mcs_fetch(ntb, coords, 3, elk_imm_ud(target), elk_fs_reg()) : elk_fs_reg();
3234
3235 /* Use either a normal or a CMS texel fetch message depending on whether
3236 * the framebuffer is single or multisample. On SKL+ use the wide CMS
3237 * message just in case the framebuffer uses 16x multisampling; it should
3238 * be equivalent to the normal CMS fetch for lower multisampling modes.
3239 */
3240 elk_opcode op;
3241 if (wm_key->multisample_fbo) {
3242 op = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
3243 } else {
3244 op = ELK_SHADER_OPCODE_TXF_LOGICAL;
3245 }
3246
3247 /* Emit the instruction. */
3248 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3249 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords;
3250 srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_ud(0);
3251 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample;
3252 srcs[TEX_LOGICAL_SRC_MCS] = mcs;
3253 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(target);
3254 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3255 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_ud(3);
3256 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_ud(0);
3257 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(0);
3258
3259 elk_fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3260 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3261
3262 return inst;
3263 }
3264
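/* Return a size-component float temporary for the n fragment-output registers
 * in regs[], reusing the VGRF already bound to them when one exists; otherwise
 * a fresh VGRF is allocated and bound to every entry of regs[].
 */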
3265 static elk_fs_reg
3266 alloc_temporary(const fs_builder &bld, unsigned size, elk_fs_reg *regs, unsigned n)
3267 {
3268 if (n && regs[0].file != BAD_FILE) {
3269 return regs[0];
3270
3271 } else {
3272 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, size);
3273
3274 for (unsigned i = 0; i < n; i++)
3275 regs[i] = tmp;
3276
3277 return tmp;
3278 }
3279 }
3280
3281 static elk_fs_reg
3282 alloc_frag_output(nir_to_elk_state &ntb, unsigned location)
3283 {
3284 elk_fs_visitor &s = ntb.s;
3285
3286 assert(s.stage == MESA_SHADER_FRAGMENT);
3287 const elk_wm_prog_key *const key =
3288 reinterpret_cast<const elk_wm_prog_key *>(s.key);
3289 const unsigned l = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_LOCATION);
3290 const unsigned i = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_INDEX);
3291
3292 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3293 return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3294
3295 else if (l == FRAG_RESULT_COLOR)
3296 return alloc_temporary(ntb.bld, 4, s.outputs,
3297 MAX2(key->nr_color_regions, 1));
3298
3299 else if (l == FRAG_RESULT_DEPTH)
3300 return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3301
3302 else if (l == FRAG_RESULT_STENCIL)
3303 return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3304
3305 else if (l == FRAG_RESULT_SAMPLE_MASK)
3306 return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3307
3308 else if (l >= FRAG_RESULT_DATA0 &&
3309 l < FRAG_RESULT_DATA0 + ELK_MAX_DRAW_BUFFERS)
3310 return alloc_temporary(ntb.bld, 4,
3311 &s.outputs[l - FRAG_RESULT_DATA0], 1);
3312
3313 else
3314 unreachable("Invalid location");
3315 }
3316
3317 static void
3318 emit_is_helper_invocation(nir_to_elk_state &ntb, elk_fs_reg result)
3319 {
3320 const fs_builder &bld = ntb.bld;
3321
3322    /* Unlike the regular gl_HelperInvocation, which is defined at dispatch
3323     * time, helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) also takes
3324     * demoted invocations into consideration.
3325 */
3326 result.type = ELK_REGISTER_TYPE_UD;
3327
3328 bld.MOV(result, elk_imm_ud(0));
3329
3330 /* See elk_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3331 unsigned width = bld.dispatch_width();
3332 for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3333 const fs_builder b = bld.group(MIN2(width, 16), i);
3334
3335 elk_fs_inst *mov = b.MOV(offset(result, b, i), elk_imm_ud(~0));
3336
3337 /* The at() ensures that any code emitted to get the predicate happens
3338 * before the mov right above. This is not an issue elsewhere because
3339 * lowering code already set up the builder this way.
3340 */
3341 elk_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3342 mov->predicate_inverse = true;
3343 }
3344 }
3345
3346 static void
3347 emit_fragcoord_interpolation(nir_to_elk_state &ntb, elk_fs_reg wpos)
3348 {
3349 const intel_device_info *devinfo = ntb.devinfo;
3350 const fs_builder &bld = ntb.bld;
3351 elk_fs_visitor &s = ntb.s;
3352
3353 assert(s.stage == MESA_SHADER_FRAGMENT);
3354
3355 /* gl_FragCoord.x */
3356 bld.MOV(wpos, s.pixel_x);
3357 wpos = offset(wpos, bld, 1);
3358
3359 /* gl_FragCoord.y */
3360 bld.MOV(wpos, s.pixel_y);
3361 wpos = offset(wpos, bld, 1);
3362
3363 /* gl_FragCoord.z */
3364 if (devinfo->ver >= 6) {
3365 bld.MOV(wpos, s.pixel_z);
3366 } else {
3367 bld.emit(ELK_FS_OPCODE_LINTERP, wpos,
3368 s.delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL],
3369 s.interp_reg(bld, VARYING_SLOT_POS, 2, 0));
3370 }
3371 wpos = offset(wpos, bld, 1);
3372
3373 /* gl_FragCoord.w: Already set up in emit_interpolation */
3374 bld.MOV(wpos, s.wpos_w);
3375 }
3376
3377 static elk_fs_reg
3378 emit_frontfacing_interpolation(nir_to_elk_state &ntb)
3379 {
3380 const intel_device_info *devinfo = ntb.devinfo;
3381 const fs_builder &bld = ntb.bld;
3382
3383 elk_fs_reg ff = bld.vgrf(ELK_REGISTER_TYPE_D);
3384
3385 if (devinfo->ver >= 6) {
3386 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3387 * a boolean result from this (~0/true or 0/false).
3388 *
3389 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3390 * this task in only one instruction:
3391 * - a negation source modifier will flip the bit; and
3392 * - a W -> D type conversion will sign extend the bit into the high
3393 * word of the destination.
3394 *
3395 * An ASR 15 fills the low word of the destination.
3396 */
3397 elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
3398 g0.negate = true;
3399
3400 bld.ASR(ff, g0, elk_imm_d(15));
3401 } else {
3402 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
3403 * a boolean result from this (1/true or 0/false).
3404 *
3405 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
3406 * the negation source modifier to flip it. Unfortunately the SHR
3407 * instruction only operates on UD (or D with an abs source modifier)
3408 * sources without negation.
3409 *
3410 * Instead, use ASR (which will give ~0/true or 0/false).
3411 */
3412 elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
3413 g1_6.negate = true;
3414
3415 bld.ASR(ff, g1_6, elk_imm_d(31));
3416 }
3417
3418 return ff;
3419 }
3420
3421 static elk_fs_reg
3422 emit_samplepos_setup(nir_to_elk_state &ntb)
3423 {
3424 const intel_device_info *devinfo = ntb.devinfo;
3425 const fs_builder &bld = ntb.bld;
3426 elk_fs_visitor &s = ntb.s;
3427
3428 assert(s.stage == MESA_SHADER_FRAGMENT);
3429 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3430 assert(devinfo->ver >= 6);
3431
3432 const fs_builder abld = bld.annotate("compute sample position");
3433 elk_fs_reg pos = abld.vgrf(ELK_REGISTER_TYPE_F, 2);
3434
3435 if (wm_prog_data->persample_dispatch == ELK_NEVER) {
3436 /* From ARB_sample_shading specification:
3437 * "When rendering to a non-multisample buffer, or if multisample
3438 * rasterization is disabled, gl_SamplePosition will always be
3439        *  (0.5, 0.5)."
3440 */
3441 bld.MOV(offset(pos, bld, 0), elk_imm_f(0.5f));
3442 bld.MOV(offset(pos, bld, 1), elk_imm_f(0.5f));
3443 return pos;
3444 }
3445
3446 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3447 * mode will be enabled.
3448 *
3449 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3450 * R31.1:0 Position Offset X/Y for Slot[3:0]
3451 * R31.3:2 Position Offset X/Y for Slot[7:4]
3452 * .....
3453 *
3454 * The X, Y sample positions come in as bytes in thread payload. So, read
3455 * the positions using vstride=16, width=8, hstride=2.
3456 */
3457 const elk_fs_reg sample_pos_reg =
3458 fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, ELK_REGISTER_TYPE_W);
3459
3460 for (unsigned i = 0; i < 2; i++) {
3461 elk_fs_reg tmp_d = bld.vgrf(ELK_REGISTER_TYPE_D);
3462 abld.MOV(tmp_d, subscript(sample_pos_reg, ELK_REGISTER_TYPE_B, i));
3463 /* Convert int_sample_pos to floating point */
3464 elk_fs_reg tmp_f = bld.vgrf(ELK_REGISTER_TYPE_F);
3465 abld.MOV(tmp_f, tmp_d);
3466 /* Scale to the range [0, 1] */
3467 abld.MUL(offset(pos, abld, i), tmp_f, elk_imm_f(1 / 16.0f));
3468 }
3469
3470 if (wm_prog_data->persample_dispatch == ELK_SOMETIMES) {
3471 check_dynamic_msaa_flag(abld, wm_prog_data,
3472 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3473 for (unsigned i = 0; i < 2; i++) {
3474 set_predicate(ELK_PREDICATE_NORMAL,
3475 bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3476 elk_imm_f(0.5f)));
3477 }
3478 }
3479
3480 return pos;
3481 }
3482
3483 static elk_fs_reg
3484 emit_sampleid_setup(nir_to_elk_state &ntb)
3485 {
3486 const intel_device_info *devinfo = ntb.devinfo;
3487 const fs_builder &bld = ntb.bld;
3488 elk_fs_visitor &s = ntb.s;
3489
3490 assert(s.stage == MESA_SHADER_FRAGMENT);
3491 ASSERTED elk_wm_prog_key *key = (elk_wm_prog_key*) s.key;
3492 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3493 assert(devinfo->ver >= 6);
3494
3495 const fs_builder abld = bld.annotate("compute sample id");
3496 elk_fs_reg sample_id = abld.vgrf(ELK_REGISTER_TYPE_UD);
3497
3498 assert(key->multisample_fbo != ELK_NEVER);
3499
3500 if (devinfo->ver >= 8) {
3501 /* Sample ID comes in as 4-bit numbers in g1.0:
3502 *
3503 * 15:12 Slot 3 SampleID (only used in SIMD16)
3504 * 11:8 Slot 2 SampleID (only used in SIMD16)
3505 * 7:4 Slot 1 SampleID
3506 * 3:0 Slot 0 SampleID
3507 *
3508 * Each slot corresponds to four channels, so we want to replicate each
3509 * half-byte value to 4 channels in a row:
3510 *
3511 * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
3512 * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
3513 *
3514 * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
3515 * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
3516 *
3517 * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
3518 * channels to read the first byte (7:0), and the second group of 8
3519 * channels to read the second byte (15:8). Then, we shift right by
3520 * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
3521 * values into place. Finally, we AND with 0xf to keep the low nibble.
3522 *
3523 * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
3524 * and(16) dst<1>D tmp<8,8,1>W 0xf:W
3525 *
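       * As a worked example (hypothetical payload contents): if the first word
       * of g1 held 0x3210, byte 0 is 0x10 and byte 1 is 0x32, so after the
       * SHR/AND above the four channels of slot 0 get sample ID 0, slot 1 gets
       * 1 and, in SIMD16, slots 2 and 3 get 2 and 3.
       *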
3526 * TODO: These payload bits exist on Gfx7 too, but they appear to always
3527 * be zero, so this code fails to work. We should find out why.
3528 */
3529 const elk_fs_reg tmp = abld.vgrf(ELK_REGISTER_TYPE_UW);
3530
3531 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3532 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
3533 /* According to the "PS Thread Payload for Normal Dispatch"
3534 * pages on the BSpec, the sample ids are stored in R1.0/R2.0 on gfx8+.
3535 */
3536 const struct elk_reg id_reg = elk_vec1_grf(i + 1, 0);
3537 hbld.SHR(offset(tmp, hbld, i),
3538 stride(retype(id_reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
3539 elk_imm_v(0x44440000));
3540 }
3541
3542 abld.AND(sample_id, tmp, elk_imm_w(0xf));
3543 } else {
3544 const elk_fs_reg t1 = component(abld.vgrf(ELK_REGISTER_TYPE_UD), 0);
3545 const elk_fs_reg t2 = abld.vgrf(ELK_REGISTER_TYPE_UW);
3546
3547 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
3548 * 8x multisampling, subspan 0 will represent sample N (where N
3549 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
3550 * 7. We can find the value of N by looking at R0.0 bits 7:6
3551 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
3552 * (since samples are always delivered in pairs). That is, we
3553 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
3554 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
3555 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
3556 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
3557 * populating a temporary variable with the sequence (0, 1, 2, 3),
3558 * and then reading from it using vstride=1, width=4, hstride=0.
3559 * These computations hold good for 4x multisampling as well.
3560 *
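       * As a worked example: if R0.0 bits 7:6 read 2, then N = 2 * 2 = 4 and
       * a SIMD8 invocation computes the sample IDs (4, 4, 4, 4, 5, 5, 5, 5).
       *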
3561 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
3562 * the first four slots are sample 0 of subspan 0; the next four
3563 * are sample 1 of subspan 0; the third group is sample 0 of
3564 * subspan 1, and finally sample 1 of subspan 1.
3565 */
3566
3567 /* SKL+ has an extra bit for the Starting Sample Pair Index to
3568 * accommodate 16x MSAA.
3569 */
3570 abld.exec_all().group(1, 0)
3571 .AND(t1, elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD)),
3572 elk_imm_ud(0xc0));
3573 abld.exec_all().group(1, 0).SHR(t1, t1, elk_imm_d(5));
3574
3575 /* This works for SIMD8-SIMD16. It also works for SIMD32 but only if we
3576 * can assume 4x MSAA. Disallow it on IVB+
3577 *
3578 * FINISHME: One day, we could come up with a way to do this that
3579 * actually works on gfx7.
3580 */
3581 if (devinfo->ver >= 7)
3582 s.limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
3583 abld.exec_all().group(8, 0).MOV(t2, elk_imm_v(0x32103210));
3584
3585 /* This special instruction takes care of setting vstride=1,
3586 * width=4, hstride=0 of t2 during an ADD instruction.
3587 */
3588 abld.emit(ELK_FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
3589 }
3590
3591 if (key->multisample_fbo == ELK_SOMETIMES) {
3592 check_dynamic_msaa_flag(abld, wm_prog_data,
3593 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3594 set_predicate(ELK_PREDICATE_NORMAL,
3595 abld.SEL(sample_id, sample_id, elk_imm_ud(0)));
3596 }
3597
3598 return sample_id;
3599 }
3600
3601 static elk_fs_reg
3602 emit_samplemaskin_setup(nir_to_elk_state &ntb)
3603 {
3604 const intel_device_info *devinfo = ntb.devinfo;
3605 const fs_builder &bld = ntb.bld;
3606 elk_fs_visitor &s = ntb.s;
3607
3608 assert(s.stage == MESA_SHADER_FRAGMENT);
3609 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3610 assert(devinfo->ver >= 6);
3611
3612 elk_fs_reg coverage_mask =
3613 fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, ELK_REGISTER_TYPE_D);
3614
3615 if (wm_prog_data->persample_dispatch == ELK_NEVER)
3616 return coverage_mask;
3617
3618 /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
3619 * and a mask representing which sample is being processed by the
3620 * current shader invocation.
3621 *
3622 * From the OES_sample_variables specification:
3623 * "When per-sample shading is active due to the use of a fragment input
3624 * qualified by "sample" or due to the use of the gl_SampleID or
3625 * gl_SamplePosition variables, only the bit for the current sample is
3626 * set in gl_SampleMaskIn."
3627 */
3628 const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
3629
3630 if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3631 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3632
3633 elk_fs_reg one = s.vgrf(glsl_int_type());
3634 elk_fs_reg enabled_mask = s.vgrf(glsl_int_type());
3635 abld.MOV(one, elk_imm_d(1));
3636 abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
3637 elk_fs_reg mask = bld.vgrf(ELK_REGISTER_TYPE_D);
3638 abld.AND(mask, enabled_mask, coverage_mask);
3639
3640 if (wm_prog_data->persample_dispatch == ELK_ALWAYS)
3641 return mask;
3642
3643 check_dynamic_msaa_flag(abld, wm_prog_data,
3644 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3645 set_predicate(ELK_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
3646
3647 return mask;
3648 }
3649
3650 static void
3651 fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
3652 nir_intrinsic_instr *instr)
3653 {
3654 const intel_device_info *devinfo = ntb.devinfo;
3655 const fs_builder &bld = ntb.bld;
3656 elk_fs_visitor &s = ntb.s;
3657
3658 assert(s.stage == MESA_SHADER_FRAGMENT);
3659
3660 elk_fs_reg dest;
3661 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3662 dest = get_nir_def(ntb, instr->def);
3663
3664 switch (instr->intrinsic) {
3665 case nir_intrinsic_load_front_face:
3666 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
3667 emit_frontfacing_interpolation(ntb));
3668 break;
3669
3670 case nir_intrinsic_load_sample_pos:
3671 case nir_intrinsic_load_sample_pos_or_center: {
3672 elk_fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
3673 assert(sample_pos.file != BAD_FILE);
3674 dest.type = sample_pos.type;
3675 bld.MOV(dest, sample_pos);
3676 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3677 break;
3678 }
3679
3680 case nir_intrinsic_load_layer_id:
3681 dest.type = ELK_REGISTER_TYPE_UD;
3682 bld.MOV(dest, fetch_render_target_array_index(bld));
3683 break;
3684
3685 case nir_intrinsic_is_helper_invocation:
3686 emit_is_helper_invocation(ntb, dest);
3687 break;
3688
3689 case nir_intrinsic_load_helper_invocation:
3690 case nir_intrinsic_load_sample_mask_in:
3691 case nir_intrinsic_load_sample_id:
3692 case nir_intrinsic_load_frag_shading_rate: {
3693 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3694 elk_fs_reg val = ntb.system_values[sv];
3695 assert(val.file != BAD_FILE);
3696 dest.type = val.type;
3697 bld.MOV(dest, val);
3698 break;
3699 }
3700
3701 case nir_intrinsic_store_output: {
3702 const elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
3703 const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3704 const unsigned location = nir_intrinsic_base(instr) +
3705 SET_FIELD(store_offset, ELK_NIR_FRAG_OUTPUT_LOCATION);
3706 const elk_fs_reg new_dest = retype(alloc_frag_output(ntb, location),
3707 src.type);
3708
3709 for (unsigned j = 0; j < instr->num_components; j++)
3710 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3711 offset(src, bld, j));
3712
3713 break;
3714 }
3715
3716 case nir_intrinsic_load_output: {
3717 const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3718 ELK_NIR_FRAG_OUTPUT_LOCATION);
3719 assert(l >= FRAG_RESULT_DATA0);
3720 const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3721 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3722 const elk_fs_reg tmp = bld.vgrf(dest.type, 4);
3723
3724 assert(!reinterpret_cast<const elk_wm_prog_key *>(s.key)->coherent_fb_fetch);
3725 emit_non_coherent_fb_read(ntb, bld, tmp, target);
3726
3727 for (unsigned j = 0; j < instr->num_components; j++) {
3728 bld.MOV(offset(dest, bld, j),
3729 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3730 }
3731
3732 break;
3733 }
3734
3735 case nir_intrinsic_demote:
3736 case nir_intrinsic_terminate:
3737 case nir_intrinsic_demote_if:
3738 case nir_intrinsic_terminate_if: {
3739 /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we
3740 * can update just the flag bits that aren't yet discarded. If there's
3741 * no condition, we emit a CMP of g0 != g0, so all currently executing
3742 * channels will get turned off.
3743 */
3744 elk_fs_inst *cmp = NULL;
3745 if (instr->intrinsic == nir_intrinsic_demote_if ||
3746 instr->intrinsic == nir_intrinsic_terminate_if) {
3747 nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
3748
3749 if (alu != NULL &&
3750 alu->op != nir_op_bcsel &&
3751 (devinfo->ver > 5 ||
3752 (alu->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) != ELK_NIR_BOOLEAN_NEEDS_RESOLVE ||
3753 alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
3754 alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
3755 alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
3756 alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
3757 alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
3758 /* Re-emit the instruction that generated the Boolean value, but
3759 * do not store it. Since this instruction will be conditional,
3760 * other instructions that want to use the real Boolean value may
3761 * get garbage. This was a problem for piglit's fs-discard-exit-2
3762 * test.
3763 *
3764 * Ideally we'd detect that the instruction cannot have a
3765 * conditional modifier before emitting the instructions. Alas,
3766 * that is nigh impossible. Instead, we're going to assume the
3767 * instruction (or last instruction) generated can have a
3768              * conditional modifier.  If it cannot, fall back to the old-style
3769 * compare, and hope dead code elimination will clean up the
3770 * extra instructions generated.
3771 */
3772 fs_nir_emit_alu(ntb, alu, false);
3773
3774 cmp = (elk_fs_inst *) s.instructions.get_tail();
3775 if (cmp->conditional_mod == ELK_CONDITIONAL_NONE) {
3776 if (cmp->can_do_cmod())
3777 cmp->conditional_mod = ELK_CONDITIONAL_Z;
3778 else
3779 cmp = NULL;
3780 } else {
3781 /* The old sequence that would have been generated is,
3782 * basically, bool_result == false. This is equivalent to
3783 * !bool_result, so negate the old modifier.
3784 */
3785 cmp->conditional_mod = elk_negate_cmod(cmp->conditional_mod);
3786 }
3787 }
3788
3789 if (cmp == NULL) {
3790 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
3791 elk_imm_d(0), ELK_CONDITIONAL_Z);
3792 }
3793 } else {
3794 elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
3795 ELK_REGISTER_TYPE_UW));
3796 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, ELK_CONDITIONAL_NZ);
3797 }
3798
3799 cmp->predicate = ELK_PREDICATE_NORMAL;
3800 cmp->flag_subreg = sample_mask_flag_subreg(s);
3801
3802 elk_fs_inst *jump = bld.emit(ELK_OPCODE_HALT);
3803 jump->flag_subreg = sample_mask_flag_subreg(s);
3804 jump->predicate_inverse = true;
3805
3806 if (instr->intrinsic == nir_intrinsic_terminate ||
3807 instr->intrinsic == nir_intrinsic_terminate_if) {
3808 jump->predicate = ELK_PREDICATE_NORMAL;
3809 } else {
3810 /* Only jump when the whole quad is demoted. For historical
3811 * reasons this is also used for discard.
3812 */
3813 jump->predicate = ELK_PREDICATE_ALIGN1_ANY4H;
3814 }
3815
3816 if (devinfo->ver < 7)
3817 s.limit_dispatch_width(
3818 16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
3819 break;
3820 }
3821
3822 case nir_intrinsic_load_input:
3823 case nir_intrinsic_load_per_primitive_input: {
3824 /* In Fragment Shaders load_input is used either for flat inputs or
3825 * per-primitive inputs.
3826 */
3827 assert(instr->def.bit_size == 32);
3828 unsigned base = nir_intrinsic_base(instr);
3829 unsigned comp = nir_intrinsic_component(instr);
3830 unsigned num_components = instr->num_components;
3831
3832 /* Special case fields in the VUE header */
3833 if (base == VARYING_SLOT_LAYER)
3834 comp = 1;
3835 else if (base == VARYING_SLOT_VIEWPORT)
3836 comp = 2;
3837
3838 if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
3839 assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
3840 for (unsigned int i = 0; i < num_components; i++) {
3841 bld.MOV(offset(dest, bld, i),
3842 retype(s.per_primitive_reg(bld, base, comp + i), dest.type));
3843 }
3844 } else {
3845 const unsigned k = 3;
3846 for (unsigned int i = 0; i < num_components; i++) {
3847 bld.MOV(offset(dest, bld, i),
3848 retype(s.interp_reg(bld, base, comp + i, k), dest.type));
3849 }
3850 }
3851 break;
3852 }
3853
3854 case nir_intrinsic_load_fs_input_interp_deltas: {
3855 assert(s.stage == MESA_SHADER_FRAGMENT);
3856 assert(nir_src_as_uint(instr->src[0]) == 0);
3857 const unsigned base = nir_intrinsic_base(instr);
3858 const unsigned comp = nir_intrinsic_component(instr);
3859 dest.type = ELK_REGISTER_TYPE_F;
3860
3861 bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
3862 bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
3863 bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
3864
3865 break;
3866 }
3867
3868 case nir_intrinsic_load_barycentric_pixel:
3869 case nir_intrinsic_load_barycentric_centroid:
3870 case nir_intrinsic_load_barycentric_sample: {
3871 /* Use the delta_xy values computed from the payload */
3872 enum elk_barycentric_mode bary = elk_barycentric_mode(instr);
3873 const elk_fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
3874 offset(s.delta_xy[bary], bld, 1) };
3875 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3876 break;
3877 }
3878
3879 case nir_intrinsic_load_barycentric_at_sample: {
3880 const glsl_interp_mode interpolation =
3881 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3882
3883 elk_fs_reg msg_data;
3884 if (nir_src_is_const(instr->src[0])) {
3885 msg_data = elk_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
3886 } else {
3887 const elk_fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
3888 ELK_REGISTER_TYPE_UD);
3889 const elk_fs_reg sample_id = bld.emit_uniformize(sample_src);
3890 msg_data = component(bld.group(8, 0).vgrf(ELK_REGISTER_TYPE_UD), 0);
3891 bld.exec_all().group(1, 0).SHL(msg_data, sample_id, elk_imm_ud(4u));
3892 }
3893
3894 elk_fs_reg flag_reg;
3895 struct elk_wm_prog_key *wm_prog_key = (struct elk_wm_prog_key *) s.key;
3896 if (wm_prog_key->multisample_fbo == ELK_SOMETIMES) {
3897 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3898
3899 check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
3900 wm_prog_data,
3901 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3902 flag_reg = elk_flag_reg(0, 0);
3903 }
3904
3905 emit_pixel_interpolater_send(bld,
3906 ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3907 dest,
3908 elk_fs_reg(), /* src */
3909 msg_data,
3910 flag_reg,
3911 interpolation);
3912 break;
3913 }
3914
3915 case nir_intrinsic_load_barycentric_at_offset: {
3916 const glsl_interp_mode interpolation =
3917 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3918
3919 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3920
3921 if (const_offset) {
3922 assert(nir_src_bit_size(instr->src[0]) == 32);
3923 unsigned off_x = const_offset[0].u32 & 0xf;
3924 unsigned off_y = const_offset[1].u32 & 0xf;
3925
3926 emit_pixel_interpolater_send(bld,
3927 ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3928 dest,
3929 elk_fs_reg(), /* src */
3930 elk_imm_ud(off_x | (off_y << 4)),
3931 elk_fs_reg(), /* flag_reg */
3932 interpolation);
3933 } else {
3934 elk_fs_reg src = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_D);
3935 const enum elk_opcode opcode = ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3936 emit_pixel_interpolater_send(bld,
3937 opcode,
3938 dest,
3939 src,
3940 elk_imm_ud(0u),
3941 elk_fs_reg(), /* flag_reg */
3942 interpolation);
3943 }
3944 break;
3945 }
3946
3947 case nir_intrinsic_load_frag_coord:
3948 emit_fragcoord_interpolation(ntb, dest);
3949 break;
3950
3951 case nir_intrinsic_load_interpolated_input: {
3952 assert(instr->src[0].ssa &&
3953 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3954 nir_intrinsic_instr *bary_intrinsic =
3955 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3956 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3957 enum glsl_interp_mode interp_mode =
3958 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3959 elk_fs_reg dst_xy;
3960
3961 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3962 bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3963 /* Use the result of the PI message. */
3964 dst_xy = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F);
3965 } else {
3966 /* Use the delta_xy values computed from the payload */
3967 enum elk_barycentric_mode bary = elk_barycentric_mode(bary_intrinsic);
3968 dst_xy = s.delta_xy[bary];
3969 }
3970
3971 for (unsigned int i = 0; i < instr->num_components; i++) {
3972 elk_fs_reg interp =
3973 s.interp_reg(bld, nir_intrinsic_base(instr),
3974 nir_intrinsic_component(instr) + i, 0);
3975 interp.type = ELK_REGISTER_TYPE_F;
3976 dest.type = ELK_REGISTER_TYPE_F;
3977
3978 if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3979 elk_fs_reg tmp = s.vgrf(glsl_float_type());
3980 bld.emit(ELK_FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3981 bld.MUL(offset(dest, bld, i), tmp, s.pixel_w);
3982 } else {
3983 bld.emit(ELK_FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3984 }
3985 }
3986 break;
3987 }
3988
3989 default:
3990 fs_nir_emit_intrinsic(ntb, bld, instr);
3991 break;
3992 }
3993 }
3994
3995 static void
3996 fs_nir_emit_cs_intrinsic(nir_to_elk_state &ntb,
3997 nir_intrinsic_instr *instr)
3998 {
3999 const intel_device_info *devinfo = ntb.devinfo;
4000 const fs_builder &bld = ntb.bld;
4001 elk_fs_visitor &s = ntb.s;
4002
4003 assert(gl_shader_stage_uses_workgroup(s.stage));
4004 struct elk_cs_prog_data *cs_prog_data = elk_cs_prog_data(s.prog_data);
4005
4006 elk_fs_reg dest;
4007 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4008 dest = get_nir_def(ntb, instr->def);
4009
4010 switch (instr->intrinsic) {
4011 case nir_intrinsic_barrier:
4012 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4013 fs_nir_emit_intrinsic(ntb, bld, instr);
4014 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4015          /* If the whole workgroup fits in a single HW thread, all the
4016           * invocations are already executed in lock-step, so instead of an
4017           * actual barrier just emit a scheduling fence, which generates no code.
4018 */
4019 if (!s.nir->info.workgroup_size_variable &&
4020 s.workgroup_size() <= s.dispatch_width) {
4021 bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE);
4022 break;
4023 }
4024
4025 emit_barrier(ntb);
4026 cs_prog_data->uses_barrier = true;
4027 }
4028 break;
4029
4030 case nir_intrinsic_load_subgroup_id:
4031 s.cs_payload().load_subgroup_id(bld, dest);
4032 break;
4033
4034 case nir_intrinsic_load_workgroup_id: {
4035 elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4036 assert(val.file != BAD_FILE);
4037 dest.type = val.type;
4038 for (unsigned i = 0; i < 3; i++)
4039 bld.MOV(offset(dest, bld, i), offset(val, bld, i));
4040 break;
4041 }
4042
4043 case nir_intrinsic_load_num_workgroups: {
4044 assert(instr->def.bit_size == 32);
4045
4046 cs_prog_data->uses_num_work_groups = true;
4047
4048 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4049 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(0);
4050 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4051 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(3); /* num components */
4052 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = elk_imm_ud(0);
4053 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4054 elk_fs_inst *inst =
4055 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4056 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4057 inst->size_written = 3 * s.dispatch_width * 4;
4058 break;
4059 }
4060
4061 case nir_intrinsic_shared_atomic:
4062 case nir_intrinsic_shared_atomic_swap:
4063 fs_nir_emit_surface_atomic(ntb, bld, instr, elk_imm_ud(GFX7_BTI_SLM),
4064 false /* bindless */);
4065 break;
4066
4067 case nir_intrinsic_load_shared: {
4068 assert(devinfo->ver >= 7);
4069
4070 const unsigned bit_size = instr->def.bit_size;
4071 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4072 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4073
4074 elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
4075 int base = nir_intrinsic_base(instr);
4076 if (base) {
4077 elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4078 bld.ADD(addr_off, addr, elk_imm_d(base));
4079 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4080 } else {
4081 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4082 }
4083
4084 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4085 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4086
4087 /* Make dest unsigned because that's what the temporary will be */
4088 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4089
4090 /* Read the vector */
4091 assert(bit_size <= 32);
4092 assert(nir_intrinsic_align(instr) > 0);
4093 if (bit_size == 32 &&
4094 nir_intrinsic_align(instr) >= 4) {
4095 assert(instr->def.num_components <= 4);
4096 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4097 elk_fs_inst *inst =
4098 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4099 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4100 inst->size_written = instr->num_components * s.dispatch_width * 4;
4101 } else {
4102 assert(instr->def.num_components == 1);
4103 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4104
4105 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
4106 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4107 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4108 bld.MOV(dest, subscript(read_result, dest.type, 0));
4109 }
4110 break;
4111 }
4112
4113 case nir_intrinsic_store_shared: {
4114 assert(devinfo->ver >= 7);
4115
4116 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4117 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4118 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4119
4120 elk_fs_reg addr = get_nir_src(ntb, instr->src[1]);
4121 int base = nir_intrinsic_base(instr);
4122 if (base) {
4123 elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4124 bld.ADD(addr_off, addr, elk_imm_d(base));
4125 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4126 } else {
4127 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4128 }
4129
4130 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4131       /* No point in masking with the sample mask; here we're handling compute
4132 * intrinsics.
4133 */
4134 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4135
4136 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
4137 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4138
4139 assert(bit_size <= 32);
4140 assert(nir_intrinsic_write_mask(instr) ==
4141 (1u << instr->num_components) - 1);
4142 assert(nir_intrinsic_align(instr) > 0);
4143 if (bit_size == 32 &&
4144 nir_intrinsic_align(instr) >= 4) {
4145 assert(nir_src_num_components(instr->src[0]) <= 4);
4146 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4147 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4148 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4149 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4150 } else {
4151 assert(nir_src_num_components(instr->src[0]) == 1);
4152 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4153
4154 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
4155 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4156
4157 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4158 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4159 }
4160 break;
4161 }
4162
4163 case nir_intrinsic_load_workgroup_size: {
4164 /* Should have been lowered by elk_nir_lower_cs_intrinsics() or
4165 * crocus/iris_setup_uniforms() for the variable group size case.
4166 */
4167 unreachable("Should have been lowered");
4168 break;
4169 }
4170
4171 default:
4172 fs_nir_emit_intrinsic(ntb, bld, instr);
4173 break;
4174 }
4175 }
4176
4177 static elk_fs_reg
4178 elk_nir_reduction_op_identity(const fs_builder &bld,
4179 nir_op op, elk_reg_type type)
4180 {
4181 nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
4182 switch (type_sz(type)) {
4183 case 1:
4184 if (type == ELK_REGISTER_TYPE_UB) {
4185 return elk_imm_uw(value.u8);
4186 } else {
4187 assert(type == ELK_REGISTER_TYPE_B);
4188 return elk_imm_w(value.i8);
4189 }
4190 case 2:
4191 return retype(elk_imm_uw(value.u16), type);
4192 case 4:
4193 return retype(elk_imm_ud(value.u32), type);
4194 case 8:
4195 if (type == ELK_REGISTER_TYPE_DF)
4196 return elk_setup_imm_df(bld, value.f64);
4197 else
4198 return retype(elk_imm_u64(value.u64), type);
4199 default:
4200 unreachable("Invalid type size");
4201 }
4202 }
4203
4204 static elk_opcode
4205 elk_op_for_nir_reduction_op(nir_op op)
4206 {
4207 switch (op) {
4208 case nir_op_iadd: return ELK_OPCODE_ADD;
4209 case nir_op_fadd: return ELK_OPCODE_ADD;
4210 case nir_op_imul: return ELK_OPCODE_MUL;
4211 case nir_op_fmul: return ELK_OPCODE_MUL;
4212 case nir_op_imin: return ELK_OPCODE_SEL;
4213 case nir_op_umin: return ELK_OPCODE_SEL;
4214 case nir_op_fmin: return ELK_OPCODE_SEL;
4215 case nir_op_imax: return ELK_OPCODE_SEL;
4216 case nir_op_umax: return ELK_OPCODE_SEL;
4217 case nir_op_fmax: return ELK_OPCODE_SEL;
4218 case nir_op_iand: return ELK_OPCODE_AND;
4219 case nir_op_ior: return ELK_OPCODE_OR;
4220 case nir_op_ixor: return ELK_OPCODE_XOR;
4221 default:
4222 unreachable("Invalid reduction operation");
4223 }
4224 }
4225
4226 static elk_conditional_mod
4227 elk_cond_mod_for_nir_reduction_op(nir_op op)
4228 {
4229 switch (op) {
4230 case nir_op_iadd: return ELK_CONDITIONAL_NONE;
4231 case nir_op_fadd: return ELK_CONDITIONAL_NONE;
4232 case nir_op_imul: return ELK_CONDITIONAL_NONE;
4233 case nir_op_fmul: return ELK_CONDITIONAL_NONE;
4234 case nir_op_imin: return ELK_CONDITIONAL_L;
4235 case nir_op_umin: return ELK_CONDITIONAL_L;
4236 case nir_op_fmin: return ELK_CONDITIONAL_L;
4237 case nir_op_imax: return ELK_CONDITIONAL_GE;
4238 case nir_op_umax: return ELK_CONDITIONAL_GE;
4239 case nir_op_fmax: return ELK_CONDITIONAL_GE;
4240 case nir_op_iand: return ELK_CONDITIONAL_NONE;
4241 case nir_op_ior: return ELK_CONDITIONAL_NONE;
4242 case nir_op_ixor: return ELK_CONDITIONAL_NONE;
4243 default:
4244 unreachable("Invalid reduction operation");
4245 }
4246 }
4247
4248 struct rebuild_resource {
4249 unsigned idx;
4250 std::vector<nir_def *> array;
4251 };
4252
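/* nir_foreach_src() callback that collects the SSA defs feeding a resource
 * computation into res->array, depth first, so that every def is appended
 * after the defs it depends on; defs already present in the array are skipped.
 */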
4253 static bool
4254 add_rebuild_src(nir_src *src, void *state)
4255 {
4256 struct rebuild_resource *res = (struct rebuild_resource *) state;
4257
4258 for (nir_def *def : res->array) {
4259 if (def == src->ssa)
4260 return true;
4261 }
4262
4263 nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state);
4264 res->array.push_back(src->ssa);
4265 return true;
4266 }
4267
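/* Try to re-emit, as uniform SIMD8 instructions, the chain of load_const,
 * load_uniform and simple ALU instructions feeding a resource index, so that
 * the index can later be used as a uniform value. Returns a BAD_FILE register
 * when some instruction in the chain is not handled.
 */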
4268 static elk_fs_reg
4269 try_rebuild_resource(nir_to_elk_state &ntb, const elk::fs_builder &bld, nir_def *resource_def)
4270 {
4271    /* Create a builder at the location of the resource_intel intrinsic */
4272 fs_builder ubld8 = bld.exec_all().group(8, 0);
4273
4274 struct rebuild_resource resources = {};
4275 resources.idx = 0;
4276
4277 if (!nir_foreach_src(resource_def->parent_instr,
4278 add_rebuild_src, &resources))
4279 return elk_fs_reg();
4280 resources.array.push_back(resource_def);
4281
4282 if (resources.array.size() == 1) {
4283 nir_def *def = resources.array[0];
4284
4285 if (def->parent_instr->type == nir_instr_type_load_const) {
4286 nir_load_const_instr *load_const =
4287 nir_instr_as_load_const(def->parent_instr);
4288 return elk_imm_ud(load_const->value[0].i32);
4289 } else {
4290 assert(def->parent_instr->type == nir_instr_type_intrinsic &&
4291 (nir_instr_as_intrinsic(def->parent_instr)->intrinsic ==
4292 nir_intrinsic_load_uniform));
4293 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
4294 unsigned base_offset = nir_intrinsic_base(intrin);
4295 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4296 elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4297 src.offset = load_offset + base_offset % 4;
4298 return src;
4299 }
4300 }
4301
4302 for (unsigned i = 0; i < resources.array.size(); i++) {
4303 nir_def *def = resources.array[i];
4304
4305 nir_instr *instr = def->parent_instr;
4306 switch (instr->type) {
4307 case nir_instr_type_load_const: {
4308 nir_load_const_instr *load_const =
4309 nir_instr_as_load_const(instr);
4310 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4311 ntb.resource_insts[def->index] =
4312 ubld8.MOV(dst, elk_imm_ud(load_const->value[0].i32));
4313 break;
4314 }
4315
4316 case nir_instr_type_alu: {
4317 nir_alu_instr *alu = nir_instr_as_alu(instr);
4318
4319 if (nir_op_infos[alu->op].num_inputs == 2) {
4320 if (alu->src[0].swizzle[0] != 0 ||
4321 alu->src[1].swizzle[0] != 0)
4322 break;
4323 } else if (nir_op_infos[alu->op].num_inputs == 3) {
4324 if (alu->src[0].swizzle[0] != 0 ||
4325 alu->src[1].swizzle[0] != 0 ||
4326 alu->src[2].swizzle[0] != 0)
4327 break;
4328 } else {
4329 /* Not supported ALU input count */
4330 break;
4331 }
4332
4333 switch (alu->op) {
4334 case nir_op_iadd: {
4335 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4336 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4337 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4338 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4339 assert(src0.type == ELK_REGISTER_TYPE_UD);
4340 ntb.resource_insts[def->index] =
4341 ubld8.ADD(dst,
4342 src0.file != IMM ? src0 : src1,
4343 src0.file != IMM ? src1 : src0);
4344 break;
4345 }
4346 case nir_op_ushr: {
4347 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4348 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4349 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4350 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4351 assert(src0.type == ELK_REGISTER_TYPE_UD);
4352 ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
4353 break;
4354 }
4355 case nir_op_ishl: {
4356 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4357 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4358 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4359 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4360 assert(src0.type == ELK_REGISTER_TYPE_UD);
4361 ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
4362 break;
4363 }
4364 case nir_op_mov: {
4365 break;
4366 }
4367 default:
4368 break;
4369 }
4370 break;
4371 }
4372
4373 case nir_instr_type_intrinsic: {
4374 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4375 switch (intrin->intrinsic) {
4376 case nir_intrinsic_resource_intel:
4377 ntb.resource_insts[def->index] =
4378 ntb.resource_insts[intrin->src[1].ssa->index];
4379 break;
4380
4381 case nir_intrinsic_load_uniform: {
4382 if (!nir_src_is_const(intrin->src[0]))
4383 break;
4384
4385 unsigned base_offset = nir_intrinsic_base(intrin);
4386 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4387 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4388 elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4389 src.offset = load_offset + base_offset % 4;
4390 ntb.resource_insts[def->index] = ubld8.MOV(dst, src);
4391 break;
4392 }
4393
4394 default:
4395 break;
4396 }
4397 break;
4398 }
4399
4400 default:
4401 break;
4402 }
4403
4404 if (ntb.resource_insts[def->index] == NULL)
4405 return elk_fs_reg();
4406 }
4407
4408 assert(ntb.resource_insts[resource_def->index] != NULL);
4409 return component(ntb.resource_insts[resource_def->index]->dst, 0);
4410 }
4411
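/* Return the image/surface source of an image intrinsic as a uniform value,
 * using the resource register rebuilt for a resource_intel source when one is
 * available and falling back to emit_uniformize() otherwise.
 */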
4412 static elk_fs_reg
4413 get_nir_image_intrinsic_image(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4414 nir_intrinsic_instr *instr)
4415 {
4416 if (is_resource_src(instr->src[0])) {
4417 elk_fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]);
4418 if (surf_index.file != BAD_FILE)
4419 return surf_index;
4420 }
4421
4422 elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD);
4423 elk_fs_reg surf_index = image;
4424
4425 return bld.emit_uniformize(surf_index);
4426 }
4427
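/* Return the buffer index source of a buffer intrinsic as a uniform value:
 * an immediate for constant sources, the rebuilt resource register when one
 * is available, or an emit_uniformize() of the source otherwise.
 */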
4428 static elk_fs_reg
4429 get_nir_buffer_intrinsic_index(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4430 nir_intrinsic_instr *instr)
4431 {
4432 /* SSBO stores are weird in that their index is in src[1] */
4433 const bool is_store =
4434 instr->intrinsic == nir_intrinsic_store_ssbo ||
4435 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4436 nir_src src = is_store ? instr->src[1] : instr->src[0];
4437
4438 if (nir_src_is_const(src)) {
4439 return elk_imm_ud(nir_src_as_uint(src));
4440 } else if (is_resource_src(src)) {
4441 elk_fs_reg surf_index = get_resource_nir_src(ntb, src);
4442 if (surf_index.file != BAD_FILE)
4443 return surf_index;
4444 }
4445 return bld.emit_uniformize(get_nir_src(ntb, src));
4446 }
4447
4448 /**
4449  * The offsets we get from NIR act as if each SIMD channel has its own blob
4450  * of contiguous space.  However, if we actually place each SIMD channel in
4451  * its own space, we end up with terrible cache performance because each SIMD
4452 * channel accesses a different cache line even when they're all accessing the
4453 * same byte offset. To deal with this problem, we swizzle the address using
4454 * a simple algorithm which ensures that any time a SIMD message reads or
4455 * writes the same address, it's all in the same cache line. We have to keep
4456 * the bottom two bits fixed so that we can read/write up to a dword at a time
4457 * and the individual element is contiguous. We do this by splitting the
4458 * address as follows:
4459 *
4460 * 31 4-6 2 0
4461 * +-------------------------------+------------+----------+
4462 * | Hi address bits | chan index | addr low |
4463 * +-------------------------------+------------+----------+
4464 *
4465 * In other words, the bottom two address bits stay, and the top 30 get
4466 * shifted up so that we can stick the SIMD channel index in the middle. This
4467  * way, we can access 8, 16, or 32-bit elements and, when a SIMD message
4468  * accesses 32-bit elements at the same logical offset, the scratch read/write
4469  * instruction acts on contiguous data and we get good cache locality.
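 *
 * As a hypothetical worked example for the byte-addressed path in a SIMD16
 * shader (chan_index_bits == 4): byte address 0x24 in channel 5 becomes
 * ((0x24 & ~3) << 4) | (5 << 2) | (0x24 & 3) = 0x254, so the 16 channels
 * reading offset 0x24 all land in a single 64-byte cache line (0x240..0x27f).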
4470 */
4471 static elk_fs_reg
4472 swizzle_nir_scratch_addr(nir_to_elk_state &ntb,
4473 const elk::fs_builder &bld,
4474 const elk_fs_reg &nir_addr,
4475 bool in_dwords)
4476 {
4477 elk_fs_visitor &s = ntb.s;
4478
4479 const elk_fs_reg &chan_index =
4480 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4481 const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4482
4483 elk_fs_reg addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4484 if (in_dwords) {
4485 /* In this case, we know the address is aligned to a DWORD and we want
4486 * the final address in DWORDs.
4487 */
4488 bld.SHL(addr, nir_addr, elk_imm_ud(chan_index_bits - 2));
4489 bld.OR(addr, addr, chan_index);
4490 } else {
4491       /* This case is substantially more annoying because we have to pay
4492 * attention to those pesky two bottom bits.
4493 */
4494 elk_fs_reg addr_hi = bld.vgrf(ELK_REGISTER_TYPE_UD);
4495 bld.AND(addr_hi, nir_addr, elk_imm_ud(~0x3u));
4496 bld.SHL(addr_hi, addr_hi, elk_imm_ud(chan_index_bits));
4497 elk_fs_reg chan_addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4498 bld.SHL(chan_addr, chan_index, elk_imm_ud(2));
4499 bld.AND(addr, nir_addr, elk_imm_ud(0x3u));
4500 bld.OR(addr, addr, addr_hi);
4501 bld.OR(addr, addr, chan_addr);
4502 }
4503 return addr;
4504 }
4505
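/* Pick the largest oword block message size, in dwords, that does not exceed
 * the number of dwords to be transferred (64-dword blocks only with LSC).
 */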
4506 static unsigned
4507 choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
4508 unsigned dwords)
4509 {
4510 unsigned block;
4511 if (devinfo->has_lsc && dwords >= 64) {
4512 block = 64;
4513 } else if (dwords >= 32) {
4514 block = 32;
4515 } else if (dwords >= 16) {
4516 block = 16;
4517 } else {
4518 block = 8;
4519 }
4520 assert(block <= dwords);
4521 return block;
4522 }
4523
4524 static void
4525 increment_a64_address(const fs_builder &bld, elk_fs_reg address, uint32_t v)
4526 {
4527 if (bld.shader->devinfo->has_64bit_int) {
4528 bld.ADD(address, address, elk_imm_ud(v));
4529 } else {
4530 elk_fs_reg low = retype(address, ELK_REGISTER_TYPE_UD);
4531 elk_fs_reg high = offset(low, bld, 1);
4532
4533 /* Add low and if that overflows, add carry to high. */
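      /* The first ADD sets the flag register through the .o (overflow)
       * conditional modifier and the second ADD is predicated on that flag, so
       * the +1 carry only reaches the high dword when the low dword wrapped.
       */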
4534 bld.ADD(low, low, elk_imm_ud(v))->conditional_mod = ELK_CONDITIONAL_O;
4535 bld.ADD(high, high, elk_imm_ud(0x1))->predicate = ELK_PREDICATE_NORMAL;
4536 }
4537 }
4538
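/* Emit a memory fence or interlock message to the shared function selected by
 * sfid/desc and return the register receiving the commit write-back, so the
 * caller can stall on it if needed.
 */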
4539 static elk_fs_reg
4540 emit_fence(const fs_builder &bld, enum elk_opcode opcode,
4541 uint8_t sfid, uint32_t desc,
4542 bool commit_enable, uint8_t bti)
4543 {
4544 assert(opcode == ELK_SHADER_OPCODE_INTERLOCK ||
4545 opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
4546
4547 elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
4548 elk_fs_inst *fence = bld.emit(opcode, dst, elk_vec8_grf(0, 0),
4549 elk_imm_ud(commit_enable),
4550 elk_imm_ud(bti));
4551 fence->sfid = sfid;
4552 fence->desc = desc;
4553
4554 return dst;
4555 }
4556
4557 /**
4558 * Create a MOV to read the timestamp register.
4559 */
4560 static elk_fs_reg
4561 get_timestamp(const fs_builder &bld)
4562 {
4563 elk_fs_visitor &s = *bld.shader;
4564 const intel_device_info *devinfo = s.devinfo;
4565
4566 assert(devinfo->ver >= 7);
4567
4568 elk_fs_reg ts = elk_fs_reg(retype(elk_vec4_reg(ELK_ARCHITECTURE_REGISTER_FILE,
4569 ELK_ARF_TIMESTAMP,
4570 0),
4571 ELK_REGISTER_TYPE_UD));
4572
4573 elk_fs_reg dst = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
4574
4575 /* We want to read the 3 fields we care about even if it's not enabled in
4576 * the dispatch.
4577 */
4578 bld.group(4, 0).exec_all().MOV(dst, ts);
4579
4580 return dst;
4581 }
4582
4583 static void
4584 fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
4585 const fs_builder &bld, nir_intrinsic_instr *instr)
4586 {
4587 const intel_device_info *devinfo = ntb.devinfo;
4588 elk_fs_visitor &s = ntb.s;
4589
4590 /* We handle this as a special case */
4591 if (instr->intrinsic == nir_intrinsic_decl_reg) {
4592 assert(nir_intrinsic_num_array_elems(instr) == 0);
4593 unsigned bit_size = nir_intrinsic_bit_size(instr);
4594 unsigned num_components = nir_intrinsic_num_components(instr);
4595 const elk_reg_type reg_type =
4596 elk_reg_type_from_bit_size(bit_size, bit_size == 8 ?
4597 ELK_REGISTER_TYPE_D :
4598 ELK_REGISTER_TYPE_F);
4599
4600 /* Re-use the destination's slot in the table for the register */
4601 ntb.ssa_values[instr->def.index] =
4602 bld.vgrf(reg_type, num_components);
4603 return;
4604 }
4605
4606 elk_fs_reg dest;
4607 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4608 dest = get_nir_def(ntb, instr->def);
4609
4610 switch (instr->intrinsic) {
4611 case nir_intrinsic_resource_intel:
4612 ntb.ssa_bind_infos[instr->def.index].valid = true;
4613 ntb.ssa_bind_infos[instr->def.index].bindless =
4614 (nir_intrinsic_resource_access_intel(instr) &
4615 nir_resource_intel_bindless) != 0;
4616 ntb.ssa_bind_infos[instr->def.index].block =
4617 nir_intrinsic_resource_block_intel(instr);
4618 ntb.ssa_bind_infos[instr->def.index].set =
4619 nir_intrinsic_desc_set(instr);
4620 ntb.ssa_bind_infos[instr->def.index].binding =
4621 nir_intrinsic_binding(instr);
4622
4623 if (nir_intrinsic_resource_access_intel(instr) &
4624 nir_resource_intel_non_uniform) {
4625 ntb.resource_values[instr->def.index] = elk_fs_reg();
4626 } else {
4627 ntb.resource_values[instr->def.index] =
4628 try_rebuild_resource(ntb, bld, instr->src[1].ssa);
4629 }
4630 ntb.ssa_values[instr->def.index] =
4631 ntb.ssa_values[instr->src[1].ssa->index];
4632 break;
4633
4634 case nir_intrinsic_load_reg:
4635 case nir_intrinsic_store_reg:
4636 /* Nothing to do with these. */
4637 break;
4638
4639 case nir_intrinsic_image_load:
4640 case nir_intrinsic_image_store:
4641 case nir_intrinsic_image_atomic:
4642 case nir_intrinsic_image_atomic_swap:
4643 case nir_intrinsic_bindless_image_load:
4644 case nir_intrinsic_bindless_image_store:
4645 case nir_intrinsic_bindless_image_atomic:
4646 case nir_intrinsic_bindless_image_atomic_swap: {
4647 /* Get some metadata from the image intrinsic. */
4648 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
4649
4650 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4651
4652 switch (instr->intrinsic) {
4653 case nir_intrinsic_image_load:
4654 case nir_intrinsic_image_store:
4655 case nir_intrinsic_image_atomic:
4656 case nir_intrinsic_image_atomic_swap:
4657 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4658 get_nir_image_intrinsic_image(ntb, bld, instr);
4659 break;
4660
4661 default:
4662 /* Bindless */
4663 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4664 get_nir_image_intrinsic_image(ntb, bld, instr);
4665 break;
4666 }
4667
4668 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4669 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4670 elk_imm_ud(nir_image_intrinsic_coord_components(instr));
4671
4672 /* Emit an image load, store or atomic op. */
4673 if (instr->intrinsic == nir_intrinsic_image_load ||
4674 instr->intrinsic == nir_intrinsic_bindless_image_load) {
4675 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4676 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4677 elk_fs_inst *inst =
4678 bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4679 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4680 inst->size_written = instr->num_components * s.dispatch_width * 4;
4681 } else if (instr->intrinsic == nir_intrinsic_image_store ||
4682 instr->intrinsic == nir_intrinsic_bindless_image_store) {
4683 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4684 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]);
4685 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4686 bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4687 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4688 } else {
4689 unsigned num_srcs = info->num_srcs;
4690 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
4691 if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) {
4692 assert(num_srcs == 4);
4693 num_srcs = 3;
4694 }
4695
4696 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
4697
4698 elk_fs_reg data;
4699 if (num_srcs >= 4)
4700 data = get_nir_src(ntb, instr->src[3]);
4701 if (num_srcs >= 5) {
4702 elk_fs_reg tmp = bld.vgrf(data.type, 2);
4703 elk_fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) };
4704 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4705 data = tmp;
4706 }
4707 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4708 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4709
4710 bld.emit(ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4711 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4712 }
4713 break;
4714 }
4715
4716 case nir_intrinsic_image_size:
4717 case nir_intrinsic_bindless_image_size: {
4718 /* Cube image sizes should have previously been lowered to a 2D array */
4719 assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
4720
4721 /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4722        * into will handle the binding table index for us in the generator.
4723 * Incidentally, this means that we can handle bindless with exactly the
4724 * same code.
4725 */
4726 elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]),
4727 ELK_REGISTER_TYPE_UD);
4728 image = bld.emit_uniformize(image);
4729
4730 assert(nir_src_as_uint(instr->src[1]) == 0);
4731
4732 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4733 if (instr->intrinsic == nir_intrinsic_image_size)
4734 srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4735 else
4736 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4737 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_d(0);
4738 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(0);
4739 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
4740 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
4741
4742 /* Since the image size is always uniform, we can just emit a SIMD8
4743 * query instruction and splat the result out.
4744 */
4745 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
4746
4747 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
4748 elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4749 tmp, srcs, ARRAY_SIZE(srcs));
4750 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
4751
4752 for (unsigned c = 0; c < instr->def.num_components; ++c) {
4753 bld.MOV(offset(retype(dest, tmp.type), bld, c),
4754 component(offset(tmp, ubld, c), 0));
4755 }
4756 break;
4757 }
4758
4759 case nir_intrinsic_image_load_raw_intel: {
4760 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4761 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4762 get_nir_image_intrinsic_image(ntb, bld, instr);
4763 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4764 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4765 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4766 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4767
4768 elk_fs_inst *inst =
4769 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4770 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4771 inst->size_written = instr->num_components * s.dispatch_width * 4;
4772 break;
4773 }
4774
4775 case nir_intrinsic_image_store_raw_intel: {
4776 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4777 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4778 get_nir_image_intrinsic_image(ntb, bld, instr);
4779 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4780 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]);
4781 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4782 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4783 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4784
4785 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4786 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4787 break;
4788 }
4789
4790 case nir_intrinsic_barrier:
4791 case nir_intrinsic_begin_invocation_interlock:
4792 case nir_intrinsic_end_invocation_interlock: {
4793 bool ugm_fence, slm_fence, tgm_fence, urb_fence;
4794 enum elk_opcode opcode = ELK_OPCODE_NOP;
4795
4796 /* Handling interlock intrinsics here will allow the logic for IVB
4797 * render cache (see below) to be reused.
4798 */
4799
4800 switch (instr->intrinsic) {
4801 case nir_intrinsic_barrier: {
4802 /* Note we only care about the memory part of the
4803 * barrier. The execution part will be taken care
4804 * of by the stage specific intrinsic handler functions.
4805 */
4806 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
4807 ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
4808 slm_fence = modes & nir_var_mem_shared;
4809 tgm_fence = modes & nir_var_image;
4810 urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
4811 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4812 opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4813 break;
4814 }
4815
4816 case nir_intrinsic_begin_invocation_interlock:
4817 /* For beginInvocationInterlockARB(), we will generate a memory fence
4818 * but with a different opcode so that generator can pick SENDC
4819 * instead of SEND.
4820 */
4821 assert(s.stage == MESA_SHADER_FRAGMENT);
4822 ugm_fence = tgm_fence = true;
4823 slm_fence = urb_fence = false;
4824 opcode = ELK_SHADER_OPCODE_INTERLOCK;
4825 break;
4826
4827 case nir_intrinsic_end_invocation_interlock:
4828 /* For endInvocationInterlockARB(), we need to insert a memory fence which
4829 * stalls in the shader until the memory transactions prior to that
4830 * fence are complete. This ensures that the shader does not end before
4831 * any writes from its critical section have landed. Otherwise, you can
4832 * end up with a case where the next invocation on that pixel properly
4833 * stalls for previous FS invocation on its pixel to complete but
4834 * doesn't actually wait for the dataport memory transactions from that
4835 * thread to land before submitting its own.
4836 */
4837 assert(s.stage == MESA_SHADER_FRAGMENT);
4838 ugm_fence = tgm_fence = true;
4839 slm_fence = urb_fence = false;
4840 opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4841 break;
4842
4843 default:
4844 unreachable("invalid intrinsic");
4845 }
4846
4847 if (opcode == ELK_OPCODE_NOP)
4848 break;
4849
4850 if (s.nir->info.shared_size > 0) {
4851 assert(gl_shader_stage_uses_workgroup(s.stage));
4852 } else {
4853 slm_fence = false;
4854 }
4855
4856 /* If the workgroup fits in a single HW thread, the messages for SLM are
4857 * processed in-order and the shader itself is already synchronized so
4858 * the memory fence is not necessary.
4859 *
4860 * TODO: Check whether this also applies when many HW threads share the same Data Port.
4861 */
4862 if (!s.nir->info.workgroup_size_variable &&
4863 slm_fence && s.workgroup_size() <= s.dispatch_width)
4864 slm_fence = false;
4865
4866 switch (s.stage) {
4867 case MESA_SHADER_TESS_CTRL:
4868 break;
4869 default:
4870 urb_fence = false;
4871 break;
4872 }
4873
4874 unsigned fence_regs_count = 0;
4875 elk_fs_reg fence_regs[4] = {};
4876
4877 const fs_builder ubld = bld.group(8, 0);
4878
4879 /* Prior to Icelake, all data port accesses are lumped into a single
4880 * cache, except on Ivy Bridge and Bay Trail where typed messages
4881 * actually go through the render cache. There, we need both fences
4882 * because we may access storage images as either typed or untyped.
4883 */
4884 const bool render_fence = tgm_fence && devinfo->verx10 == 70;
4885
4886 const bool commit_enable = render_fence ||
4887 instr->intrinsic == nir_intrinsic_end_invocation_interlock;
4888
4889 if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
4890 fence_regs[fence_regs_count++] =
4891 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
4892 commit_enable, 0 /* BTI */);
4893 }
4894
4895 if (render_fence) {
4896 fence_regs[fence_regs_count++] =
4897 emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
4898 commit_enable, /* bti */ 0);
4899 }
4900
4901 assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
4902
4903 /* There are three cases where we want to insert a stall:
4904 *
4905 * 1. If we're a nir_intrinsic_end_invocation_interlock. This is
4906 * required to ensure that the shader EOT doesn't happen until
4907 * after the fence returns. Otherwise, we might end up with the
4908 * next shader invocation for that pixel not respecting our fence
4909 * because it may happen on a different HW thread.
4910 *
4911 * 2. If we have multiple fences. This is required to ensure that
4912 * they all complete and nothing gets weirdly out-of-order.
4913 *
4914 * 3. If we have no fences. In this case, we need at least a
4915 * scheduling barrier to keep the compiler from moving things
4916 * around in an invalid way.
4917 */
4918 if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
4919 fence_regs_count != 1) {
4920 ubld.exec_all().group(1, 0).emit(
4921 ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
4922 fence_regs, fence_regs_count);
4923 }
4924
4925 break;
4926 }
4927
4928 case nir_intrinsic_shader_clock: {
4929 /* We cannot do anything if there is an event, so ignore it for now */
4930 const elk_fs_reg shader_clock = get_timestamp(bld);
4931 const elk_fs_reg srcs[] = { component(shader_clock, 0),
4932 component(shader_clock, 1) };
4933 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4934 break;
4935 }
4936
4937 case nir_intrinsic_load_reloc_const_intel: {
4938 uint32_t id = nir_intrinsic_param_idx(instr);
4939
4940 /* Emit the reloc in the smallest SIMD size to limit register usage. */
4941 const fs_builder ubld = bld.exec_all().group(1, 0);
4942 elk_fs_reg small_dest = ubld.vgrf(dest.type);
4943 ubld.UNDEF(small_dest);
4944 ubld.exec_all().group(1, 0).emit(ELK_SHADER_OPCODE_MOV_RELOC_IMM,
4945 small_dest, elk_imm_ud(id));
4946
4947 /* Copy propagation will get rid of this MOV. */
4948 bld.MOV(dest, component(small_dest, 0));
4949 break;
4950 }
4951
4952 case nir_intrinsic_load_uniform: {
4953 /* Offsets are in bytes, but they should always be aligned to
4954 * the type size.
4955 */
4956 unsigned base_offset = nir_intrinsic_base(instr);
4957 assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0);
4958
4959 elk_fs_reg src(UNIFORM, base_offset / 4, dest.type);
4960
4961 if (nir_src_is_const(instr->src[0])) {
4962 unsigned load_offset = nir_src_as_uint(instr->src[0]);
4963 assert(load_offset % type_sz(dest.type) == 0);
4964 /* The base offset can only handle 32-bit units, so for 16-bit
4965 * data take the modulo of the offset with 4 bytes and add it to
4966 * the offset to read from within the source register.
4967 */
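/* For illustration (hypothetical values): with base_offset == 6 and 16-bit
 * data, src starts at UNIFORM slot 1 (6 / 4) and src.offset becomes
 * load_offset + 2 (6 % 4).
 */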
4968 src.offset = load_offset + base_offset % 4;
4969
4970 for (unsigned j = 0; j < instr->num_components; j++) {
4971 bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4972 }
4973 } else {
4974 elk_fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
4975 ELK_REGISTER_TYPE_UD);
4976
4977 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4978 * go past the end of the uniform. In order to keep the n'th
4979 * component from running past, we subtract off the size of all but
4980 * one component of the vector.
4981 */
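/* Worked example (hypothetical values): a 32-bit vec4 with a range of 16
 * bytes gives read_size = 16 - 3 * 4 = 4, so the indirect read for the
 * last component cannot run past the end of the uniform.
 */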
4982 assert(nir_intrinsic_range(instr) >=
4983 instr->num_components * type_sz(dest.type));
4984 unsigned read_size = nir_intrinsic_range(instr) -
4985 (instr->num_components - 1) * type_sz(dest.type);
4986
4987 bool supports_64bit_indirects = devinfo->platform != INTEL_PLATFORM_CHV;
4988
4989 if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4990 for (unsigned j = 0; j < instr->num_components; j++) {
4991 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
4992 offset(dest, bld, j), offset(src, bld, j),
4993 indirect, elk_imm_ud(read_size));
4994 }
4995 } else {
4996 const unsigned num_mov_indirects =
4997 type_sz(dest.type) / type_sz(ELK_REGISTER_TYPE_UD);
4998 /* We read a little less per MOV_INDIRECT, as each one is now
4999 * 32-bit instead of 64-bit. Adjust read_size accordingly.
5000 */
5001 const unsigned read_size_32bit = read_size -
5002 (num_mov_indirects - 1) * type_sz(ELK_REGISTER_TYPE_UD);
5003 for (unsigned j = 0; j < instr->num_components; j++) {
5004 for (unsigned i = 0; i < num_mov_indirects; i++) {
5005 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5006 subscript(offset(dest, bld, j), ELK_REGISTER_TYPE_UD, i),
5007 subscript(offset(src, bld, j), ELK_REGISTER_TYPE_UD, i),
5008 indirect, elk_imm_ud(read_size_32bit));
5009 }
5010 }
5011 }
5012 }
5013 break;
5014 }
5015
5016 case nir_intrinsic_load_ubo:
5017 case nir_intrinsic_load_ubo_uniform_block_intel: {
5018 elk_fs_reg surface, surface_handle;
5019
5020 if (get_nir_src_bindless(ntb, instr->src[0]))
5021 surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5022 else
5023 surface = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5024
5025 if (!nir_src_is_const(instr->src[1])) {
5026 if (instr->intrinsic == nir_intrinsic_load_ubo) {
5027 /* load_ubo with non-uniform offset */
5028 elk_fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
5029 ELK_REGISTER_TYPE_UD);
5030
5031 const unsigned comps_per_load = type_sz(dest.type) == 8 ? 2 : 4;
5032
5033 for (int i = 0; i < instr->num_components; i += comps_per_load) {
5034 const unsigned remaining = instr->num_components - i;
5035 s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
5036 surface, surface_handle,
5037 base_offset,
5038 i * type_sz(dest.type),
5039 instr->def.bit_size / 8,
5040 MIN2(remaining, comps_per_load));
5041 }
5042
5043 s.prog_data->has_ubo_pull = true;
5044 } else {
5045 /* load_ubo with uniform offset */
5046 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5047 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5048 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5049
5050 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5051
5052 srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface;
5053 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
5054
5055 const nir_src load_offset = instr->src[1];
5056 if (nir_src_is_const(load_offset)) {
5057 elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5058 ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5059 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5060 } else {
5061 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5062 bld.emit_uniformize(get_nir_src(ntb, load_offset));
5063 }
5064
5065 const unsigned total_dwords =
5066 ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
5067 unsigned loaded_dwords = 0;
5068
5069 const elk_fs_reg packed_consts =
5070 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5071
5072 while (loaded_dwords < total_dwords) {
5073 const unsigned block =
5074 choose_oword_block_size_dwords(devinfo,
5075 total_dwords - loaded_dwords);
5076 const unsigned block_bytes = block * 4;
5077
5078 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5079
5080 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5081 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5082 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5083 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5084 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5085
5086 loaded_dwords += block;
5087
5088 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5089 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5090 elk_imm_ud(block_bytes));
5091 }
5092
5093 for (unsigned c = 0; c < instr->num_components; c++) {
5094 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5095 component(packed_consts, c));
5096 }
5097
5098 s.prog_data->has_ubo_pull = true;
5099 }
5100 } else {
5101 /* Even if we are loading doubles, a pull constant load will load
5102 * a 32-bit vec4, so we should only reserve vgrf space for that. If we
5103 * need to load a full dvec4 we will have to emit 2 loads. This is
5104 * similar to demote_pull_constants(), except that in that case we
5105 * see individual accesses to each component of the vector and then
5106 * we let CSE deal with duplicate loads. Here we see a vector access
5107 * and we have to split it if necessary.
5108 */
5109 const unsigned type_size = type_sz(dest.type);
5110 const unsigned load_offset = nir_src_as_uint(instr->src[1]);
5111 const unsigned ubo_block =
5112 elk_nir_ubo_surface_index_get_push_block(instr->src[0]);
5113 const unsigned offset_256b = load_offset / 32;
5114 const unsigned end_256b =
5115 DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32);
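/* Worked example (hypothetical values): load_offset == 40 with two 4-byte
 * components gives offset_256b == 1 and end_256b == DIV_ROUND_UP(48, 32)
 * == 2, i.e. the access must fit entirely inside a pushed range expressed
 * in 32-byte units.
 */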
5116
5117 /* See if we've selected this as a push constant candidate */
5118 elk_fs_reg push_reg;
5119 for (int i = 0; i < 4; i++) {
5120 const struct elk_ubo_range *range = &s.prog_data->ubo_ranges[i];
5121 if (range->block == ubo_block &&
5122 offset_256b >= range->start &&
5123 end_256b <= range->start + range->length) {
5124
5125 push_reg = elk_fs_reg(UNIFORM, UBO_START + i, dest.type);
5126 push_reg.offset = load_offset - 32 * range->start;
5127 break;
5128 }
5129 }
5130
5131 if (push_reg.file != BAD_FILE) {
5132 for (unsigned i = 0; i < instr->num_components; i++) {
5133 bld.MOV(offset(dest, bld, i),
5134 byte_offset(push_reg, i * type_size));
5135 }
5136 break;
5137 }
5138
5139 s.prog_data->has_ubo_pull = true;
5140
5141 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
5142 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
5143
5144 for (unsigned c = 0; c < instr->num_components;) {
5145 const unsigned base = load_offset + c * type_size;
5146 /* Number of usable components in the next block-aligned load. */
5147 const unsigned count = MIN2(instr->num_components - c,
5148 (block_sz - base % block_sz) / type_size);
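/* For illustration (hypothetical values): a dvec4 at load_offset == 48
 * first loads count = MIN2(4, (64 - 48) / 8) = 2 components from the
 * cacheline starting at byte 0, then the remaining 2 from the cacheline
 * starting at byte 64.
 */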
5149
5150 const elk_fs_reg packed_consts = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5151 elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
5152 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface;
5153 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
5154 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = elk_imm_ud(base & ~(block_sz - 1));
5155 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = elk_imm_ud(block_sz);
5156
5157 ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
5158 srcs, PULL_UNIFORM_CONSTANT_SRCS);
5159
5160 const elk_fs_reg consts =
5161 retype(byte_offset(packed_consts, base & (block_sz - 1)),
5162 dest.type);
5163
5164 for (unsigned d = 0; d < count; d++)
5165 bld.MOV(offset(dest, bld, c + d), component(consts, d));
5166
5167 c += count;
5168 }
5169 }
5170 break;
5171 }
5172
5173 case nir_intrinsic_load_global:
5174 case nir_intrinsic_load_global_constant: {
5175 assert(devinfo->ver >= 8);
5176
5177 assert(instr->def.bit_size <= 32);
5178 assert(nir_intrinsic_align(instr) > 0);
5179 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5180 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]);
5181 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5182 srcs[A64_LOGICAL_ENABLE_HELPERS] =
5183 elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5184
5185 if (instr->def.bit_size == 32 &&
5186 nir_intrinsic_align(instr) >= 4) {
5187 assert(instr->def.num_components <= 4);
5188
5189 srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5190
5191 elk_fs_inst *inst =
5192 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
5193 srcs, A64_LOGICAL_NUM_SRCS);
5194 inst->size_written = instr->num_components *
5195 inst->dst.component_size(inst->exec_size);
5196 } else {
5197 const unsigned bit_size = instr->def.bit_size;
5198 assert(instr->def.num_components == 1);
5199 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5200
5201 srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5202
5203 bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
5204 srcs, A64_LOGICAL_NUM_SRCS);
5205 bld.MOV(dest, subscript(tmp, dest.type, 0));
5206 }
5207 break;
5208 }
5209
5210 case nir_intrinsic_store_global: {
5211 assert(devinfo->ver >= 8);
5212
5213 assert(nir_src_bit_size(instr->src[0]) <= 32);
5214 assert(nir_intrinsic_write_mask(instr) ==
5215 (1u << instr->num_components) - 1);
5216 assert(nir_intrinsic_align(instr) > 0);
5217
5218 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5219 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5220 srcs[A64_LOGICAL_ENABLE_HELPERS] =
5221 elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5222
5223 if (nir_src_bit_size(instr->src[0]) == 32 &&
5224 nir_intrinsic_align(instr) >= 4) {
5225 assert(nir_src_num_components(instr->src[0]) <= 4);
5226
5227 srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */
5228 srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5229
5230 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, elk_fs_reg(),
5231 srcs, A64_LOGICAL_NUM_SRCS);
5232 } else {
5233 assert(nir_src_num_components(instr->src[0]) == 1);
5234 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5235 elk_reg_type data_type =
5236 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5237 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5238 bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type));
5239
5240 srcs[A64_LOGICAL_SRC] = tmp;
5241 srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5242
5243 bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, elk_fs_reg(),
5244 srcs, A64_LOGICAL_NUM_SRCS);
5245 }
5246 break;
5247 }
5248
5249 case nir_intrinsic_global_atomic:
5250 case nir_intrinsic_global_atomic_swap:
5251 fs_nir_emit_global_atomic(ntb, bld, instr);
5252 break;
5253
5254 case nir_intrinsic_load_global_constant_uniform_block_intel: {
5255 const unsigned total_dwords = ALIGN(instr->num_components,
5256 REG_SIZE * reg_unit(devinfo) / 4);
5257 unsigned loaded_dwords = 0;
5258
5259 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5260 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5261 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5262
5263 const elk_fs_reg packed_consts =
5264 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5265 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
5266
5267 while (loaded_dwords < total_dwords) {
5268 const unsigned block =
5269 choose_oword_block_size_dwords(devinfo,
5270 total_dwords - loaded_dwords);
5271 const unsigned block_bytes = block * 4;
5272
5273 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5274
5275 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5276 srcs[A64_LOGICAL_ADDRESS] = address;
5277 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5278 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
5279 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
5280 ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5281 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5282 srcs, A64_LOGICAL_NUM_SRCS)->size_written =
5283 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5284
5285 increment_a64_address(ubld1, address, block_bytes);
5286 loaded_dwords += block;
5287 }
5288
5289 for (unsigned c = 0; c < instr->num_components; c++)
5290 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5291 component(packed_consts, c));
5292
5293 break;
5294 }
5295
5296 case nir_intrinsic_load_ssbo: {
5297 assert(devinfo->ver >= 7);
5298
5299 const unsigned bit_size = instr->def.bit_size;
5300 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5301 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5302 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5303 SURFACE_LOGICAL_SRC_SURFACE] =
5304 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5305 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5306 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5307 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5308
5309 /* Make dest unsigned because that's what the temporary will be */
5310 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5311
5312 /* Read the vector */
5313 assert(bit_size <= 32);
5314 assert(nir_intrinsic_align(instr) > 0);
5315 if (bit_size == 32 &&
5316 nir_intrinsic_align(instr) >= 4) {
5317 assert(instr->def.num_components <= 4);
5318 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5319 elk_fs_inst *inst =
5320 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5321 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5322 inst->size_written = instr->num_components * s.dispatch_width * 4;
5323 } else {
5324 assert(instr->def.num_components == 1);
5325 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5326
5327 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5328 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5329 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5330 bld.MOV(dest, subscript(read_result, dest.type, 0));
5331 }
5332 break;
5333 }
5334
5335 case nir_intrinsic_store_ssbo: {
5336 assert(devinfo->ver >= 7);
5337
5338 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5339 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5340 srcs[get_nir_src_bindless(ntb, instr->src[1]) ?
5341 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5342 SURFACE_LOGICAL_SRC_SURFACE] =
5343 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5344 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]);
5345 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5346 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
5347
5348 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5349 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5350
5351 assert(bit_size <= 32);
5352 assert(nir_intrinsic_write_mask(instr) ==
5353 (1u << instr->num_components) - 1);
5354 assert(nir_intrinsic_align(instr) > 0);
5355 if (bit_size == 32 &&
5356 nir_intrinsic_align(instr) >= 4) {
5357 assert(nir_src_num_components(instr->src[0]) <= 4);
5358 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5359 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5360 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5361 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5362 } else {
5363 assert(nir_src_num_components(instr->src[0]) == 1);
5364 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5365
5366 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5367 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5368
5369 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5370 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5371 }
5372 break;
5373 }
5374
5375 case nir_intrinsic_load_ssbo_uniform_block_intel:
5376 case nir_intrinsic_load_shared_uniform_block_intel: {
5377 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5378
5379 const bool is_ssbo =
5380 instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
5381 if (is_ssbo) {
5382 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5383 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5384 SURFACE_LOGICAL_SRC_SURFACE] =
5385 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5386 } else {
5387 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
5388 }
5389
5390 const unsigned total_dwords = ALIGN(instr->num_components,
5391 REG_SIZE * reg_unit(devinfo) / 4);
5392 unsigned loaded_dwords = 0;
5393
5394 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5395 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5396 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5397
5398 const elk_fs_reg packed_consts =
5399 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5400
5401 const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
5402 if (nir_src_is_const(load_offset)) {
5403 elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5404 ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5405 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5406 } else {
5407 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5408 bld.emit_uniformize(get_nir_src(ntb, load_offset));
5409 }
5410
5411 while (loaded_dwords < total_dwords) {
5412 const unsigned block =
5413 choose_oword_block_size_dwords(devinfo,
5414 total_dwords - loaded_dwords);
5415 const unsigned block_bytes = block * 4;
5416
5417 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5418
5419 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5420 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5421 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5422 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5423 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5424
5425 loaded_dwords += block;
5426
5427 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5428 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5429 elk_imm_ud(block_bytes));
5430 }
5431
5432 for (unsigned c = 0; c < instr->num_components; c++)
5433 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5434 component(packed_consts, c));
5435
5436 break;
5437 }
5438
5439 case nir_intrinsic_store_output: {
5440 assert(nir_src_bit_size(instr->src[0]) == 32);
5441 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5442
5443 unsigned store_offset = nir_src_as_uint(instr->src[1]);
5444 unsigned num_components = instr->num_components;
5445 unsigned first_component = nir_intrinsic_component(instr);
5446
5447 elk_fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
5448 4 * store_offset), src.type);
5449 for (unsigned j = 0; j < num_components; j++) {
5450 bld.MOV(offset(new_dest, bld, j + first_component),
5451 offset(src, bld, j));
5452 }
5453 break;
5454 }
5455
5456 case nir_intrinsic_ssbo_atomic:
5457 case nir_intrinsic_ssbo_atomic_swap:
5458 fs_nir_emit_surface_atomic(ntb, bld, instr,
5459 get_nir_buffer_intrinsic_index(ntb, bld, instr),
5460 get_nir_src_bindless(ntb, instr->src[0]));
5461 break;
5462
5463 case nir_intrinsic_get_ssbo_size: {
5464 assert(nir_src_num_components(instr->src[0]) == 1);
5465
5466 /* A resinfo's sampler message is used to get the buffer size. The
5467 * SIMD8's writeback message consists of four registers and SIMD16's
5468 * writeback message consists of 8 destination registers (two per
5469 * component). Because we are only interested in the first channel of
5470 * the first returned component, where resinfo returns the buffer size
5471 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
5472 * the dispatch width.
5473 */
5474 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
5475 elk_fs_reg src_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5476 elk_fs_reg ret_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
5477
5478 /* Set LOD = 0 */
5479 ubld.MOV(src_payload, elk_imm_d(0));
5480
5481 elk_fs_reg srcs[GET_BUFFER_SIZE_SRCS];
5482 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5483 GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
5484 GET_BUFFER_SIZE_SRC_SURFACE] =
5485 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5486 srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
5487 elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
5488 srcs, GET_BUFFER_SIZE_SRCS);
5489 inst->header_size = 0;
5490 inst->mlen = reg_unit(devinfo);
5491 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5492
5493 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
5494 *
5495 * "Out-of-bounds checking is always performed at a DWord granularity. If
5496 * any part of the DWord is out-of-bounds then the whole DWord is
5497 * considered out-of-bounds."
5498 *
5499 * This implies that types with size smaller than 4-bytes need to be
5500 * padded if they don't complete the last dword of the buffer. But as we
5501 * need to maintain the original size we need to reverse the padding
5502 * calculation to return the correct size to know the number of elements
5503 * of an unsized array. As we stored in the last two bits of the surface
5504 * size the needed padding for the buffer, we calculate here the
5505 * original buffer_size reversing the surface_size calculation:
5506 *
5507 * surface_size = isl_align(buffer_size, 4) +
5508 * (isl_align(buffer_size, 4) - buffer_size)
5509 *
5510 * buffer_size = (surface_size & ~3) - (surface_size & 3)
5511 */
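/* Worked example (hypothetical values): for buffer_size == 5 we get
 * surface_size == isl_align(5, 4) + (isl_align(5, 4) - 5) == 8 + 3 == 11,
 * and the shader recovers (11 & ~3) - (11 & 3) == 8 - 3 == 5.
 */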
5512
5513 elk_fs_reg size_aligned4 = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5514 elk_fs_reg size_padding = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5515 elk_fs_reg buffer_size = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5516
5517 ubld.AND(size_padding, ret_payload, elk_imm_ud(3));
5518 ubld.AND(size_aligned4, ret_payload, elk_imm_ud(~3));
5519 ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
5520
5521 bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
5522 break;
5523 }
5524
5525 case nir_intrinsic_load_scratch: {
5526 assert(devinfo->ver >= 7);
5527
5528 assert(instr->def.num_components == 1);
5529 const unsigned bit_size = instr->def.bit_size;
5530 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5531
5532 if (devinfo->ver >= 8) {
5533 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5534 elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5535 } else {
5536 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5537 }
5538
5539 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5540 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5541 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5542 const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[0]);
5543
5544 /* Make dest unsigned because that's what the temporary will be */
5545 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5546
5547 /* Read the vector */
5548 assert(instr->def.num_components == 1);
5549 assert(bit_size <= 32);
5550 assert(nir_intrinsic_align(instr) > 0);
5551 if (bit_size == 32 &&
5552 nir_intrinsic_align(instr) >= 4) {
5553 /* The offset for a DWORD scattered message is in dwords. */
5554 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5555 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5556
5557 bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
5558 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5559 } else {
5560 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5561 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5562
5563 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5564 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5565 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5566 bld.MOV(dest, read_result);
5567 }
5568
5569 s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5570 break;
5571 }
5572
5573 case nir_intrinsic_store_scratch: {
5574 assert(devinfo->ver >= 7);
5575
5576 assert(nir_src_num_components(instr->src[0]) == 1);
5577 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5578 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5579
5580 if (devinfo->ver >= 8) {
5581 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5582 elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5583 } else {
5584 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5585 }
5586
5587 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5588 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5589 /**
5590 * While this instruction has side-effects, it should not be predicated
5591 * on sample mask, because otherwise fs helper invocations would
5592 * load undefined values from scratch memory. And scratch memory
5593 * load-stores are produced from operations without side-effects, thus
5594 * they should not have different behaviour in the helper invocations.
5595 */
5596 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5597 const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[1]);
5598
5599 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5600 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5601
5602 assert(nir_src_num_components(instr->src[0]) == 1);
5603 assert(bit_size <= 32);
5604 assert(nir_intrinsic_write_mask(instr) == 1);
5605 assert(nir_intrinsic_align(instr) > 0);
5606 if (bit_size == 32 &&
5607 nir_intrinsic_align(instr) >= 4) {
5608 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5609
5610 /* The offset for a DWORD scattered message is in dwords. */
5611 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5612 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5613
5614 bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
5615 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5616 } else {
5617 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5618 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5619
5620 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5621 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5622
5623 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5624 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5625 }
5626 s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5627 break;
5628 }
5629
5630 case nir_intrinsic_load_subgroup_size:
5631 /* This should only happen for fragment shaders because every other case
5632 * is lowered in NIR so we can optimize on it.
5633 */
5634 assert(s.stage == MESA_SHADER_FRAGMENT);
5635 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), elk_imm_d(s.dispatch_width));
5636 break;
5637
5638 case nir_intrinsic_load_subgroup_invocation:
5639 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
5640 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
5641 break;
5642
5643 case nir_intrinsic_load_subgroup_eq_mask:
5644 case nir_intrinsic_load_subgroup_ge_mask:
5645 case nir_intrinsic_load_subgroup_gt_mask:
5646 case nir_intrinsic_load_subgroup_le_mask:
5647 case nir_intrinsic_load_subgroup_lt_mask:
5648 unreachable("not reached");
5649
5650 case nir_intrinsic_vote_any: {
5651 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5652
5653 /* The any/all predicates do not consider channel enables. To prevent
5654 * dead channels from affecting the result, we initialize the flag with
5655 * the identity value for the logical operation.
5656 */
5657 if (s.dispatch_width == 32) {
5658 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5659 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5660 elk_imm_ud(0));
5661 } else {
5662 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0));
5663 }
5664 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5665
5666 /* For some reason, the any/all predicates don't work properly with
5667 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
5668 * doesn't read the correct subset of the flag register and you end up
5669 * getting garbage in the second half. Work around this by using a pair
5670 * of 1-wide MOVs and scattering the result.
5671 */
5672 const fs_builder ubld = ubld1;
5673 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5674 ubld.MOV(res1, elk_imm_d(0));
5675 set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ANY8H :
5676 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ANY16H :
5677 ELK_PREDICATE_ALIGN1_ANY32H,
5678 ubld.MOV(res1, elk_imm_d(-1)));
5679
5680 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5681 break;
5682 }
5683 case nir_intrinsic_vote_all: {
5684 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5685
5686 /* The any/all predicates do not consider channel enables. To prevent
5687 * dead channels from affecting the result, we initialize the flag with
5688 * the identity value for the logical operation.
5689 */
5690 if (s.dispatch_width == 32) {
5691 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5692 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5693 elk_imm_ud(0xffffffff));
5694 } else {
5695 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5696 }
5697 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5698
5699 /* For some reason, the any/all predicates don't work properly with
5700 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
5701 * doesn't read the correct subset of the flag register and you end up
5702 * getting garbage in the second half. Work around this by using a pair
5703 * of 1-wide MOVs and scattering the result.
5704 */
5705 const fs_builder ubld = ubld1;
5706 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5707 ubld.MOV(res1, elk_imm_d(0));
5708 set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
5709 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5710 ELK_PREDICATE_ALIGN1_ALL32H,
5711 ubld.MOV(res1, elk_imm_d(-1)));
5712
5713 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5714 break;
5715 }
5716 case nir_intrinsic_vote_feq:
5717 case nir_intrinsic_vote_ieq: {
5718 elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5719 if (instr->intrinsic == nir_intrinsic_vote_feq) {
5720 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5721 value.type = bit_size == 8 ? ELK_REGISTER_TYPE_B :
5722 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_F);
5723 }
5724
5725 elk_fs_reg uniformized = bld.emit_uniformize(value);
5726 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5727
5728 /* The any/all predicates do not consider channel enables. To prevent
5729 * dead channels from affecting the result, we initialize the flag with
5730 * the identity value for the logical operation.
5731 */
5732 if (s.dispatch_width == 32) {
5733 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5734 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5735 elk_imm_ud(0xffffffff));
5736 } else {
5737 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5738 }
5739 bld.CMP(bld.null_reg_d(), value, uniformized, ELK_CONDITIONAL_Z);
5740
5741 /* For some reason, the any/all predicates don't work properly with
5742 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
5743 * doesn't read the correct subset of the flag register and you end up
5744 * getting garbage in the second half. Work around this by using a pair
5745 * of 1-wide MOVs and scattering the result.
5746 */
5747 const fs_builder ubld = ubld1;
5748 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5749 ubld.MOV(res1, elk_imm_d(0));
5750 set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
5751 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5752 ELK_PREDICATE_ALIGN1_ALL32H,
5753 ubld.MOV(res1, elk_imm_d(-1)));
5754
5755 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5756 break;
5757 }
5758
5759 case nir_intrinsic_ballot: {
5760 const elk_fs_reg value = retype(get_nir_src(ntb, instr->src[0]),
5761 ELK_REGISTER_TYPE_UD);
5762 struct elk_reg flag = elk_flag_reg(0, 0);
5763 /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
5764 * as f0.0. This is a problem for fragment programs as we currently use
5765 * f0.1 for discards. Fortunately, we don't support SIMD32 fragment
5766 * programs yet so this isn't a problem. When we do, something will
5767 * have to change.
5768 */
5769 if (s.dispatch_width == 32)
5770 flag.type = ELK_REGISTER_TYPE_UD;
5771
5772 bld.exec_all().group(1, 0).MOV(flag, elk_imm_ud(0u));
5773 bld.CMP(bld.null_reg_ud(), value, elk_imm_ud(0u), ELK_CONDITIONAL_NZ);
5774
5775 if (instr->def.bit_size > 32) {
5776 dest.type = ELK_REGISTER_TYPE_UQ;
5777 } else {
5778 dest.type = ELK_REGISTER_TYPE_UD;
5779 }
5780 bld.MOV(dest, flag);
5781 break;
5782 }
5783
5784 case nir_intrinsic_read_invocation: {
5785 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5786 const elk_fs_reg invocation = get_nir_src(ntb, instr->src[1]);
5787
5788 elk_fs_reg tmp = bld.vgrf(value.type);
5789
5790 /* When, for some reason, the subgroup_size picked by NIR is larger than
5791 * the dispatch size picked by the backend (this could happen in RT,
5792 * FS), bound the invocation to the dispatch size.
5793 */
5794 elk_fs_reg bound_invocation;
5795 if (s.api_subgroup_size == 0 ||
5796 bld.dispatch_width() < s.api_subgroup_size) {
5797 bound_invocation = bld.vgrf(ELK_REGISTER_TYPE_UD);
5798 bld.AND(bound_invocation, invocation, elk_imm_ud(s.dispatch_width - 1));
5799 } else {
5800 bound_invocation = invocation;
5801 }
5802 bld.exec_all().emit(ELK_SHADER_OPCODE_BROADCAST, tmp, value,
5803 bld.emit_uniformize(bound_invocation));
5804
5805 bld.MOV(retype(dest, value.type), elk_fs_reg(component(tmp, 0)));
5806 break;
5807 }
5808
5809 case nir_intrinsic_read_first_invocation: {
5810 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5811 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
5812 break;
5813 }
5814
5815 case nir_intrinsic_shuffle: {
5816 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5817 const elk_fs_reg index = get_nir_src(ntb, instr->src[1]);
5818
5819 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
5820 break;
5821 }
5822
5823 case nir_intrinsic_first_invocation: {
5824 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5825 bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
5826 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5827 elk_fs_reg(component(tmp, 0)));
5828 break;
5829 }
5830
5831 case nir_intrinsic_last_invocation: {
5832 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5833 bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
5834 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5835 elk_fs_reg(component(tmp, 0)));
5836 break;
5837 }
5838
5839 case nir_intrinsic_quad_broadcast: {
5840 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5841 const unsigned index = nir_src_as_uint(instr->src[1]);
5842
5843 bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
5844 value, elk_imm_ud(index), elk_imm_ud(4));
5845 break;
5846 }
5847
5848 case nir_intrinsic_quad_swap_horizontal: {
5849 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5850 const elk_fs_reg tmp = bld.vgrf(value.type);
5851 if (devinfo->ver <= 7) {
5852 /* The hardware doesn't seem to support these crazy regions with
5853 * compressed instructions on gfx7 and earlier so we fall back to
5854 * using quad swizzles. Fortunately, we don't support 64-bit
5855 * anything in Vulkan on gfx7.
5856 */
5857 assert(nir_src_bit_size(instr->src[0]) == 32);
5858 const fs_builder ubld = bld.exec_all();
5859 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5860 elk_imm_ud(ELK_SWIZZLE4(1,0,3,2)));
5862 } else {
5863 const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);
5864
5865 const elk_fs_reg src_left = horiz_stride(value, 2);
5866 const elk_fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
5867 const elk_fs_reg tmp_left = horiz_stride(tmp, 2);
5868 const elk_fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
5869
5870 ubld.MOV(tmp_left, src_right);
5871 ubld.MOV(tmp_right, src_left);
5872
5873 }
5874 bld.MOV(retype(dest, value.type), tmp);
5875 break;
5876 }
5877
5878 case nir_intrinsic_quad_swap_vertical: {
5879 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5880 if (nir_src_bit_size(instr->src[0]) == 32) {
5881 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5882 const elk_fs_reg tmp = bld.vgrf(value.type);
5883 const fs_builder ubld = bld.exec_all();
5884 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5885 elk_imm_ud(ELK_SWIZZLE4(2,3,0,1)));
5886 bld.MOV(retype(dest, value.type), tmp);
5887 } else {
5888 /* For larger data types, we have to either emit dispatch_width many
5889 * MOVs or else fall back to doing indirects.
5890 */
5891 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5892 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5893 elk_imm_w(0x2));
5894 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5895 }
5896 break;
5897 }
5898
5899 case nir_intrinsic_quad_swap_diagonal: {
5900 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5901 if (nir_src_bit_size(instr->src[0]) == 32) {
5902 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5903 const elk_fs_reg tmp = bld.vgrf(value.type);
5904 const fs_builder ubld = bld.exec_all();
5905 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5906 elk_imm_ud(ELK_SWIZZLE4(3,2,1,0)));
5907 bld.MOV(retype(dest, value.type), tmp);
5908 } else {
5909 /* For larger data types, we have to either emit dispatch_width many
5910 * MOVs or else fall back to doing indirects.
5911 */
5912 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5913 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5914 elk_imm_w(0x3));
5915 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5916 }
5917 break;
5918 }
5919
5920 case nir_intrinsic_ddx_fine:
5921 bld.emit(ELK_FS_OPCODE_DDX_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5922 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5923 break;
5924 case nir_intrinsic_ddx:
5925 case nir_intrinsic_ddx_coarse:
5926 bld.emit(ELK_FS_OPCODE_DDX_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5927 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5928 break;
5929 case nir_intrinsic_ddy_fine:
5930 bld.emit(ELK_FS_OPCODE_DDY_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5931 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5932 break;
5933 case nir_intrinsic_ddy:
5934 case nir_intrinsic_ddy_coarse:
5935 bld.emit(ELK_FS_OPCODE_DDY_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5936 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5937 break;
5938
5939 case nir_intrinsic_reduce: {
5940 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5941 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5942 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
5943 if (cluster_size == 0 || cluster_size > s.dispatch_width)
5944 cluster_size = s.dispatch_width;
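/* A cluster size of 0 in NIR means "the whole subgroup", and any value
 * larger than the dispatch width degenerates to the same thing.
 */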
5945
5946 /* Figure out the source type */
5947 src.type = elk_type_for_nir_type(devinfo,
5948 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5949 nir_src_bit_size(instr->src[0])));
5950
5951 elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
5952 elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
5953 elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
5954
5955 /* Set up a register for all of our scratching around and initialize it
5956 * to the reduction operation's identity value.
5957 */
5958 elk_fs_reg scan = bld.vgrf(src.type);
5959 bld.exec_all().emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5960
5961 bld.emit_scan(elk_op, scan, cluster_size, cond_mod);
5962
5963 dest.type = src.type;
5964 if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
5965 /* In this case, the CLUSTER_BROADCAST instruction isn't needed because
5966 * the distance between clusters is at least 2 GRFs. That means we can
5967 * skip the weird striding of the CLUSTER_BROADCAST instruction and
5968 * just do regular MOVs.
5969 */
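/* For illustration (hypothetical values, assuming 32-byte GRFs): a SIMD16
 * reduction of 64-bit values with cluster_size == 16 gives
 * groups == (16 * 8) / 64 == 2 and group_size == 8, and both halves read
 * component 15 of the scan result.
 */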
5970 assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
5971 const unsigned groups =
5972 (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
5973 const unsigned group_size = s.dispatch_width / groups;
5974 for (unsigned i = 0; i < groups; i++) {
5975 const unsigned cluster = (i * group_size) / cluster_size;
5976 const unsigned comp = cluster * cluster_size + (cluster_size - 1);
5977 bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
5978 component(scan, comp));
5979 }
5980 } else {
5981 bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
5982 elk_imm_ud(cluster_size - 1), elk_imm_ud(cluster_size));
5983 }
5984 break;
5985 }
5986
5987 case nir_intrinsic_inclusive_scan:
5988 case nir_intrinsic_exclusive_scan: {
5989 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5990 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5991
5992 /* Figure out the source type */
5993 src.type = elk_type_for_nir_type(devinfo,
5994 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5995 nir_src_bit_size(instr->src[0])));
5996
5997 elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
5998 elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
5999 elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
6000
6001 /* Set up a register for all of our scratching around and initialize it
6002 * to the reduction operation's identity value.
6003 */
6004 elk_fs_reg scan = bld.vgrf(src.type);
6005 const fs_builder allbld = bld.exec_all();
6006 allbld.emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
6007
6008 if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
6009 /* Exclusive scan is a bit harder because we have to do an annoying
6010 * shift of the contents before we can begin. To make things worse,
6011 * we can't do this with a normal stride; we have to use indirects.
6012 */
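/* In other words: channel i ends up with the scan input of channel i - 1,
 * and channel 0 is seeded with the operation's identity value.
 */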
6013 elk_fs_reg shifted = bld.vgrf(src.type);
6014 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
6015 allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
6016 elk_imm_w(-1));
6017 allbld.emit(ELK_SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
6018 allbld.group(1, 0).MOV(component(shifted, 0), identity);
6019 scan = shifted;
6020 }
6021
6022 bld.emit_scan(elk_op, scan, s.dispatch_width, cond_mod);
6023
6024 bld.MOV(retype(dest, src.type), scan);
6025 break;
6026 }
6027
6028 case nir_intrinsic_load_global_block_intel: {
6029 assert(instr->def.bit_size == 32);
6030
6031 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
6032
6033 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6034 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6035 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6036
6037 const unsigned total = instr->num_components * s.dispatch_width;
6038 unsigned loaded = 0;
6039
6040 while (loaded < total) {
6041 const unsigned block =
6042 choose_oword_block_size_dwords(devinfo, total - loaded);
6043 const unsigned block_bytes = block * 4;
6044
6045 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6046
6047 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6048 srcs[A64_LOGICAL_ADDRESS] = address;
6049 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
6050 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6051 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(1);
6052 ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6053 retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6054 srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6055
6056 increment_a64_address(ubld1, address, block_bytes);
6057 loaded += block;
6058 }
6059
6060 assert(loaded == total);
6061 break;
6062 }
6063
6064 case nir_intrinsic_store_global_block_intel: {
6065 assert(nir_src_bit_size(instr->src[0]) == 32);
6066
6067 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
6068 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6069
6070 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6071 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6072 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6073
6074 const unsigned total = instr->num_components * s.dispatch_width;
6075 unsigned written = 0;
6076
6077 while (written < total) {
6078 const unsigned block =
6079 choose_oword_block_size_dwords(devinfo, total - written);
6080
6081 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6082 srcs[A64_LOGICAL_ADDRESS] = address;
6083 srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
6084 ELK_REGISTER_TYPE_UD);
6085 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6086 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6087
6088 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6089 ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, elk_fs_reg(),
6090 srcs, A64_LOGICAL_NUM_SRCS);
6091
6092 const unsigned block_bytes = block * 4;
6093 increment_a64_address(ubld1, address, block_bytes);
6094 written += block;
6095 }
6096
6097 assert(written == total);
6098 break;
6099 }
6100
6101 case nir_intrinsic_load_shared_block_intel:
6102 case nir_intrinsic_load_ssbo_block_intel: {
6103 assert(instr->def.bit_size == 32);
6104
6105 const bool is_ssbo =
6106 instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
6107 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0]));
6108
6109 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6110 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6111 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6112 elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6113 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6114
6115 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6116 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6117 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6118
6119 const unsigned total = instr->num_components * s.dispatch_width;
6120 unsigned loaded = 0;
6121
6122 while (loaded < total) {
6123 const unsigned block =
6124 choose_oword_block_size_dwords(devinfo, total - loaded);
6125 const unsigned block_bytes = block * 4;
6126
6127 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6128
6129 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6130 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6131 retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6132 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6133
6134 ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6135 loaded += block;
6136 }
6137
6138 assert(loaded == total);
6139 break;
6140 }
6141
6142 case nir_intrinsic_store_shared_block_intel:
6143 case nir_intrinsic_store_ssbo_block_intel: {
6144 assert(nir_src_bit_size(instr->src[0]) == 32);
6145
6146 const bool is_ssbo =
6147 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
6148
6149 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 2 : 1]));
6150 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6151
6152 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6153 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6154 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6155 elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6156 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6157
6158 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6159 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6160 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6161
6162 const unsigned total = instr->num_components * s.dispatch_width;
6163 unsigned written = 0;
6164
6165 while (written < total) {
6166 const unsigned block =
6167 choose_oword_block_size_dwords(devinfo, total - written);
6168
6169 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6170 srcs[SURFACE_LOGICAL_SRC_DATA] =
6171 retype(byte_offset(src, written * 4), ELK_REGISTER_TYPE_UD);
6172
6173 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6174 ubld.emit(ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
6175 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6176
6177 const unsigned block_bytes = block * 4;
6178 ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6179 written += block;
6180 }
6181
6182 assert(written == total);
6183 break;
6184 }
6185
6186 default:
6187 #ifndef NDEBUG
6188 assert(instr->intrinsic < nir_num_intrinsics);
6189 fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
6190 #endif
6191 unreachable("unknown intrinsic");
6192 }
6193 }
6194
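/* The untyped atomic messages used below operate on dword-sized data, so
 * widen 16-bit operands into a 32-bit temporary; wider types pass through
 * unchanged.
 */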
6195 static elk_fs_reg
6196 expand_to_32bit(const fs_builder &bld, const elk_fs_reg &src)
6197 {
6198 if (type_sz(src.type) == 2) {
6199 elk_fs_reg src32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6200 bld.MOV(src32, retype(src, ELK_REGISTER_TYPE_UW));
6201 return src32;
6202 } else {
6203 return src;
6204 }
6205 }
6206
6207 static void
6208 fs_nir_emit_surface_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6209 nir_intrinsic_instr *instr,
6210 elk_fs_reg surface,
6211 bool bindless)
6212 {
6213 const intel_device_info *devinfo = ntb.devinfo;
6214 elk_fs_visitor &s = ntb.s;
6215
6216 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6217 int num_data = lsc_op_num_data_values(op);
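/* One data value for most atomic ops, two for the compare-and-swap style
 * ops (compare value plus new value); the payload is assembled accordingly
 * below.
 */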
6218
6219 bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM;
6220
6221 /* The BTI untyped atomic messages only support 32-bit atomics. If you
6222 * just look at the big table of messages in Vol 7 of the SKL PRM, 64-bit
6223 * atomics appear to exist. However, if you look at Vol 2a, there are no message
6224 * descriptors provided for Qword atomic ops except for A64 messages.
6225 *
6226 * 16-bit float atomics are supported, however.
6227 */
6228 assert(instr->def.bit_size == 32 ||
6229 (instr->def.bit_size == 64 && devinfo->has_lsc) ||
6230 (instr->def.bit_size == 16 &&
6231 (devinfo->has_lsc || elk_lsc_opcode_is_atomic_float(op))));
6232
6233 elk_fs_reg dest = get_nir_def(ntb, instr->def);
6234
6235 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6236 srcs[bindless ?
6237 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6238 SURFACE_LOGICAL_SRC_SURFACE] = surface;
6239 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
6240 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
6241 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
6242
6243 if (shared) {
6244 /* SLM - Get the offset */
6245 if (nir_src_is_const(instr->src[0])) {
6246 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6247 elk_imm_ud(nir_intrinsic_base(instr) +
6248 nir_src_as_uint(instr->src[0]));
6249 } else {
6250 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type());
6251 bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6252 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD),
6253 elk_imm_ud(nir_intrinsic_base(instr)));
6254 }
6255 } else {
6256 /* SSBOs */
6257 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6258 }
6259
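   /* Gather the data operands, expanding any 16-bit sources to 32 bits.
    * Two-operand atomics (compare-and-swap) pack both values into a single
    * payload via LOAD_PAYLOAD.
    */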
6260 elk_fs_reg data;
6261 if (num_data >= 1)
6262 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2]));
6263
6264 if (num_data >= 2) {
6265 elk_fs_reg tmp = bld.vgrf(data.type, 2);
6266 elk_fs_reg sources[2] = {
6267 data,
6268 expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 2 : 3]))
6269 };
6270 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6271 data = tmp;
6272 }
6273 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6274
6275 /* Emit the actual atomic operation */
6276
6277 switch (instr->def.bit_size) {
6278 case 16: {
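      /* The message returns a full 32-bit register per channel; copy the
       * low word into the 16-bit destination.
       */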
6279 elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6280 bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6281 retype(dest32, dest.type),
6282 srcs, SURFACE_LOGICAL_NUM_SRCS);
6283 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW),
6284 retype(dest32, ELK_REGISTER_TYPE_UD));
6285 break;
6286 }
6287
6288 case 32:
6289 case 64:
6290 bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6291 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6292 break;
6293 default:
6294 unreachable("Unsupported bit size");
6295 }
6296 }
6297
6298 static void
6299 fs_nir_emit_global_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6300 nir_intrinsic_instr *instr)
6301 {
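   /* A64 (global) atomics take the 64-bit address directly from the first
    * source instead of a surface + offset pair; the data operand handling
    * mirrors the surface atomic path above.
    */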
6302 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6303 int num_data = lsc_op_num_data_values(op);
6304
6305 elk_fs_reg dest = get_nir_def(ntb, instr->def);
6306
6307 elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
6308
6309 elk_fs_reg data;
6310 if (num_data >= 1)
6311 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1]));
6312
6313 if (num_data >= 2) {
6314 elk_fs_reg tmp = bld.vgrf(data.type, 2);
6315 elk_fs_reg sources[2] = {
6316 data,
6317 expand_to_32bit(bld, get_nir_src(ntb, instr->src[2]))
6318 };
6319 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6320 data = tmp;
6321 }
6322
6323 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6324 srcs[A64_LOGICAL_ADDRESS] = addr;
6325 srcs[A64_LOGICAL_SRC] = data;
6326 srcs[A64_LOGICAL_ARG] = elk_imm_ud(op);
6327 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6328
6329 switch (instr->def.bit_size) {
6330 case 16: {
6331 elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6332 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
6333 retype(dest32, dest.type),
6334 srcs, A64_LOGICAL_NUM_SRCS);
6335 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW), dest32);
6336 break;
6337 }
6338 case 32:
6339 case 64:
6340 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
6341 srcs, A64_LOGICAL_NUM_SRCS);
6342 break;
6343 default:
6344 unreachable("Unsupported bit size");
6345 }
6346 }
6347
6348 static void
6349 fs_nir_emit_texture(nir_to_elk_state &ntb,
6350 nir_tex_instr *instr)
6351 {
6352 const intel_device_info *devinfo = ntb.devinfo;
6353 const fs_builder &bld = ntb.bld;
6354 elk_fs_visitor &s = ntb.s;
6355
6356 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
6357
6358 /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
6359 *
6360 * "The Pixel Null Mask field, when enabled via the Pixel Null Mask
6361 * Enable will be incorrect for sample_c when applied to a surface with
6362 * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
6363 * Enable may incorrectly report pixels as referencing a Null surface."
6364 *
6365 * We'll take care of this in NIR.
6366 */
6367 assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
6368
6369 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(instr->is_sparse);
6370
6371 int lod_components = 0;
6372
6373 /* The hardware requires a LOD for buffer textures */
6374 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6375 srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_d(0);
6376
6377 ASSERTED bool got_lod = false;
6378 ASSERTED bool got_bias = false;
6379 uint32_t header_bits = 0;
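   /* Walk the NIR texture sources and scatter them into the logical send
    * sources, retyping each one to the representation the sampler message
    * expects.
    */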
6380 for (unsigned i = 0; i < instr->num_srcs; i++) {
6381 nir_src nir_src = instr->src[i].src;
6382 elk_fs_reg src = get_nir_src(ntb, nir_src);
6383 switch (instr->src[i].src_type) {
6384 case nir_tex_src_bias:
6385 assert(!got_lod);
6386 got_bias = true;
6387
6388 srcs[TEX_LOGICAL_SRC_LOD] =
6389 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6390 break;
6391 case nir_tex_src_comparator:
6392 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, ELK_REGISTER_TYPE_F);
6393 break;
6394 case nir_tex_src_coord:
6395 switch (instr->op) {
6396 case nir_texop_txf:
6397 case nir_texop_txf_ms:
6398 case nir_texop_txf_ms_mcs_intel:
6399 case nir_texop_samples_identical:
6400 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_D);
6401 break;
6402 default:
6403 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_F);
6404 break;
6405 }
6406 break;
6407 case nir_tex_src_ddx:
6408 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, ELK_REGISTER_TYPE_F);
6409 lod_components = nir_tex_instr_src_size(instr, i);
6410 break;
6411 case nir_tex_src_ddy:
6412 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, ELK_REGISTER_TYPE_F);
6413 break;
6414 case nir_tex_src_lod:
6415 assert(!got_bias);
6416 got_lod = true;
6417
6418 switch (instr->op) {
6419 case nir_texop_txs:
6420 srcs[TEX_LOGICAL_SRC_LOD] =
6421 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_UD);
6422 break;
6423 case nir_texop_txf:
6424 srcs[TEX_LOGICAL_SRC_LOD] =
6425 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_D);
6426 break;
6427 default:
6428 srcs[TEX_LOGICAL_SRC_LOD] =
6429 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6430 break;
6431 }
6432 break;
6433 case nir_tex_src_min_lod:
6434 srcs[TEX_LOGICAL_SRC_MIN_LOD] =
6435 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6436 break;
6437 case nir_tex_src_ms_index:
6438 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, ELK_REGISTER_TYPE_UD);
6439 break;
6440
6441 case nir_tex_src_offset: {
6442 uint32_t offset_bits = 0;
6443 if (elk_texture_offset(instr, i, &offset_bits)) {
6444 header_bits |= offset_bits;
6445 } else {
6446 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
6447 retype(src, ELK_REGISTER_TYPE_D);
6448 }
6449 break;
6450 }
6451
6452 case nir_tex_src_projector:
6453 unreachable("should be lowered");
6454
6455 case nir_tex_src_texture_offset: {
6456 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
6457 /* Emit code to evaluate the actual indexing expression */
6458 if (instr->texture_index == 0 && is_resource_src(nir_src))
6459 srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src);
6460 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) {
6461 elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6462 bld.ADD(tmp, src, elk_imm_ud(instr->texture_index));
6463 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
6464 }
6465 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
6466 break;
6467 }
6468
6469 case nir_tex_src_sampler_offset: {
6470 /* Emit code to evaluate the actual indexing expression */
6471 if (instr->sampler_index == 0 && is_resource_src(nir_src))
6472 srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src);
6473 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) {
6474 elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6475 bld.ADD(tmp, src, elk_imm_ud(instr->sampler_index));
6476 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
6477 }
6478 break;
6479 }
6480
6481 case nir_tex_src_texture_handle:
6482 assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
6483 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_fs_reg();
6484 if (is_resource_src(nir_src))
6485 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src);
6486 if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6487 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
6488 break;
6489
6490 case nir_tex_src_sampler_handle:
6491 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
6492 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_fs_reg();
6493 if (is_resource_src(nir_src))
6494 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src);
6495 if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6496 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
6497 break;
6498
6499 case nir_tex_src_ms_mcs_intel:
6500 assert(instr->op == nir_texop_txf_ms);
6501 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, ELK_REGISTER_TYPE_D);
6502 break;
6503
6504 /* If this parameter is present, we are packing either the explicit LOD
6505 * or LOD bias and the array index into a single (32-bit) value when
6506 * 32-bit texture coordinates are used.
6507 */
6508 case nir_tex_src_backend1:
6509 assert(!got_lod && !got_bias);
6510 got_lod = true;
6511
6512 assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
6513 srcs[TEX_LOGICAL_SRC_LOD] =
6514 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6515 break;
6516
6517 default:
6518 unreachable("unknown texture source");
6519 }
6520 }
6521
6522 /* If the surface or sampler were not specified through sources, use the
6523 * instruction index.
6524 */
6525 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
6526 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6527 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(instr->texture_index);
6528 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
6529 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6530 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(instr->sampler_index);
6531
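   /* txf_ms and samples_identical need the MCS (compression) data; if NIR
    * didn't provide it, fetch it here on Gfx7+, otherwise pass zero.
    */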
6532 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
6533 (instr->op == nir_texop_txf_ms ||
6534 instr->op == nir_texop_samples_identical)) {
6535 if (devinfo->ver >= 7) {
6536 srcs[TEX_LOGICAL_SRC_MCS] =
6537 emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
6538 instr->coord_components,
6539 srcs[TEX_LOGICAL_SRC_SURFACE],
6540 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
6541 } else {
6542 srcs[TEX_LOGICAL_SRC_MCS] = elk_imm_ud(0u);
6543 }
6544 }
6545
6546 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(instr->coord_components);
6547 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(lod_components);
6548
6549 enum elk_opcode opcode;
6550 switch (instr->op) {
6551 case nir_texop_tex:
6552 opcode = ELK_SHADER_OPCODE_TEX_LOGICAL;
6553 break;
6554 case nir_texop_txb:
6555 opcode = ELK_FS_OPCODE_TXB_LOGICAL;
6556 break;
6557 case nir_texop_txl:
6558 opcode = ELK_SHADER_OPCODE_TXL_LOGICAL;
6559 break;
6560 case nir_texop_txd:
6561 opcode = ELK_SHADER_OPCODE_TXD_LOGICAL;
6562 break;
6563 case nir_texop_txf:
6564 opcode = ELK_SHADER_OPCODE_TXF_LOGICAL;
6565 break;
6566 case nir_texop_txf_ms:
6567 opcode = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
6568 break;
6569 case nir_texop_txf_ms_mcs_intel:
6570 opcode = ELK_SHADER_OPCODE_TXF_MCS_LOGICAL;
6571 break;
6572 case nir_texop_query_levels:
6573 case nir_texop_txs:
6574 opcode = ELK_SHADER_OPCODE_TXS_LOGICAL;
6575 break;
6576 case nir_texop_lod:
6577 opcode = ELK_SHADER_OPCODE_LOD_LOGICAL;
6578 break;
6579 case nir_texop_tg4:
6580 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
6581 opcode = ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL;
6582 else
6583 opcode = ELK_SHADER_OPCODE_TG4_LOGICAL;
6584 break;
6585 case nir_texop_texture_samples:
6586 opcode = ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL;
6587 break;
6588 case nir_texop_samples_identical: {
6589 elk_fs_reg dst = retype(get_nir_def(ntb, instr->def), ELK_REGISTER_TYPE_D);
6590
6591 /* If mcs is an immediate value, it means there is no MCS. In that case
6592 * just return false.
6593 */
6594 if (srcs[TEX_LOGICAL_SRC_MCS].file == ELK_IMMEDIATE_VALUE) {
6595 bld.MOV(dst, elk_imm_ud(0u));
6596 } else {
6597 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], elk_imm_ud(0u),
6598 ELK_CONDITIONAL_EQ);
6599 }
6600 return;
6601 }
6602 default:
6603 unreachable("unknown texture opcode");
6604 }
6605
6606 if (instr->op == nir_texop_tg4) {
6607 if (instr->component == 1 &&
6608 s.key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) {
6609 /* gather4 sampler is broken for green channel on RG32F --
6610 * we must ask for blue instead.
6611 */
6612 header_bits |= 2 << 16;
6613 } else {
6614 header_bits |= instr->component << 16;
6615 }
6616 }
6617
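   /* Emit the logical texture instruction.  The instruction offset carries
    * the header bits (immediate texel offsets and gather4 channel select).
    */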
6618 elk_fs_reg dst = bld.vgrf(elk_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
6619 elk_fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
6620 inst->offset = header_bits;
6621
6622 const unsigned dest_size = nir_tex_instr_dest_size(instr);
6623 inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
6624 (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
6625
6626 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
6627 inst->shadow_compare = true;
6628
6629 /* Wa_14012688258:
6630 *
6631 * Don't trim zeros at the end of payload for sample operations
6632 * in cube and cube arrays.
6633 */
6634 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6635 intel_needs_workaround(devinfo, 14012688258)) {
6636
6637 /* Compiler should send U,V,R parameters even if V,R are 0. */
6638 if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
6639 assert(instr->coord_components >= 3u);
6640
6641 /* See opt_zero_samples(). */
6642 inst->keep_payload_trailing_zeros = true;
6643 }
6644
6645 elk_fs_reg nir_dest[5];
6646 for (unsigned i = 0; i < dest_size; i++)
6647 nir_dest[i] = offset(dst, bld, i);
6648
6649 if (instr->op == nir_texop_query_levels) {
6650 /* # levels is in .w */
6651 /**
6652 * Wa_1940217:
6653 *
6654 * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
6655 * MIPCount returned is undefined instead of 0.
6656 */
6657 elk_fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
6658 mov->conditional_mod = ELK_CONDITIONAL_NZ;
6659 nir_dest[0] = bld.vgrf(ELK_REGISTER_TYPE_D);
6660 elk_fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), elk_imm_d(0));
6661 sel->predicate = ELK_PREDICATE_NORMAL;
6662 } else if (instr->op == nir_texop_txs &&
6663 dest_size >= 3 && devinfo->ver < 7) {
6664 /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
6665 elk_fs_reg depth = offset(dst, bld, 2);
6666 nir_dest[2] = s.vgrf(glsl_int_type());
6667 bld.emit_minmax(nir_dest[2], depth, elk_imm_d(1), ELK_CONDITIONAL_GE);
6668 }
6669
6670 /* The residency bits are only in the first component. */
6671 if (instr->is_sparse)
6672 nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
6673
6674 bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0);
6675 }
6676
6677 static void
6678 fs_nir_emit_jump(nir_to_elk_state &ntb, nir_jump_instr *instr)
6679 {
6680 switch (instr->type) {
6681 case nir_jump_break:
6682 ntb.bld.emit(ELK_OPCODE_BREAK);
6683 break;
6684 case nir_jump_continue:
6685 ntb.bld.emit(ELK_OPCODE_CONTINUE);
6686 break;
6687 case nir_jump_halt:
6688 ntb.bld.emit(ELK_OPCODE_HALT);
6689 break;
6690 case nir_jump_return:
6691 default:
6692 unreachable("unknown jump");
6693 }
6694 }
6695
6696 /*
6697 * This helper takes a source register and un/shuffles it into the destination
6698 * register.
6699 *
6700 * If the source type size is smaller than the destination type size, the
6701 * operation needed is a component shuffle; the opposite case is an
6702 * unshuffle. If the source and destination type sizes are equal, the
6703 * shuffle is equivalent to a simple MOV.
6704 *
6705 * For example, if the source is a 16-bit type and the destination is
6706 * 32-bit, a 3-component .xyz 16-bit vector on SIMD8 would be:
6707 *
6708 * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
6709 * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
6710 *
6711 * This helper will return the following 2 32-bit components with the 16-bit
6712 * values shuffled:
6713 *
6714 * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
6715 * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
6716 *
6717 * For unshuffle, the example would be the opposite, a 64-bit type source
6718 * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
6719 * would be:
6720 *
6721 * | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
6722 * | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
6723 * | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
6724 * | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
6725 *
6726 * The returned result would be the following 4 32-bit components unshuffled:
6727 *
6728 * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
6729 * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
6730 * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
6731 * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
6732 *
6733 * - Source and destination registers must not overlap.
6734 * - Component counts are measured in units of the smaller type between
6735 *   source and destination, because we are un/shuffling the smaller
6736 *   components from/into the bigger ones.
6737 * - The first_component parameter allows skipping source components.
6738 */
6739 void
6740 elk_shuffle_src_to_dst(const fs_builder &bld,
6741 const elk_fs_reg &dst,
6742 const elk_fs_reg &src,
6743 uint32_t first_component,
6744 uint32_t components)
6745 {
6746 if (type_sz(src.type) == type_sz(dst.type)) {
6747 assert(!regions_overlap(dst,
6748 type_sz(dst.type) * bld.dispatch_width() * components,
6749 offset(src, bld, first_component),
6750 type_sz(src.type) * bld.dispatch_width() * components));
6751 for (unsigned i = 0; i < components; i++) {
6752 bld.MOV(retype(offset(dst, bld, i), src.type),
6753 offset(src, bld, i + first_component));
6754 }
6755 } else if (type_sz(src.type) < type_sz(dst.type)) {
6756 /* Source is shuffled into destination */
6757 unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
6758 assert(!regions_overlap(dst,
6759 type_sz(dst.type) * bld.dispatch_width() *
6760 DIV_ROUND_UP(components, size_ratio),
6761 offset(src, bld, first_component),
6762 type_sz(src.type) * bld.dispatch_width() * components));
6763
6764 elk_reg_type shuffle_type =
6765 elk_reg_type_from_bit_size(8 * type_sz(src.type),
6766 ELK_REGISTER_TYPE_D);
6767 for (unsigned i = 0; i < components; i++) {
6768 elk_fs_reg shuffle_component_i =
6769 subscript(offset(dst, bld, i / size_ratio),
6770 shuffle_type, i % size_ratio);
6771 bld.MOV(shuffle_component_i,
6772 retype(offset(src, bld, i + first_component), shuffle_type));
6773 }
6774 } else {
6775 /* Source is unshuffled into destination */
6776 unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
6777 assert(!regions_overlap(dst,
6778 type_sz(dst.type) * bld.dispatch_width() * components,
6779 offset(src, bld, first_component / size_ratio),
6780 type_sz(src.type) * bld.dispatch_width() *
6781 DIV_ROUND_UP(components + (first_component % size_ratio),
6782 size_ratio)));
6783
6784 elk_reg_type shuffle_type =
6785 elk_reg_type_from_bit_size(8 * type_sz(dst.type),
6786 ELK_REGISTER_TYPE_D);
6787 for (unsigned i = 0; i < components; i++) {
6788 elk_fs_reg shuffle_component_i =
6789 subscript(offset(src, bld, (first_component + i) / size_ratio),
6790 shuffle_type, (first_component + i) % size_ratio);
6791 bld.MOV(retype(offset(dst, bld, i), shuffle_type),
6792 shuffle_component_i);
6793 }
6794 }
6795 }
6796
6797 void
6798 elk_shuffle_from_32bit_read(const fs_builder &bld,
6799 const elk_fs_reg &dst,
6800 const elk_fs_reg &src,
6801 uint32_t first_component,
6802 uint32_t components)
6803 {
6804 assert(type_sz(src.type) == 4);
6805
6806 /* This function takes components in units of the destination type while
6807 * elk_shuffle_src_to_dst takes components in units of the smallest type
6808 */
6809 if (type_sz(dst.type) > 4) {
6810 assert(type_sz(dst.type) == 8);
6811 first_component *= 2;
6812 components *= 2;
6813 }
6814
6815 elk_shuffle_src_to_dst(bld, dst, src, first_component, components);
6816 }
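/* Typical use is unpacking the result of a 32-bit block read into a wider
 * destination.  A sketch (the "read_result" and "num_components" names are
 * purely illustrative):
 *
 *    elk_shuffle_from_32bit_read(bld, retype(dest, ELK_REGISTER_TYPE_DF),
 *                                read_result, 0, num_components);
 *
 * With a 64-bit destination type the component count is doubled internally
 * before the underlying shuffle.
 */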
6817
6818 elk_fs_reg
6819 elk_setup_imm_df(const fs_builder &bld, double v)
6820 {
6821 const struct intel_device_info *devinfo = bld.shader->devinfo;
6822 assert(devinfo->ver >= 7);
6823
6824 if (devinfo->ver >= 8)
6825 return elk_imm_df(v);
6826
6827 /* gfx7.5 does not support DF immediates directly, but the DIM
6828 * instruction allows setting a 64-bit immediate value.
6829 */
6830 if (devinfo->platform == INTEL_PLATFORM_HSW) {
6831 const fs_builder ubld = bld.exec_all().group(1, 0);
6832 elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_DF, 1);
6833 ubld.DIM(dst, elk_imm_df(v));
6834 return component(dst, 0);
6835 }
6836
6837 /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
6838 * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
6839 * the high 32 bits to suboffset 4, and then applying a stride of 0.
6840 *
6841 * Alternatively, we could produce a normal VGRF (without stride 0) by
6842 * writing to all the channels in the VGRF. However, that would hit the
6843 * gfx7 bug where writes spanning more than one register have to be split
6844 * into instructions with a width of 4 (otherwise the write to the second
6845 * register runs into an execmask hardware bug), which isn't very
6846 * nice.
6847 */
6848 union {
6849 double d;
6850 struct {
6851 uint32_t i1;
6852 uint32_t i2;
6853 };
6854 } di;
6855
6856 di.d = v;
6857
6858 const fs_builder ubld = bld.exec_all().group(1, 0);
6859 const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
6860 ubld.MOV(tmp, elk_imm_ud(di.i1));
6861 ubld.MOV(horiz_offset(tmp, 1), elk_imm_ud(di.i2));
6862
6863 return component(retype(tmp, ELK_REGISTER_TYPE_DF), 0);
6864 }
6865
6866 elk_fs_reg
6867 elk_setup_imm_b(const fs_builder &bld, int8_t v)
6868 {
6869 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_B);
6870 bld.MOV(tmp, elk_imm_w(v));
6871 return tmp;
6872 }
6873
6874 elk_fs_reg
6875 elk_setup_imm_ub(const fs_builder &bld, uint8_t v)
6876 {
6877 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UB);
6878 bld.MOV(tmp, elk_imm_uw(v));
6879 return tmp;
6880 }
6881
6882 static void
6883 fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr)
6884 {
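   /* Annotate subsequently emitted IR with the current NIR instruction,
    * then dispatch on the instruction type.
    */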
6885 ntb.bld = ntb.bld.annotate(NULL, instr);
6886
6887 switch (instr->type) {
6888 case nir_instr_type_alu:
6889 fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
6890 break;
6891
6892 case nir_instr_type_deref:
6893 unreachable("All derefs should've been lowered");
6894 break;
6895
6896 case nir_instr_type_intrinsic:
6897 switch (ntb.s.stage) {
6898 case MESA_SHADER_VERTEX:
6899 fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6900 break;
6901 case MESA_SHADER_TESS_CTRL:
6902 fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6903 break;
6904 case MESA_SHADER_TESS_EVAL:
6905 fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6906 break;
6907 case MESA_SHADER_GEOMETRY:
6908 fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6909 break;
6910 case MESA_SHADER_FRAGMENT:
6911 fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6912 break;
6913 case MESA_SHADER_COMPUTE:
6914 fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6915 break;
6916 default:
6917 unreachable("unsupported shader stage");
6918 }
6919 break;
6920
6921 case nir_instr_type_tex:
6922 fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
6923 break;
6924
6925 case nir_instr_type_load_const:
6926 fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
6927 break;
6928
6929 case nir_instr_type_undef:
6930 /* We create a new VGRF for undefs on every use (by handling
6931 * them in get_nir_src()), rather than for each definition.
6932 * This helps register coalescing eliminate MOVs from undef.
6933 */
6934 break;
6935
6936 case nir_instr_type_jump:
6937 fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
6938 break;
6939
6940 default:
6941 unreachable("unknown instruction type");
6942 }
6943 }
6944
6945 static unsigned
6946 elk_rnd_mode_from_nir(unsigned mode, unsigned *mask)
6947 {
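   /* Translate the NIR float_controls bits into a cr0 rounding/denorm mode
    * value, and return in *mask which cr0 bits that mode actually
    * constrains.
    */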
6948 unsigned elk_mode = 0;
6949 *mask = 0;
6950
6951 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
6952 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
6953 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
6954 mode) {
6955 elk_mode |= ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT;
6956 *mask |= ELK_CR0_RND_MODE_MASK;
6957 }
6958 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
6959 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
6960 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
6961 mode) {
6962 elk_mode |= ELK_RND_MODE_RTNE << ELK_CR0_RND_MODE_SHIFT;
6963 *mask |= ELK_CR0_RND_MODE_MASK;
6964 }
6965 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
6966 elk_mode |= ELK_CR0_FP16_DENORM_PRESERVE;
6967 *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6968 }
6969 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
6970 elk_mode |= ELK_CR0_FP32_DENORM_PRESERVE;
6971 *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
6972 }
6973 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
6974 elk_mode |= ELK_CR0_FP64_DENORM_PRESERVE;
6975 *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
6976 }
6977 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
6978 *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6979 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
6980 *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
6981 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
6982 *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
6983 if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
6984 *mask |= ELK_CR0_FP_MODE_MASK;
6985
6986 if (*mask != 0)
6987 assert((*mask & elk_mode) == elk_mode);
6988
6989 return elk_mode;
6990 }
6991
6992 static void
6993 emit_shader_float_controls_execution_mode(nir_to_elk_state &ntb)
6994 {
6995 const fs_builder &bld = ntb.bld;
6996 elk_fs_visitor &s = ntb.s;
6997
6998 unsigned execution_mode = s.nir->info.float_controls_execution_mode;
6999 if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7000 return;
7001
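   /* Program cr0 once with the rounding / denorm behaviour requested by the
    * shader's float controls execution mode.
    */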
7002 fs_builder ubld = bld.exec_all().group(1, 0);
7003 fs_builder abld = ubld.annotate("shader floats control execution mode");
7004 unsigned mask, mode = elk_rnd_mode_from_nir(execution_mode, &mask);
7005
7006 if (mask == 0)
7007 return;
7008
7009 abld.emit(ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
7010 elk_imm_d(mode), elk_imm_d(mask));
7011 }
7012
7013 void
7014 nir_to_elk(elk_fs_visitor *s)
7015 {
7016 nir_to_elk_state ntb = {
7017 .s = *s,
7018 .nir = s->nir,
7019 .devinfo = s->devinfo,
7020 .mem_ctx = ralloc_context(NULL),
7021 .bld = fs_builder(s).at_end(),
7022 };
7023
7024 emit_shader_float_controls_execution_mode(ntb);
7025
7026 /* emit the arrays used for inputs and outputs - load/store intrinsics will
7027 * be converted to reads/writes of these arrays
7028 */
7029 fs_nir_setup_outputs(ntb);
7030 fs_nir_setup_uniforms(ntb.s);
7031 fs_nir_emit_system_values(ntb);
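   /* Scratch is allocated per channel, so scale NIR's per-invocation
    * scratch_size (aligned to 4 bytes) by the dispatch width.
    */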
7032 ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
7033
7034 fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
7035
7036 ntb.bld.emit(ELK_SHADER_OPCODE_HALT_TARGET);
7037
7038 ralloc_free(ntb.mem_ctx);
7039 }
7040
7041