/*
 * Copyright © 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"
#include "nir_builder.h"

/*
 * Wa_18019110168 for gfx 12.5.
 *
 * This file implements a workaround for a HW bug that makes the fragment
 * shader read incorrect per-primitive data if the mesh shader, in addition
 * to writing per-primitive data, also writes to gl_ClipDistance.
 *
 * The suggested solution to that bug is to stop using per-primitive data by:
 * - creating new vertices for provoking vertices shared by multiple primitives
 * - converting per-primitive attributes read by the fragment shader into flat
 *   per-vertex attributes of the provoking vertex
 * - modifying the fragment shader to read those per-vertex attributes
 *
 * There are at least 2 types of failures that are not handled very well:
 * - if the number of varying slots overflows, then only some attributes will
 *   be converted, leading to corruption of the unconverted attributes
 * - if the overall MUE size is so large it doesn't fit in the URB, then URB
 *   allocation will fail in some way; unfortunately at this point there's no
 *   good way to tell how big the MUE will be and back out
 *
 * This workaround needs to be applied before linking, so that unused outputs
 * created by this code are removed at link time.
 *
 * The workaround can be controlled by a driconf option that either disables
 * it, lowers its scope or force-enables it.
 *
 * Option "anv_mesh_conv_prim_attrs_to_vert_attrs" is evaluated like this:
 *   value == 0      - disable workaround
 *   value <  0      - enable ONLY if workaround is required
 *   value >  0      - enable ALWAYS, even if it's not required
 *   abs(value) >= 1 - attribute conversion
 *   abs(value) >= 2 - attribute conversion and vertex duplication
 *
 * Default: -2 (both parts of the workaround, ONLY if it's required)
 */
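
/*
 * For reference, the option can be overridden from drirc. An illustrative
 * (unverified) snippet that would force-enable both parts of the workaround
 * for every application:
 *
 *    <driconf>
 *       <device driver="anv">
 *          <application name="all">
 *             <option name="anv_mesh_conv_prim_attrs_to_vert_attrs" value="2" />
 *          </application>
 *       </device>
 *    </driconf>
 */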

static bool
anv_mesh_convert_attrs_prim_to_vert(struct nir_shader *nir,
                                    gl_varying_slot *wa_mapping,
                                    uint64_t fs_inputs,
                                    const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                    void *mem_ctx,
                                    const bool dup_vertices,
                                    const bool force_conversion)
{
   uint64_t per_primitive_outputs = nir->info.per_primitive_outputs;
   per_primitive_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);

   if (per_primitive_outputs == 0)
      return false;

   uint64_t outputs_written = nir->info.outputs_written;
   uint64_t other_outputs = outputs_written & ~per_primitive_outputs;

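   /* The HW bug only triggers when the mesh shader also writes gl_ClipDistance,
    * so unless the conversion is force-enabled there is nothing to do.
    */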
   if ((other_outputs & (VARYING_BIT_CLIP_DIST0 | VARYING_BIT_CLIP_DIST1)) == 0)
      if (!force_conversion)
         return false;

   uint64_t all_outputs = outputs_written;
   unsigned attrs = 0;

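   /* Per-primitive outputs read by the fragment shader get remapped to
    * per-vertex slots. VARYING_SLOT_CULL_PRIMITIVE stays per-primitive:
    * it is consumed by the hardware and is never read by the fragment shader.
    */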
   uint64_t remapped_outputs = outputs_written & per_primitive_outputs;
   remapped_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);

   /* Skip locations not read by the fragment shader, because they will
    * be eliminated at linking time. Note that some fs inputs may be
    * removed only after optimizations, so it's possible that we will
    * create too many variables.
    */
   remapped_outputs &= fs_inputs;

   /* Figure out the mapping between per-primitive and new per-vertex outputs. */
   nir_foreach_shader_out_variable(var, nir) {
      int location = var->data.location;

      if (!(BITFIELD64_BIT(location) & remapped_outputs))
         continue;

      /* Although primitive shading rate, layer and viewport have a predefined
       * place in the MUE Primitive Header (so we can't really move them
       * anywhere), we still have to copy them to per-vertex space if the
       * fragment shader reads them.
       */
      assert(location == VARYING_SLOT_PRIMITIVE_SHADING_RATE ||
             location == VARYING_SLOT_LAYER ||
             location == VARYING_SLOT_VIEWPORT ||
             location == VARYING_SLOT_PRIMITIVE_ID ||
             location >= VARYING_SLOT_VAR0);

      const struct glsl_type *type = var->type;
      if (nir_is_arrayed_io(var, MESA_SHADER_MESH) || var->data.per_view) {
         assert(glsl_type_is_array(type));
         type = glsl_get_array_element(type);
      }

      unsigned num_slots = glsl_count_attribute_slots(type, false);

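      /* Grab the first run of generic VARn slots that is wide enough and not
       * used by any existing output.
       */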
      for (gl_varying_slot slot = VARYING_SLOT_VAR0; slot <= VARYING_SLOT_VAR31; slot++) {
         uint64_t mask = BITFIELD64_MASK(num_slots) << slot;
         if ((all_outputs & mask) == 0) {
            wa_mapping[location] = slot;
            all_outputs |= mask;
            attrs++;
            break;
         }
      }

      if (wa_mapping[location] == 0) {
         fprintf(stderr, "Not enough space for hardware per-primitive data corruption workaround.\n");
         break;
      }
   }

   if (attrs == 0)
      if (!force_conversion)
         return false;

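   /* Determine which vertex of each primitive is the provoking one: the first
    * vertex by default, or the last one if VK_EXT_provoking_vertex requests it.
    */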
   unsigned provoking_vertex = 0;

   const VkPipelineRasterizationStateCreateInfo *rs_info = pCreateInfo->pRasterizationState;
   const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
   if (rs_pv_info && rs_pv_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
      provoking_vertex = 2;

   unsigned vertices_per_primitive =
      mesa_vertices_per_prim(nir->info.mesh.primitive_type);

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_builder b = nir_builder_at(nir_after_impl(impl));

   /* wait for all subgroups to finish */
   nir_barrier(&b, SCOPE_WORKGROUP);

   nir_def *zero = nir_imm_int(&b, 0);

   nir_def *local_invocation_index = nir_load_local_invocation_index(&b);

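   /* Only a single invocation (local index 0) performs the fix-up below, so
    * all reads and writes of the shared outputs happen serially after the
    * barrier.
    */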
   nir_def *cmp = nir_ieq(&b, local_invocation_index, zero);
   nir_if *if_stmt = nir_push_if(&b, cmp);
   {
      nir_variable *primitive_count_var = NULL;
      nir_variable *primitive_indices_var = NULL;

      unsigned num_other_variables = 0;
      nir_foreach_shader_out_variable(var, b.shader) {
         if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0)
            continue;
         num_other_variables++;
      }

      nir_deref_instr **per_vertex_derefs =
         ralloc_array(mem_ctx, nir_deref_instr *, num_other_variables);

      unsigned num_per_vertex_variables = 0;

      unsigned processed = 0;
      nir_foreach_shader_out_variable(var, b.shader) {
         if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0)
            continue;

         switch (var->data.location) {
         case VARYING_SLOT_PRIMITIVE_COUNT:
            primitive_count_var = var;
            break;
         case VARYING_SLOT_PRIMITIVE_INDICES:
            primitive_indices_var = var;
            break;
         default: {
            const struct glsl_type *type = var->type;
            assert(glsl_type_is_array(type));
            const struct glsl_type *array_element_type =
               glsl_get_array_element(type);

            if (dup_vertices) {
               /* Resize the array output's type to make space for one extra
                * vertex per primitive, so that a provoking vertex never has
                * to be shared between primitives.
                */
               const struct glsl_type *new_type =
                  glsl_array_type(array_element_type,
                                  glsl_get_length(type) +
                                     nir->info.mesh.max_primitives_out,
                                  0);

               var->type = new_type;
            }

            per_vertex_derefs[num_per_vertex_variables++] =
               nir_build_deref_var(&b, var);
            break;
         }
         }

         ++processed;
      }
      assert(processed == num_other_variables);

      assert(primitive_count_var != NULL);
      assert(primitive_indices_var != NULL);

      /* Update types of derefs to match type of variables they (de)reference. */
      if (dup_vertices) {
         nir_foreach_function_impl(impl, b.shader) {
            nir_foreach_block(block, impl) {
               nir_foreach_instr(instr, block) {
                  if (instr->type != nir_instr_type_deref)
                     continue;

                  nir_deref_instr *deref = nir_instr_as_deref(instr);
                  if (deref->deref_type != nir_deref_type_var)
                     continue;

                  if (deref->var->type != deref->type)
                     deref->type = deref->var->type;
               }
            }
         }
      }

      /* indexed by slot of per-prim attribute */
      struct {
         nir_deref_instr *per_prim_deref;
         nir_deref_instr *per_vert_deref;
      } mapping[VARYING_SLOT_MAX] = {{NULL, NULL}, };

      /* Create new per-vertex output variables mirroring per-primitive variables
       * and create derefs for both old and new variables.
       */
      nir_foreach_shader_out_variable(var, b.shader) {
         gl_varying_slot location = var->data.location;

         if ((BITFIELD64_BIT(location) & (outputs_written & per_primitive_outputs)) == 0)
            continue;
         if (wa_mapping[location] == 0)
            continue;

         const struct glsl_type *type = var->type;
         assert(glsl_type_is_array(type));
         const struct glsl_type *array_element_type = glsl_get_array_element(type);

         const struct glsl_type *new_type =
            glsl_array_type(array_element_type,
                            nir->info.mesh.max_vertices_out +
                               (dup_vertices ? nir->info.mesh.max_primitives_out : 0),
                            0);

         nir_variable *new_var =
            nir_variable_create(b.shader, nir_var_shader_out, new_type, var->name);
         assert(wa_mapping[location] >= VARYING_SLOT_VAR0);
         assert(wa_mapping[location] <= VARYING_SLOT_VAR31);
         new_var->data.location = wa_mapping[location];
         new_var->data.interpolation = INTERP_MODE_FLAT;

         mapping[location].per_vert_deref = nir_build_deref_var(&b, new_var);
         mapping[location].per_prim_deref = nir_build_deref_var(&b, var);
      }

      nir_def *trueconst = nir_imm_true(&b);

      /*
       * for each Primitive (0 : primitiveCount)
       *    if VertexUsed[PrimitiveIndices[Primitive][provoking vertex]]
       *       create 1 new vertex at offset "Vertex"
       *       copy per vert attributes of provoking vertex to the new one
       *       update PrimitiveIndices[Primitive][provoking vertex]
       *       Vertex++
       *    else
       *       VertexUsed[PrimitiveIndices[Primitive][provoking vertex]] := true
       *
       *    for each attribute : mapping
       *       copy per_prim_attr(Primitive) to per_vert_attr[Primitive][provoking vertex]
       */

      /* primitive count */
      nir_def *primitive_count = nir_load_var(&b, primitive_count_var);

      /* primitive index */
      nir_variable *primitive_var =
         nir_local_variable_create(impl, glsl_uint_type(), "Primitive");
      nir_deref_instr *primitive_deref = nir_build_deref_var(&b, primitive_var);
      nir_store_deref(&b, primitive_deref, zero, 1);

      /* vertex index */
      nir_variable *vertex_var =
         nir_local_variable_create(impl, glsl_uint_type(), "Vertex");
      nir_deref_instr *vertex_deref = nir_build_deref_var(&b, vertex_var);
      nir_store_deref(&b, vertex_deref, nir_imm_int(&b, nir->info.mesh.max_vertices_out), 1);

      /* used vertices bitvector */
      const struct glsl_type *used_vertex_type =
         glsl_array_type(glsl_bool_type(),
                         nir->info.mesh.max_vertices_out,
                         0);
      nir_variable *used_vertex_var =
         nir_local_variable_create(impl, used_vertex_type, "VertexUsed");
      nir_deref_instr *used_vertex_deref =
         nir_build_deref_var(&b, used_vertex_var);
      /* Initialize it as "not used" */
      for (unsigned i = 0; i < nir->info.mesh.max_vertices_out; ++i) {
         nir_deref_instr *indexed_used_vertex_deref =
            nir_build_deref_array(&b, used_vertex_deref, nir_imm_int(&b, i));
         nir_store_deref(&b, indexed_used_vertex_deref, nir_imm_false(&b), 1);
      }

      nir_loop *loop = nir_push_loop(&b);
      {
         nir_def *primitive = nir_load_deref(&b, primitive_deref);
         nir_def *cmp = nir_ige(&b, primitive, primitive_count);

         nir_if *loop_check = nir_push_if(&b, cmp);
         nir_jump(&b, nir_jump_break);
         nir_pop_if(&b, loop_check);

         nir_deref_instr *primitive_indices_deref =
            nir_build_deref_var(&b, primitive_indices_var);
         nir_deref_instr *indexed_primitive_indices_deref;
         nir_def *src_vertex;
         nir_def *prim_indices;

         /* PrimitiveIndices is an array of vectors: index the array by the
          * primitive first, then extract the provoking vertex's channel.
          */
         indexed_primitive_indices_deref = nir_build_deref_array(&b, primitive_indices_deref, primitive);
         prim_indices = nir_load_deref(&b, indexed_primitive_indices_deref);
         src_vertex = nir_channel(&b, prim_indices, provoking_vertex);

         nir_def *dst_vertex = nir_load_deref(&b, vertex_deref);

         nir_deref_instr *indexed_used_vertex_deref =
            nir_build_deref_array(&b, used_vertex_deref, src_vertex);
         nir_def *used_vertex = nir_load_deref(&b, indexed_used_vertex_deref);
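         /* Without vertex duplication, pretend the provoking vertex has not
          * been used yet, so the else-branch below always runs and simply
          * overwrites that vertex's attributes in place.
          */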
         if (!dup_vertices)
            used_vertex = nir_imm_false(&b);

         nir_if *vertex_used_check = nir_push_if(&b, used_vertex);
         {
            for (unsigned a = 0; a < num_per_vertex_variables; ++a) {
               nir_deref_instr *attr_arr = per_vertex_derefs[a];
               nir_deref_instr *src = nir_build_deref_array(&b, attr_arr, src_vertex);
               nir_deref_instr *dst = nir_build_deref_array(&b, attr_arr, dst_vertex);

               nir_copy_deref(&b, dst, src);
            }

            /* replace one component of primitive indices vector */
            nir_def *new_val =
               nir_vector_insert_imm(&b, prim_indices, dst_vertex, provoking_vertex);

            /* and store complete vector */
            nir_store_deref(&b, indexed_primitive_indices_deref, new_val,
                            BITFIELD_MASK(vertices_per_primitive));

            nir_store_deref(&b, vertex_deref, nir_iadd_imm(&b, dst_vertex, 1), 1);

            for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) {
               if (!mapping[i].per_vert_deref)
                  continue;

               nir_deref_instr *src =
                  nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive);
               nir_deref_instr *dst =
                  nir_build_deref_array(&b, mapping[i].per_vert_deref, dst_vertex);

               nir_copy_deref(&b, dst, src);
            }
         }
         nir_push_else(&b, vertex_used_check);
         {
            nir_store_deref(&b, indexed_used_vertex_deref, trueconst, 1);

            for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) {
               if (!mapping[i].per_vert_deref)
                  continue;

               nir_deref_instr *src =
                  nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive);
               nir_deref_instr *dst =
                  nir_build_deref_array(&b, mapping[i].per_vert_deref, src_vertex);

               nir_copy_deref(&b, dst, src);
            }
         }
         nir_pop_if(&b, vertex_used_check);

         nir_store_deref(&b, primitive_deref, nir_iadd_imm(&b, primitive, 1), 1);
      }
      nir_pop_loop(&b, loop);
   }
   nir_pop_if(&b, if_stmt); /* local_invocation_index == 0 */

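   /* Vertex duplication can emit up to one extra vertex per primitive, so grow
    * the declared maximum vertex count to match the resized output arrays.
    */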
   if (dup_vertices)
      nir->info.mesh.max_vertices_out += nir->info.mesh.max_primitives_out;

   if (should_print_nir(nir)) {
      printf("%s\n", __func__);
      nir_print_shader(nir, stdout);
   }

   /* deal with copy_derefs */
   NIR_PASS(_, nir, nir_split_var_copies);
   NIR_PASS(_, nir, nir_lower_var_copies);

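   /* Recompute nir->info (outputs_written etc.) to account for the per-vertex
    * variables added above.
    */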
   nir_shader_gather_info(nir, impl);

   return true;
}

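/* Instruction callback: rewrite any deref of a fragment shader input that was
 * remapped so that it points at the deref of the new flat per-vertex input
 * instead.
 */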
static bool
anv_frag_update_derefs_instr(struct nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_deref)
      return false;

   nir_deref_instr *deref = nir_instr_as_deref(instr);
   if (deref->deref_type != nir_deref_type_var)
      return false;

   nir_variable *var = deref->var;
   if (!(var->data.mode & nir_var_shader_in))
      return false;

   int location = var->data.location;
   nir_deref_instr **new_derefs = (nir_deref_instr **)data;
   if (new_derefs[location] == NULL)
      return false;

   nir_instr_remove(&deref->instr);
   nir_def_rewrite_uses(&deref->def, &new_derefs[location]->def);

   return true;
}

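/* Run the deref rewrite above over every instruction in the fragment shader. */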
static bool
anv_frag_update_derefs(nir_shader *shader, nir_deref_instr **mapping)
{
   return nir_shader_instructions_pass(shader, anv_frag_update_derefs_instr,
                                       nir_metadata_none, (void *)mapping);
}

/* Update fragment shader inputs with new ones. */
static void
anv_frag_convert_attrs_prim_to_vert(struct nir_shader *nir,
                                    gl_varying_slot *wa_mapping)
{
   /* indexed by slot of per-prim attribute */
   nir_deref_instr *new_derefs[VARYING_SLOT_MAX] = {NULL, };

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_builder b = nir_builder_at(nir_before_impl(impl));

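   /* For every fragment shader input that was moved by the mesh-side pass,
    * create a new flat input variable at the remapped location; all derefs of
    * the original variable are redirected to it by anv_frag_update_derefs().
    */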
   nir_foreach_shader_in_variable_safe(var, nir) {
      gl_varying_slot location = var->data.location;
      gl_varying_slot new_location = wa_mapping[location];
      if (new_location == 0)
         continue;

      assert(wa_mapping[new_location] == 0);

      nir_variable *new_var =
         nir_variable_create(b.shader, nir_var_shader_in, var->type, var->name);
      new_var->data.location = new_location;
      new_var->data.location_frac = var->data.location_frac;
      new_var->data.interpolation = INTERP_MODE_FLAT;

      new_derefs[location] = nir_build_deref_var(&b, new_var);
   }

   NIR_PASS(_, nir, anv_frag_update_derefs, new_derefs);

   nir_shader_gather_info(nir, impl);
}

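/* Entry point for Wa_18019110168: evaluate the driconf option, skip the
 * workaround on devices that don't need it (unless force-enabled), and apply
 * the mesh-side and fragment-side rewrites using a shared wa_mapping table.
 */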
void
anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir,
                           struct nir_shader *fs_nir,
                           struct anv_device *device,
                           const VkGraphicsPipelineCreateInfo *info)
{
   const struct intel_device_info *devinfo = device->info;

   int mesh_conv_prim_attrs_to_vert_attrs =
      device->physical->instance->mesh_conv_prim_attrs_to_vert_attrs;
   if (mesh_conv_prim_attrs_to_vert_attrs < 0 &&
       !intel_needs_workaround(devinfo, 18019110168))
      mesh_conv_prim_attrs_to_vert_attrs = 0;

   if (mesh_conv_prim_attrs_to_vert_attrs != 0) {
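      /* Only attributes actually read by the fragment shader need to be
       * remapped; anything else would be eliminated at link time anyway.
       */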
      uint64_t fs_inputs = 0;
      nir_foreach_shader_in_variable(var, fs_nir)
         fs_inputs |= BITFIELD64_BIT(var->data.location);

      void *stage_ctx = ralloc_context(NULL);

      gl_varying_slot wa_mapping[VARYING_SLOT_MAX] = { 0, };

      const bool dup_vertices = abs(mesh_conv_prim_attrs_to_vert_attrs) >= 2;
      const bool force_conversion = mesh_conv_prim_attrs_to_vert_attrs > 0;

      if (anv_mesh_convert_attrs_prim_to_vert(ms_nir, wa_mapping,
                                              fs_inputs, info, stage_ctx,
                                              dup_vertices, force_conversion))
         anv_frag_convert_attrs_prim_to_vert(fs_nir, wa_mapping);

      ralloc_free(stage_ctx);
   }
}