/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_cmd_buffer.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nir.h"
#include "nir_builder.h"
#include "nir_xfb_info.h"

#include "nv50_ir_driver.h"
#include "pipe/p_defines.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_memory.h"

#include "nv_push_cl9097.h"

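/* Debug and optimization knobs for the nv50_ir codegen backend.  These read
 * the same NV50_PROG_DEBUG / NV50_PROG_OPTIMIZE environment variables used
 * by the gallium nouveau drivers.
 */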
uint64_t
nvk_cg_get_prog_debug(void)
{
   return debug_get_num_option("NV50_PROG_DEBUG", 0);
}

uint64_t
nvk_cg_get_prog_optimize(void)
{
   return debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
}

const nir_shader_compiler_options *
nvk_cg_nir_options(const struct nvk_physical_device *pdev,
                   gl_shader_stage stage)
{
   return nv50_ir_nir_shader_compiler_options(pdev->info.chipset, stage);
}

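/* Returns the shader input variable at the given location, creating it if it
 * does not exist yet.  Integer inputs must not be interpolated, so they are
 * marked flat; everything else created here defaults to no-perspective.
 */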
static nir_variable *
find_or_create_input(nir_builder *b, const struct glsl_type *type,
                     const char *name, unsigned location)
{
   nir_foreach_shader_in_variable(in, b->shader) {
      if (in->data.location == location)
         return in;
   }
   nir_variable *in = nir_variable_create(b->shader, nir_var_shader_in,
                                          type, name);
   in->data.location = location;
   if (glsl_type_is_integer(type))
      in->data.interpolation = INTERP_MODE_FLAT;
   else
      in->data.interpolation = INTERP_MODE_NOPERSPECTIVE;

   return in;
}

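/* This codegen path consumes gl_FragCoord, gl_PointCoord and gl_Layer as
 * ordinary input variables rather than as sysval intrinsics, so rewrite
 * those loads here.  gl_SamplePosition is derived as fract(gl_FragCoord.xy),
 * which presumably relies on gl_FragCoord being interpolated per-sample.
 */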
static bool
lower_fragcoord_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
{
   assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
   nir_variable *var;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *val;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_frag_coord:
      var = find_or_create_input(b, glsl_vec4_type(),
                                 "gl_FragCoord",
                                 VARYING_SLOT_POS);
      val = nir_load_var(b, var);
      break;
   case nir_intrinsic_load_point_coord:
      var = find_or_create_input(b, glsl_vector_type(GLSL_TYPE_FLOAT, 2),
                                 "gl_PointCoord",
                                 VARYING_SLOT_PNTC);
      val = nir_load_var(b, var);
      break;
   case nir_intrinsic_load_sample_pos:
      var = find_or_create_input(b, glsl_vec4_type(),
                                 "gl_FragCoord",
                                 VARYING_SLOT_POS);
      val = nir_ffract(b, nir_trim_vector(b, nir_load_var(b, var), 2));
      break;
   case nir_intrinsic_load_layer_id:
      var = find_or_create_input(b, glsl_int_type(),
                                 "gl_Layer", VARYING_SLOT_LAYER);
      val = nir_load_var(b, var);
      break;

   default:
      return false;
   }

   nir_def_rewrite_uses(&intrin->def, val);

   return true;
}

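/* One-time NIR preparation run on every shader before optimization: split
 * and SSA-ify variables, lower system values, and (for fragment shaders)
 * rewrite frag-coord-style intrinsics into input variable loads.
 */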
void
nvk_cg_preprocess_nir(nir_shader *nir)
{
   NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);
   NIR_PASS(_, nir, nir_lower_vars_to_ssa);

   NIR_PASS(_, nir, nir_split_var_copies);
   NIR_PASS(_, nir, nir_lower_vars_to_ssa);

   NIR_PASS(_, nir, nir_lower_global_vars_to_local);
   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS(_, nir, nir_lower_system_values);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_shader_instructions_pass, lower_fragcoord_instr,
               nir_metadata_control_flow, NULL);
   }

   nvk_cg_optimize_nir(nir);

   NIR_PASS(_, nir, nir_lower_var_copies);
}

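/* Standard NIR optimization loop, run until no pass makes further progress.
 * A final shrink/dead-variable pass cleans up whatever the loop left behind.
 */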
void
nvk_cg_optimize_nir(nir_shader *nir)
{
   bool progress;

   do {
      progress = false;

      NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
      NIR_PASS(progress, nir, nir_shrink_vec_array_vars, nir_var_function_temp);

      if (!nir->info.var_copies_lowered) {
         /* Only run this pass if nir_lower_var_copies was not called
          * yet. That would lower away any copy_deref instructions and we
          * don't want to introduce any more.
          */
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);
      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      if (nir_opt_loop(nir)) {
         progress = true;
         NIR_PASS(progress, nir, nir_copy_prop);
         NIR_PASS(progress, nir, nir_opt_remove_phis);
         NIR_PASS(progress, nir, nir_opt_dce);
      }
      NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_cse);
      /* A larger select limit should be fine here and the failures are
       * likely a backend problem, but with it a bunch of tessellation
       * shaders blow up.  Revisit this once NAK is merged.
       */
      NIR_PASS(progress, nir, nir_opt_peephole_select, 2, true, true);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_opt_algebraic);

      NIR_PASS(progress, nir, nir_opt_undef);

      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }
   } while (progress);

   NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
   NIR_PASS(progress, nir, nir_remove_dead_variables,
            nir_var_function_temp | nir_var_shader_in | nir_var_shader_out,
            NULL);
}

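/* Lower image_deref_size to a txs texture size query on the same deref.
 * Presumably this works because NVK binds images with texture-compatible
 * descriptors; cube images additionally need the fix-ups below.
 */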
static bool
lower_image_size_to_txs(nir_builder *b, nir_intrinsic_instr *intrin,
                        UNUSED void *_data)
{
   if (intrin->intrinsic != nir_intrinsic_image_deref_size)
      return false;

   b->cursor = nir_instr_remove(&intrin->instr);

   nir_deref_instr *img = nir_src_as_deref(intrin->src[0]);
   nir_def *lod = nir_tex_type_has_lod(img->type) ?
                  intrin->src[1].ssa : NULL;
   nir_def *size = nir_txs_deref(b, img, lod);

   if (glsl_get_sampler_dim(img->type) == GLSL_SAMPLER_DIM_CUBE) {
      /* Cube image descriptors are set up as simple arrays but SPIR-V wants
       * the number of cubes.
       */
      if (glsl_sampler_type_is_array(img->type)) {
         size = nir_vec3(b, nir_channel(b, size, 0),
                            nir_channel(b, size, 1),
                            nir_udiv_imm(b, nir_channel(b, size, 2), 6));
      } else {
         size = nir_vec3(b, nir_channel(b, size, 0),
                            nir_channel(b, size, 1),
                            nir_imm_int(b, 1));
      }
   }

   nir_def_rewrite_uses(&intrin->def, size);

   return true;
}

static int
count_location_slots(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

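/* Assign driver_locations for shader inputs and outputs.  Vertex shader
 * inputs map directly onto their vertex attribute index; everything else is
 * packed linearly in declaration order, counting one slot per vec4
 * (per-vertex arrayed I/O is counted by its element type).
 */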
static void
assign_io_locations(nir_shader *nir)
{
   if (nir->info.stage != MESA_SHADER_VERTEX) {
      unsigned location = 0;
      nir_foreach_variable_with_modes(var, nir, nir_var_shader_in) {
         var->data.driver_location = location;
         if (nir_is_arrayed_io(var, nir->info.stage)) {
            location += glsl_count_attribute_slots(glsl_get_array_element(var->type), false);
         } else {
            location += glsl_count_attribute_slots(var->type, false);
         }
      }
      nir->num_inputs = location;
   } else {
      nir_foreach_shader_in_variable(var, nir) {
         assert(var->data.location >= VERT_ATTRIB_GENERIC0);
         var->data.driver_location = var->data.location - VERT_ATTRIB_GENERIC0;
      }
   }

   {
      unsigned location = 0;
      nir_foreach_variable_with_modes(var, nir, nir_var_shader_out) {
         var->data.driver_location = location;
         if (nir_is_arrayed_io(var, nir->info.stage)) {
            location += glsl_count_attribute_slots(glsl_get_array_element(var->type), false);
         } else {
            location += glsl_count_attribute_slots(var->type, false);
         }
      }
      nir->num_outputs = location;
   }
}

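/* Final NIR lowering before handing the shader to nv50_ir: image size
 * queries, indirect temporaries, int64, and I/O location assignment for
 * non-compute stages.
 */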
static void
nak_cg_postprocess_nir(nir_shader *nir)
{
   NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_image_size_to_txs,
            nir_metadata_control_flow, NULL);

   uint32_t indirect_mask = nir_var_function_temp;

   NIR_PASS(_, nir, nir_lower_indirect_derefs, indirect_mask, 16);

   nvk_cg_optimize_nir(nir);
   if (nir->info.stage != MESA_SHADER_COMPUTE)
      assign_io_locations(nir);

   NIR_PASS(_, nir, nir_lower_int64);

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
}

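/* The tables below map TGSI semantic names to byte addresses in the
 * hardware's per-vertex attribute space, matching the layout used by the
 * nvc0 gallium driver.  Slot numbers used elsewhere in this file are these
 * addresses divided by 4 (i.e. in 32-bit units).
 */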
/* NOTE: Using a[0x270] in FP may cause an error even if we're using less than
 * 124 scalar varying values.
 */
static uint32_t
nvc0_shader_input_address(unsigned sn, unsigned si)
{
   switch (sn) {
   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
   case TGSI_SEMANTIC_PRIMID:        return 0x060;
   case TGSI_SEMANTIC_LAYER:         return 0x064;
   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   case TGSI_SEMANTIC_PSIZE:         return 0x06c;
   case TGSI_SEMANTIC_POSITION:      return 0x070;
   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
   case TGSI_SEMANTIC_FOG:           return 0x2e8;
   case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
   case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
   case TGSI_SEMANTIC_PCOORD:        return 0x2e0;
   case TGSI_SEMANTIC_TESSCOORD:     return 0x2f0;
   case TGSI_SEMANTIC_INSTANCEID:    return 0x2f8;
   case TGSI_SEMANTIC_VERTEXID:      return 0x2fc;
   case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
   default:
      assert(!"invalid TGSI input semantic");
      return ~0;
   }
}

static uint32_t
nvc0_shader_output_address(unsigned sn, unsigned si)
{
   switch (sn) {
   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
   case TGSI_SEMANTIC_PRIMID:        return 0x060;
   case TGSI_SEMANTIC_LAYER:         return 0x064;
   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   case TGSI_SEMANTIC_PSIZE:         return 0x06c;
   case TGSI_SEMANTIC_POSITION:      return 0x070;
   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
   case TGSI_SEMANTIC_FOG:           return 0x2e8;
   case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
   case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
   case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
   case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0;
   case TGSI_SEMANTIC_EDGEFLAG:      return ~0;
   default:
      assert(!"invalid TGSI output semantic");
      return ~0;
   }
}

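/* Vertex shader inputs are packed into consecutive generic slots starting at
 * 0x080, except for instance/vertex IDs which keep their fixed sysval
 * addresses.
 */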
static int
nvc0_vp_assign_input_slots(struct nv50_ir_prog_info_out *info)
{
   unsigned i, c, n;

   for (n = 0, i = 0; i < info->numInputs; ++i) {
      switch (info->in[i].sn) {
      case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
      case TGSI_SEMANTIC_VERTEXID:
         info->in[i].mask = 0x1;
         info->in[i].slot[0] =
            nvc0_shader_input_address(info->in[i].sn, 0) / 4;
         continue;
      default:
         break;
      }
      for (c = 0; c < 4; ++c)
         info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
      ++n;
   }

   return 0;
}

static int
nvc0_sp_assign_input_slots(struct nv50_ir_prog_info_out *info)
{
   unsigned offset;
   unsigned i, c;

   for (i = 0; i < info->numInputs; ++i) {
      offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);

      for (c = 0; c < 4; ++c)
         info->in[i].slot[c] = (offset + c * 0x4) / 4;
   }

   return 0;
}

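/* Fragment shader outputs: color results occupy the first output registers
 * in render-target order (skipping unused attachments), followed by the
 * sample mask and then frag depth, which appears to be the register layout
 * the hardware expects for FS outputs.
 */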
static int
nvc0_fp_assign_output_slots(struct nv50_ir_prog_info_out *info)
{
   unsigned count = info->prop.fp.numColourResults * 4;
   unsigned i, c;

   /* Compute the relative position of each color output, since skipped MRT
    * positions will not have registers allocated to them.
    */
   unsigned colors[8] = {0};
   for (i = 0; i < info->numOutputs; ++i)
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         colors[info->out[i].si] = 1;
   for (i = 0, c = 0; i < 8; i++)
      if (colors[i])
         colors[i] = c++;
   for (i = 0; i < info->numOutputs; ++i)
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         for (c = 0; c < 4; ++c)
            info->out[i].slot[c] = colors[info->out[i].si] * 4 + c;

   if (info->io.sampleMask < NV50_CODEGEN_MAX_VARYINGS)
      info->out[info->io.sampleMask].slot[0] = count++;
   else
   if (info->target >= 0xe0)
      count++; /* on Kepler, depth is always last colour reg + 2 */

   if (info->io.fragDepth < NV50_CODEGEN_MAX_VARYINGS)
      info->out[info->io.fragDepth].slot[2] = count;

   return 0;
}

static int
nvc0_sp_assign_output_slots(struct nv50_ir_prog_info_out *info)
{
   unsigned offset;
   unsigned i, c;

   for (i = 0; i < info->numOutputs; ++i) {
      offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);

      for (c = 0; c < 4; ++c)
         info->out[i].slot[c] = (offset + c * 0x4) / 4;
   }

   return 0;
}

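/* Installed as info->assignSlots so that codegen asks us for the final slot
 * assignment of every input and output.
 */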
static int
nvc0_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)
{
   int ret;

   if (info->type == PIPE_SHADER_VERTEX)
      ret = nvc0_vp_assign_input_slots(info);
   else
      ret = nvc0_sp_assign_input_slots(info);
   if (ret)
      return ret;

   if (info->type == PIPE_SHADER_FRAGMENT)
      ret = nvc0_fp_assign_output_slots(info);
   else
      ret = nvc0_sp_assign_output_slots(info);
   return ret;
}

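/* hdr[4] of the vertex/tess/geometry shader header holds the min/max
 * parallel output read addresses (see the 0xff000 initializers below);
 * widen that range to include the given slot.
 */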
static inline void
nvk_vtgs_hdr_update_oread(struct nvk_shader *vs, uint8_t slot)
{
   uint8_t min = (vs->info.hdr[4] >> 12) & 0xff;
   uint8_t max = (vs->info.hdr[4] >> 24);

   min = MIN2(min, slot);
   max = MAX2(max, slot);

   vs->info.hdr[4] = (max << 24) | (min << 12);
}

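/* Common shader program header setup for the vertex, tessellation, and
 * geometry stages: mark which input and output components are read and
 * written, flag the system values in use, and record layer/clip/cull
 * information for later 3D state setup.
 */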
static int
nvk_vtgp_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
{
   unsigned i, c, a;

   for (i = 0; i < info->numInputs; ++i) {
      if (info->in[i].patch)
         continue;
      for (c = 0; c < 4; ++c) {
         a = info->in[i].slot[c];
         if (info->in[i].mask & (1 << c))
            vs->info.hdr[5 + a / 32] |= 1 << (a % 32);
      }
   }

   for (i = 0; i < info->numOutputs; ++i) {
      if (info->out[i].patch)
         continue;
      for (c = 0; c < 4; ++c) {
         if (!(info->out[i].mask & (1 << c)))
            continue;
         assert(info->out[i].slot[c] >= 0x40 / 4);
         a = info->out[i].slot[c] - 0x40 / 4;
         vs->info.hdr[13 + a / 32] |= 1 << (a % 32);
         if (info->out[i].oread)
            nvk_vtgs_hdr_update_oread(vs, info->out[i].slot[c]);
      }
   }

   for (i = 0; i < info->numSysVals; ++i) {
      switch (info->sv[i].sn) {
      case SYSTEM_VALUE_PRIMITIVE_ID:
         vs->info.hdr[5] |= 1 << 24;
         break;
      case SYSTEM_VALUE_INSTANCE_ID:
         vs->info.hdr[10] |= 1 << 30;
         break;
      case SYSTEM_VALUE_VERTEX_ID:
         vs->info.hdr[10] |= 1 << 31;
         break;
      case SYSTEM_VALUE_TESS_COORD:
         /* We don't have the mask, nor the slots populated. While this could
          * be achieved, the vast majority of the time if either of the coords
          * are read, then both will be read.
          */
         nvk_vtgs_hdr_update_oread(vs, 0x2f0 / 4);
         nvk_vtgs_hdr_update_oread(vs, 0x2f4 / 4);
         break;
      default:
         break;
      }
   }

   vs->info.vtg.writes_layer = (vs->info.hdr[13] & (1 << 9)) != 0;
   vs->info.vtg.clip_enable = (1 << info->io.clipDistances) - 1;
   vs->info.vtg.cull_enable =
      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;

   return 0;
}

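/* Per-stage headers below: the low bits of hdr[0] select the header type and
 * bits 10+ appear to encode the stage (1 = VS, 2 = TCS, 3 = TES, 4 = GS,
 * 5 = FS).  hdr[4] = 0xff000 resets the min/max output read range updated by
 * nvk_vtgs_hdr_update_oread().
 */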
static int
nvk_vs_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
{
   vs->info.hdr[0] = 0x20061 | (1 << 10);
   vs->info.hdr[4] = 0xff000;

   return nvk_vtgp_gen_header(vs, info);
}

static int
nvk_gs_gen_header(struct nvk_shader *gs,
                  const struct nir_shader *nir,
                  struct nv50_ir_prog_info_out *info)
{
   gs->info.hdr[0] = 0x20061 | (4 << 10);

   gs->info.hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;

   switch (info->prop.gp.outputPrim) {
   case MESA_PRIM_POINTS:
      gs->info.hdr[3] = 0x01000000;
      break;
   case MESA_PRIM_LINE_STRIP:
      gs->info.hdr[3] = 0x06000000;
      break;
   case MESA_PRIM_TRIANGLE_STRIP:
      gs->info.hdr[3] = 0x07000000;
      break;
   default:
      assert(0);
      break;
   }

   gs->info.hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);

   gs->info.hdr[0] |= nir->info.gs.active_stream_mask << 28;

   return nvk_vtgp_gen_header(gs, info);
}

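/* Translate the codegen tessellation properties (domain, spacing, winding,
 * point mode) into the NAK_TS_* enums stored in shader->info.ts.
 */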
static void
nvk_generate_tessellation_parameters(const struct nv50_ir_prog_info_out *info,
                                     struct nvk_shader *shader)
{
   // TODO: this is a little confusing because nouveau codegen uses
   // MESA_PRIM_POINTS for unspecified domain and
   // MESA_PRIM_POINTS = 0, the same as NV9097 ISOLINE enum
   switch (info->prop.tp.domain) {
   case MESA_PRIM_LINES:
      shader->info.ts.domain = NAK_TS_DOMAIN_ISOLINE;
      break;
   case MESA_PRIM_TRIANGLES:
      shader->info.ts.domain = NAK_TS_DOMAIN_TRIANGLE;
      break;
   case MESA_PRIM_QUADS:
      shader->info.ts.domain = NAK_TS_DOMAIN_QUAD;
      break;
   default:
      return;
   }

   switch (info->prop.tp.partitioning) {
   case PIPE_TESS_SPACING_EQUAL:
      shader->info.ts.spacing = NAK_TS_SPACING_INTEGER;
      break;
   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
      shader->info.ts.spacing = NAK_TS_SPACING_FRACT_ODD;
      break;
   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
      shader->info.ts.spacing = NAK_TS_SPACING_FRACT_EVEN;
      break;
   default:
      assert(!"invalid tessellator partitioning");
      break;
   }

   if (info->prop.tp.outputPrim == MESA_PRIM_POINTS) { // point_mode
      shader->info.ts.prims = NAK_TS_PRIMS_POINTS;
   } else if (info->prop.tp.domain == MESA_PRIM_LINES) { // isoline domain
      shader->info.ts.prims = NAK_TS_PRIMS_LINES;
   } else { // triangle/quad domain
      if (info->prop.tp.winding > 0) {
         shader->info.ts.prims = NAK_TS_PRIMS_TRIANGLES_CW;
      } else {
         shader->info.ts.prims = NAK_TS_PRIMS_TRIANGLES_CCW;
      }
   }
}

static int
nvk_tcs_gen_header(struct nvk_shader *tcs, struct nv50_ir_prog_info_out *info)
{
   unsigned opcs = 6; /* output patch constants (at least the TessFactors) */

   if (info->numPatchConstants)
      opcs = 8 + info->numPatchConstants * 4;

   tcs->info.hdr[0] = 0x20061 | (2 << 10);

   tcs->info.hdr[1] = opcs << 24;
   tcs->info.hdr[2] = info->prop.tp.outputPatchSize << 24;

   tcs->info.hdr[4] = 0xff000; /* initial min/max parallel output read address */

   nvk_vtgp_gen_header(tcs, info);

   if (info->target >= NVISA_GM107_CHIPSET) {
      /* On GM107+, the number of output patch components has moved in the TCP
       * header, but it seems like the blob still also uses the old position.
       * Also, the high 8 bits sit in between the min/max parallel fields and
       * have to be set after updating the outputs.
       */
      tcs->info.hdr[3] = (opcs & 0x0f) << 28;
      tcs->info.hdr[4] |= (opcs & 0xf0) << 16;
   }

   nvk_generate_tessellation_parameters(info, tcs);

   return 0;
}

static int
nvk_tes_gen_header(struct nvk_shader *tes, struct nv50_ir_prog_info_out *info)
{
   tes->info.hdr[0] = 0x20061 | (3 << 10);
   tes->info.hdr[4] = 0xff000;

   nvk_vtgp_gen_header(tes, info);

   nvk_generate_tessellation_parameters(info, tes);

   tes->info.hdr[18] |= 0x3 << 12; /* ? */

   return 0;
}

#define NVC0_INTERP_FLAT          (1 << 0)
#define NVC0_INTERP_PERSPECTIVE   (2 << 0)
#define NVC0_INTERP_LINEAR        (3 << 0)
#define NVC0_INTERP_CENTROID      (1 << 2)

static uint8_t
nvk_hdr_interp_mode(const struct nv50_ir_varying *var)
{
   if (var->linear)
      return NVC0_INTERP_LINEAR;
   if (var->flat)
      return NVC0_INTERP_FLAT;
   return NVC0_INTERP_PERSPECTIVE;
}


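/* Fragment shader program header: discard/depth/sample-mask flags, a 2-bit
 * interpolation mode per enabled component for generic varyings (single
 * enable bits for the fixed sysval and clip-distance ranges), and a 4-bit
 * component write mask per color target in hdr[18].
 */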
static int
nvk_fs_gen_header(struct nvk_shader *fs, const struct nak_fs_key *key,
                  struct nv50_ir_prog_info_out *info)
{
   unsigned i, c, a, m;

   /* just 00062 on Kepler */
   fs->info.hdr[0] = 0x20062 | (5 << 10);
   fs->info.hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */

   if (info->prop.fp.usesDiscard || key->zs_self_dep)
      fs->info.hdr[0] |= 0x8000;
   if (!info->prop.fp.separateFragData)
      fs->info.hdr[0] |= 0x4000;
   if (info->io.sampleMask < 80 /* PIPE_MAX_SHADER_OUTPUTS */)
      fs->info.hdr[19] |= 0x1;
   if (info->prop.fp.writesDepth) {
      fs->info.hdr[19] |= 0x2;
      fs->info.fs.writes_depth = true;
   }

   for (i = 0; i < info->numInputs; ++i) {
      m = nvk_hdr_interp_mode(&info->in[i]);
      for (c = 0; c < 4; ++c) {
         if (!(info->in[i].mask & (1 << c)))
            continue;
         a = info->in[i].slot[c];
         if (info->in[i].slot[0] >= (0x060 / 4) &&
             info->in[i].slot[0] <= (0x07c / 4)) {
            fs->info.hdr[5] |= 1 << (24 + (a - 0x060 / 4));
         } else
         if (info->in[i].slot[0] >= (0x2c0 / 4) &&
             info->in[i].slot[0] <= (0x2fc / 4)) {
            fs->info.hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
         } else {
            if (info->in[i].slot[c] < (0x040 / 4) ||
                info->in[i].slot[c] > (0x380 / 4))
               continue;
            a *= 2;
            if (info->in[i].slot[0] >= (0x300 / 4))
               a -= 32;
            fs->info.hdr[4 + a / 32] |= m << (a % 32);
         }
      }
   }
   /* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */
   if (info->prop.fp.readsSampleLocations && info->target >= NVISA_GM200_CHIPSET)
      fs->info.hdr[5] |= 0x30000000;

   for (i = 0; i < info->numOutputs; ++i) {
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         fs->info.hdr[18] |= 0xf << (4 * info->out[i].si);
   }

   /* There are no "regular" attachments, but the shader still needs to be
    * executed. It seems like it wants to think that it has some color
    * outputs in order to actually run.
    */
   if (info->prop.fp.numColourResults == 0 &&
       !info->prop.fp.writesDepth &&
       info->io.sampleMask >= 80 /* PIPE_MAX_SHADER_OUTPUTS */)
      fs->info.hdr[18] |= 0xf;

   fs->info.fs.early_fragment_tests = info->prop.fp.earlyFragTests;
   fs->info.fs.reads_sample_mask = info->prop.fp.usesSampleMaskIn;
   fs->info.fs.post_depth_coverage = info->prop.fp.postDepthCoverage;

   return 0;
}

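/* Map an XFB output back to the codegen output register index by matching
 * its location against the vec4 slots covered by each output variable.
 */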
static uint8_t find_register_index_for_xfb_output(const struct nir_shader *nir,
                                                   nir_xfb_output_info output)
{
   nir_foreach_shader_out_variable(var, nir) {
      uint32_t slots = glsl_count_vec4_slots(var->type, false, false);
      for (uint32_t i = 0; i < slots; ++i) {
         if (output.location == (var->data.location + i)) {
            return var->data.driver_location + i;
         }
      }
   }
   // should not be reached
   return 0;
}

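/* Build the NAK transform feedback tables from nir_xfb_info: per buffer, a
 * stride, a stream index, and a list of attribute slots (one per captured
 * 32-bit component, 0xff meaning "skip this dword").
 */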
static void
nvk_fill_transform_feedback_state(struct nak_xfb_info *xfb,
                                  struct nir_shader *nir,
                                  const struct nv50_ir_prog_info_out *info)
{
   const uint8_t max_buffers = 4;
   const uint8_t dw_bytes = 4;
   const struct nir_xfb_info *nx = nir->xfb_info;
   //nir_print_xfb_info(nx, stdout);

   memset(xfb, 0, sizeof(*xfb));

   for (uint8_t b = 0; b < max_buffers; ++b) {
      xfb->stride[b] = b < nx->buffers_written ? nx->buffers[b].stride : 0;
      xfb->attr_count[b] = 0;
      xfb->stream[b] = nx->buffer_to_stream[b];
   }
   memset(xfb->attr_index, 0xff, sizeof(xfb->attr_index)); /* = skip */

   if (info->numOutputs == 0)
      return;

   for (uint32_t i = 0; i < nx->output_count; ++i) {
      const nir_xfb_output_info output = nx->outputs[i];
      const uint8_t b = output.buffer;
      const uint8_t r = find_register_index_for_xfb_output(nir, output);
      uint32_t p = output.offset / dw_bytes;

      assert(r < info->numOutputs && p < ARRAY_SIZE(xfb->attr_index[b]));

      u_foreach_bit(c, nx->outputs[i].component_mask)
         xfb->attr_index[b][p++] = info->out[r].slot[c];

      xfb->attr_count[b] = MAX2(xfb->attr_count[b], p);
   }

   /* zero unused indices */
   for (uint8_t b = 0; b < 4; ++b)
      for (uint32_t c = xfb->attr_count[b]; c & 3; ++c)
         xfb->attr_index[b][c] = 0;
}

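/* Main entry point for the codegen (nv50_ir) compile path: post-process the
 * NIR, run nv50_ir_generate_code(), then fill out the nvk_shader info and
 * the stage-specific program header from the codegen results.
 */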
VkResult
nvk_cg_compile_nir(struct nvk_physical_device *pdev, nir_shader *nir,
                   const struct nak_fs_key *fs_key,
                   struct nvk_shader *shader)
{
   struct nv50_ir_prog_info *info;
   struct nv50_ir_prog_info_out info_out = {};
   int ret;

   nak_cg_postprocess_nir(nir);

   info = CALLOC_STRUCT(nv50_ir_prog_info);
   if (!info)
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   info->type = nir->info.stage;
   info->target = pdev->info.chipset;
   info->bin.nir = nir;

   for (unsigned i = 0; i < 3; i++)
      shader->info.cs.local_size[i] = nir->info.workgroup_size[i];

   info->dbgFlags = nvk_cg_get_prog_debug();
   info->optLevel = nvk_cg_get_prog_optimize();
   info->io.auxCBSlot = 1;
   info->io.uboInfoBase = 0;
   info->io.drawInfoBase = nvk_root_descriptor_offset(draw.base_vertex);
   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      info->prop.cp.gridInfoBase = 0;
   } else {
      info->assignSlots = nvc0_program_assign_varying_slots;
   }
   ret = nv50_ir_generate_code(info, &info_out);
   if (ret)
      return VK_ERROR_UNKNOWN;

   if (info_out.bin.fixupData) {
      nv50_ir_apply_fixups(info_out.bin.fixupData, info_out.bin.code,
                           fs_key && fs_key->force_sample_shading,
                           false /* flatshade */, false /* alphatest */,
                           fs_key && fs_key->force_sample_shading);
   }

   shader->info.stage = nir->info.stage;
   shader->code_ptr = (uint8_t *)info_out.bin.code;
   shader->code_size = info_out.bin.codeSize;

   if (info_out.target >= NVISA_GV100_CHIPSET)
      shader->info.num_gprs = MAX2(4, info_out.bin.maxGPR + 3);
   else
      shader->info.num_gprs = MAX2(4, info_out.bin.maxGPR + 1);
   shader->info.num_control_barriers = info_out.numBarriers;

   if (info_out.bin.tlsSpace) {
      assert(info_out.bin.tlsSpace < (1 << 24));
      shader->info.hdr[0] |= 1 << 26;
      shader->info.hdr[1] |= align(info_out.bin.tlsSpace, 0x10); /* l[] size */
      shader->info.slm_size = info_out.bin.tlsSpace;
   }

   switch (info->type) {
   case PIPE_SHADER_VERTEX:
      ret = nvk_vs_gen_header(shader, &info_out);
      break;
   case PIPE_SHADER_FRAGMENT:
      ret = nvk_fs_gen_header(shader, fs_key, &info_out);
      shader->info.fs.uses_sample_shading = nir->info.fs.uses_sample_shading;
      break;
   case PIPE_SHADER_GEOMETRY:
      ret = nvk_gs_gen_header(shader, nir, &info_out);
      break;
   case PIPE_SHADER_TESS_CTRL:
      ret = nvk_tcs_gen_header(shader, &info_out);
      break;
   case PIPE_SHADER_TESS_EVAL:
      ret = nvk_tes_gen_header(shader, &info_out);
      break;
   case PIPE_SHADER_COMPUTE:
      shader->info.cs.smem_size = info_out.bin.smemSize;
      break;
   default:
      unreachable("Invalid shader stage");
      break;
   }
   assert(ret == 0);

   if (info_out.io.globalAccess)
      shader->info.hdr[0] |= 1 << 26;
   if (info_out.io.globalAccess & 0x2)
      shader->info.hdr[0] |= 1 << 16;
   if (info_out.io.fp64)
      shader->info.hdr[0] |= 1 << 27;

   if (nir->xfb_info)
      nvk_fill_transform_feedback_state(&shader->info.vtg.xfb, nir, &info_out);

   return VK_SUCCESS;
}