xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/elk/elk_vec4_tcs.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2013 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 /**
25  * \file elk_vec4_tcs.cpp
26  *
 * Tessellation control shader specific code derived from the vec4_visitor class.
28  */
29 
30 #include "../intel_nir.h"
31 #include "elk_nir.h"
32 #include "elk_vec4_tcs.h"
33 #include "elk_fs.h"
34 #include "elk_private.h"
35 #include "dev/intel_debug.h"
36 
37 namespace elk {
38 
vec4_tcs_visitor(const struct elk_compiler * compiler,const struct elk_compile_params * params,const struct elk_tcs_prog_key * key,struct elk_tcs_prog_data * prog_data,const nir_shader * nir,bool debug_enabled)39 vec4_tcs_visitor::vec4_tcs_visitor(const struct elk_compiler *compiler,
40                                    const struct elk_compile_params *params,
41                                    const struct elk_tcs_prog_key *key,
42                                    struct elk_tcs_prog_data *prog_data,
43                                    const nir_shader *nir,
44                                    bool debug_enabled)
45    : vec4_visitor(compiler, params, &key->base.tex, &prog_data->base,
46                   nir, false, debug_enabled),
47      key(key)
48 {
49 }
50 
51 
52 void
setup_payload()53 vec4_tcs_visitor::setup_payload()
54 {
55    int reg = 0;
56 
57    /* The payload always contains important data in r0, which contains
58     * the URB handles that are passed on to the URB write at the end
59     * of the thread.
60     */
61    reg++;
62 
63    /* r1.0 - r4.7 may contain the input control point URB handles,
64     * which we use to pull vertex data.
65     */
66    reg += 4;
67 
68    /* Push constants may start at r5.0 */
69    reg = setup_uniforms(reg);
70 
71    this->first_non_payload_grf = reg;
72 }
73 
74 
75 void
emit_prolog()76 vec4_tcs_visitor::emit_prolog()
77 {
78    invocation_id = src_reg(this, glsl_uint_type());
79    emit(ELK_TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
80 
81    /* HS threads are dispatched with the dispatch mask set to 0xFF.
82     * If there are an odd number of output vertices, then the final
83     * HS instance dispatched will only have its bottom half doing real
84     * work, and so we need to disable the upper half:
85     */
86    if (nir->info.tess.tcs_vertices_out % 2) {
87       emit(CMP(dst_null_d(), invocation_id,
88                elk_imm_ud(nir->info.tess.tcs_vertices_out),
89                ELK_CONDITIONAL_L));
90 
91       /* Matching ENDIF is in emit_thread_end() */
92       emit(IF(ELK_PREDICATE_NORMAL));
93    }
94 }
95 
96 
97 void
emit_thread_end()98 vec4_tcs_visitor::emit_thread_end()
99 {
100    vec4_instruction *inst;
101    current_annotation = "thread end";
102 
103    if (nir->info.tess.tcs_vertices_out % 2) {
104       emit(ELK_OPCODE_ENDIF);
105    }
106 
107    if (devinfo->ver == 7) {
108       struct elk_tcs_prog_data *tcs_prog_data =
109          (struct elk_tcs_prog_data *) prog_data;
110 
111       current_annotation = "release input vertices";
112 
113       /* Synchronize all threads, so we know that no one is still
114        * using the input URB handles.
115        */
116       if (tcs_prog_data->instances > 1) {
117          dst_reg header = dst_reg(this, glsl_uvec4_type());
118          emit(ELK_TCS_OPCODE_CREATE_BARRIER_HEADER, header);
119          emit(ELK_SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
120       }
121 
122       /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
123        * We want to compare the bottom half of invocation_id with 0, but
124        * use that truth value for the top half as well.  Unfortunately,
125        * we don't have stride in the vec4 world, nor UV immediates in
126        * align16, so we need an opcode to get invocation_id<0,4,0>.
127        */
128       set_condmod(ELK_CONDITIONAL_Z,
129                   emit(ELK_TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
130                        invocation_id));
131       emit(IF(ELK_PREDICATE_NORMAL));
132       for (unsigned i = 0; i < key->input_vertices; i += 2) {
133          /* If we have an odd number of input vertices, the last will be
134           * unpaired.  We don't want to use an interleaved URB write in
135           * that case.
136           */
137          const bool is_unpaired = i == key->input_vertices - 1;
138 
139          dst_reg header(this, glsl_uvec4_type());
140          emit(ELK_TCS_OPCODE_RELEASE_INPUT, header, elk_imm_ud(i),
141               elk_imm_ud(is_unpaired));
142       }
143       emit(ELK_OPCODE_ENDIF);
144    }
145 
146    inst = emit(ELK_TCS_OPCODE_THREAD_END);
147    inst->base_mrf = 14;
148    inst->mlen = 2;
149 }
150 
151 
152 void
emit_input_urb_read(const dst_reg & dst,const src_reg & vertex_index,unsigned base_offset,unsigned first_component,const src_reg & indirect_offset)153 vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
154                                       const src_reg &vertex_index,
155                                       unsigned base_offset,
156                                       unsigned first_component,
157                                       const src_reg &indirect_offset)
158 {
159    vec4_instruction *inst;
160    dst_reg temp(this, glsl_ivec4_type());
161    temp.type = dst.type;
162 
163    /* Set up the message header to reference the proper parts of the URB */
164    dst_reg header = dst_reg(this, glsl_uvec4_type());
165    inst = emit(ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
166                indirect_offset);
167    inst->force_writemask_all = true;
168 
169    /* Read into a temporary, ignoring writemasking. */
170    inst = emit(ELK_VEC4_OPCODE_URB_READ, temp, src_reg(header));
171    inst->offset = base_offset;
172    inst->mlen = 1;
173    inst->base_mrf = -1;
174 
175    /* Copy the temporary to the destination to deal with writemasking.
176     *
177     * Also attempt to deal with gl_PointSize being in the .w component.
178     */
179    if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
180       emit(MOV(dst, swizzle(src_reg(temp), ELK_SWIZZLE_WWWW)));
181    } else {
182       src_reg src = src_reg(temp);
183       src.swizzle = ELK_SWZ_COMP_INPUT(first_component);
184       emit(MOV(dst, src));
185    }
186 }
187 
188 void
emit_output_urb_read(const dst_reg & dst,unsigned base_offset,unsigned first_component,const src_reg & indirect_offset)189 vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
190                                        unsigned base_offset,
191                                        unsigned first_component,
192                                        const src_reg &indirect_offset)
193 {
194    vec4_instruction *inst;
195 
196    /* Set up the message header to reference the proper parts of the URB */
197    dst_reg header = dst_reg(this, glsl_uvec4_type());
198    inst = emit(ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
199                elk_imm_ud(dst.writemask << first_component), indirect_offset);
200    inst->force_writemask_all = true;
201 
202    vec4_instruction *read = emit(ELK_VEC4_OPCODE_URB_READ, dst, src_reg(header));
203    read->offset = base_offset;
204    read->mlen = 1;
205    read->base_mrf = -1;
206 
207    if (first_component) {
208       /* Read into a temporary and copy with a swizzle and writemask. */
209       read->dst = retype(dst_reg(this, glsl_ivec4_type()), dst.type);
210       emit(MOV(dst, swizzle(src_reg(read->dst),
211                             ELK_SWZ_COMP_INPUT(first_component))));
212    }
213 }
214 
215 void
emit_urb_write(const src_reg & value,unsigned writemask,unsigned base_offset,const src_reg & indirect_offset)216 vec4_tcs_visitor::emit_urb_write(const src_reg &value,
217                                  unsigned writemask,
218                                  unsigned base_offset,
219                                  const src_reg &indirect_offset)
220 {
221    if (writemask == 0)
222       return;
223 
224    src_reg message(this, glsl_uvec4_type(), 2);
225    vec4_instruction *inst;
226 
227    inst = emit(ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
228                elk_imm_ud(writemask), indirect_offset);
229    inst->force_writemask_all = true;
230    inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE),
231                    value));
232    inst->force_writemask_all = true;
233 
234    inst = emit(ELK_VEC4_TCS_OPCODE_URB_WRITE, dst_null_f(), message);
235    inst->offset = base_offset;
236    inst->mlen = 2;
237    inst->base_mrf = -1;
238 }
239 
240 void
nir_emit_intrinsic(nir_intrinsic_instr * instr)241 vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
242 {
243    switch (instr->intrinsic) {
244    case nir_intrinsic_load_invocation_id:
245       emit(MOV(get_nir_def(instr->def, ELK_REGISTER_TYPE_UD),
246                invocation_id));
247       break;
248    case nir_intrinsic_load_primitive_id:
249       emit(ELK_TCS_OPCODE_GET_PRIMITIVE_ID,
250            get_nir_def(instr->def, ELK_REGISTER_TYPE_UD));
251       break;
252    case nir_intrinsic_load_patch_vertices_in:
253       emit(MOV(get_nir_def(instr->def, ELK_REGISTER_TYPE_D),
254                elk_imm_d(key->input_vertices)));
255       break;
256    case nir_intrinsic_load_per_vertex_input: {
257       assert(instr->def.bit_size == 32);
258       src_reg indirect_offset = get_indirect_offset(instr);
259       unsigned imm_offset = nir_intrinsic_base(instr);
260 
261       src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]),
262                                     ELK_REGISTER_TYPE_UD);
263 
264       unsigned first_component = nir_intrinsic_component(instr);
265       dst_reg dst = get_nir_def(instr->def, ELK_REGISTER_TYPE_D);
266       dst.writemask = elk_writemask_for_size(instr->num_components);
267       emit_input_urb_read(dst, vertex_index, imm_offset,
268                           first_component, indirect_offset);
269       break;
270    }
271    case nir_intrinsic_load_input:
272       unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
273       break;
274    case nir_intrinsic_load_output:
275    case nir_intrinsic_load_per_vertex_output: {
276       src_reg indirect_offset = get_indirect_offset(instr);
277       unsigned imm_offset = nir_intrinsic_base(instr);
278 
279       dst_reg dst = get_nir_def(instr->def, ELK_REGISTER_TYPE_D);
280       dst.writemask = elk_writemask_for_size(instr->num_components);
281 
282       emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
283                            indirect_offset);
284       break;
285    }
286    case nir_intrinsic_store_output:
287    case nir_intrinsic_store_per_vertex_output: {
288       assert(nir_src_bit_size(instr->src[0]) == 32);
289       src_reg value = get_nir_src(instr->src[0]);
290       unsigned mask = nir_intrinsic_write_mask(instr);
291       unsigned swiz = ELK_SWIZZLE_XYZW;
292 
293       src_reg indirect_offset = get_indirect_offset(instr);
294       unsigned imm_offset = nir_intrinsic_base(instr);
295 
296       unsigned first_component = nir_intrinsic_component(instr);
297       if (first_component) {
298          assert(swiz == ELK_SWIZZLE_XYZW);
299          swiz = ELK_SWZ_COMP_OUTPUT(first_component);
300          mask = mask << first_component;
301       }
302 
303       emit_urb_write(swizzle(value, swiz), mask,
304                      imm_offset, indirect_offset);
305       break;
306    }
307 
308    case nir_intrinsic_barrier:
309       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
310          vec4_visitor::nir_emit_intrinsic(instr);
311       if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
312          dst_reg header = dst_reg(this, glsl_uvec4_type());
313          emit(ELK_TCS_OPCODE_CREATE_BARRIER_HEADER, header);
314          emit(ELK_SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
315       }
316       break;
317 
318    default:
319       vec4_visitor::nir_emit_intrinsic(instr);
320    }
321 }
322 
/**
 * Return how many patches to accumulate before a MULTI_PATCH mode thread is
 * launched.
 *
 * With many input control points and a large amount of VS outputs, the VS
 * URB space needed to hold a full 8 patches can be prohibitive, so launching
 * threads early can be beneficial.
 *
 * See the 3DSTATE_HS::Patch Count Threshold documentation for the
 * recommended values.  A result of 0 "disables" early dispatch, i.e. the
 * hardware waits for a full 8 patches as normal.
 *
 * \param input_control_points  number of input control points per patch.
 * \return 0 for up to 4 control points, then 5, 4, 3 and 2 for up to 6, 8,
 *         10 and 14 control points respectively, and 1 for anything larger.
 */
static int
get_patch_count_threshold(int input_control_points)
{
   /* Upper bound on the control point count, and the threshold to use for
    * counts up to and including that bound.  Entries are in ascending order
    * so the first match wins.
    */
   static const struct {
      int max_control_points;
      int threshold;
   } thresholds[] = {
      {  4, 0 },
      {  6, 5 },
      {  8, 4 },
      { 10, 3 },
      { 14, 2 },
   };

   for (const auto &entry : thresholds) {
      if (input_control_points <= entry.max_control_points)
         return entry.threshold;
   }

   /* Patch count 1 for PATCHLIST_15 - PATCHLIST_32. */
   return 1;
}
351 
352 } /* namespace elk */
353 
354 extern "C" const unsigned *
elk_compile_tcs(const struct elk_compiler * compiler,struct elk_compile_tcs_params * params)355 elk_compile_tcs(const struct elk_compiler *compiler,
356                 struct elk_compile_tcs_params *params)
357 {
358    const struct intel_device_info *devinfo = compiler->devinfo;
359    nir_shader *nir = params->base.nir;
360    const struct elk_tcs_prog_key *key = params->key;
361    struct elk_tcs_prog_data *prog_data = params->prog_data;
362    struct elk_vue_prog_data *vue_prog_data = &prog_data->base;
363 
364    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
365    const bool debug_enabled = elk_should_print_shader(nir, DEBUG_TCS);
366    const unsigned *assembly;
367 
368    vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
369    prog_data->base.base.total_scratch = 0;
370 
371    nir->info.outputs_written = key->outputs_written;
372    nir->info.patch_outputs_written = key->patch_outputs_written;
373 
374    struct intel_vue_map input_vue_map;
375    elk_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
376                        nir->info.separate_shader, 1);
377    elk_compute_tess_vue_map(&vue_prog_data->vue_map,
378                             nir->info.outputs_written,
379                             nir->info.patch_outputs_written);
380 
381    elk_nir_apply_key(nir, compiler, &key->base, 8);
382    elk_nir_lower_vue_inputs(nir, &input_vue_map);
383    elk_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
384                              key->_tes_primitive_mode);
385    if (key->quads_workaround)
386       intel_nir_apply_tcs_quads_workaround(nir);
387    if (key->input_vertices > 0)
388       intel_nir_lower_patch_vertices_in(nir, key->input_vertices);
389 
390    elk_postprocess_nir(nir, compiler, debug_enabled,
391                        key->base.robust_flags);
392 
393    prog_data->patch_count_threshold = elk::get_patch_count_threshold(key->input_vertices);
394 
395    unsigned verts_per_thread = is_scalar ? 8 : 2;
396    vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
397    prog_data->instances =
398       DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
399 
400    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
401     * That divides up as follows:
402     *
403     *     32 bytes for the patch header (tessellation factors)
404     *    480 bytes for per-patch varyings (a varying component is 4 bytes and
405     *              gl_MaxTessPatchComponents = 120)
406     *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
407     *              gl_MaxPatchVertices = 32 and
408     *              gl_MaxTessControlOutputComponents = 128)
409     *
410     *  15808 bytes left for varying packing overhead
411     */
412    const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
413    const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
414    unsigned output_size_bytes = 0;
415    /* Note that the patch header is counted in num_per_patch_slots. */
416    output_size_bytes += num_per_patch_slots * 16;
417    output_size_bytes += nir->info.tess.tcs_vertices_out *
418                         num_per_vertex_slots * 16;
419 
420    assert(output_size_bytes >= 1);
421    if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES)
422       return NULL;
423 
424    /* URB entry sizes are stored as a multiple of 64 bytes. */
425    vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
426 
427    /* HS does not use the usual payload pushing from URB to GRFs,
428     * because we don't have enough registers for a full-size payload, and
429     * the hardware is broken on Haswell anyway.
430     */
431    vue_prog_data->urb_read_length = 0;
432 
433    if (unlikely(debug_enabled)) {
434       fprintf(stderr, "TCS Input ");
435       elk_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL);
436       fprintf(stderr, "TCS Output ");
437       elk_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
438    }
439 
440    if (is_scalar) {
441       const unsigned dispatch_width = 8;
442       elk_fs_visitor v(compiler, &params->base, &key->base,
443                    &prog_data->base.base, nir, dispatch_width,
444                    params->base.stats != NULL, debug_enabled);
445       if (!v.run_tcs()) {
446          params->base.error_str =
447             ralloc_strdup(params->base.mem_ctx, v.fail_msg);
448          return NULL;
449       }
450 
451       assert(v.payload().num_regs % reg_unit(devinfo) == 0);
452       prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
453 
454       elk_fs_generator g(compiler, &params->base,
455                      &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
456       if (unlikely(debug_enabled)) {
457          g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
458                                         "%s tessellation control shader %s",
459                                         nir->info.label ? nir->info.label
460                                                         : "unnamed",
461                                         nir->info.name));
462       }
463 
464       g.generate_code(v.cfg, dispatch_width, v.shader_stats,
465                       v.performance_analysis.require(), params->base.stats);
466 
467       g.add_const_data(nir->constant_data, nir->constant_data_size);
468 
469       assembly = g.get_assembly();
470    } else {
471       elk::vec4_tcs_visitor v(compiler, &params->base, key, prog_data,
472                               nir, debug_enabled);
473       if (!v.run()) {
474          params->base.error_str =
475             ralloc_strdup(params->base.mem_ctx, v.fail_msg);
476          return NULL;
477       }
478 
479       if (INTEL_DEBUG(DEBUG_TCS))
480          v.dump_instructions();
481 
482 
483       assembly = elk_vec4_generate_assembly(compiler, &params->base, nir,
484                                             &prog_data->base, v.cfg,
485                                             v.performance_analysis.require(),
486                                             debug_enabled);
487    }
488 
489    return assembly;
490 }
491