1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
/**
 * \file elk_vec4_tcs.cpp
 *
 * Tessellation control shader specific code derived from the vec4_visitor class.
 */
29
30 #include "../intel_nir.h"
31 #include "elk_nir.h"
32 #include "elk_vec4_tcs.h"
33 #include "elk_fs.h"
34 #include "elk_private.h"
35 #include "dev/intel_debug.h"
36
37 namespace elk {
38
vec4_tcs_visitor(const struct elk_compiler * compiler,const struct elk_compile_params * params,const struct elk_tcs_prog_key * key,struct elk_tcs_prog_data * prog_data,const nir_shader * nir,bool debug_enabled)39 vec4_tcs_visitor::vec4_tcs_visitor(const struct elk_compiler *compiler,
40 const struct elk_compile_params *params,
41 const struct elk_tcs_prog_key *key,
42 struct elk_tcs_prog_data *prog_data,
43 const nir_shader *nir,
44 bool debug_enabled)
45 : vec4_visitor(compiler, params, &key->base.tex, &prog_data->base,
46 nir, false, debug_enabled),
47 key(key)
48 {
49 }
50
51
52 void
setup_payload()53 vec4_tcs_visitor::setup_payload()
54 {
55 int reg = 0;
56
57 /* The payload always contains important data in r0, which contains
58 * the URB handles that are passed on to the URB write at the end
59 * of the thread.
60 */
61 reg++;
62
63 /* r1.0 - r4.7 may contain the input control point URB handles,
64 * which we use to pull vertex data.
65 */
66 reg += 4;
67
68 /* Push constants may start at r5.0 */
69 reg = setup_uniforms(reg);
70
71 this->first_non_payload_grf = reg;
72 }
73
74
75 void
emit_prolog()76 vec4_tcs_visitor::emit_prolog()
77 {
78 invocation_id = src_reg(this, glsl_uint_type());
79 emit(ELK_TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
80
81 /* HS threads are dispatched with the dispatch mask set to 0xFF.
82 * If there are an odd number of output vertices, then the final
83 * HS instance dispatched will only have its bottom half doing real
84 * work, and so we need to disable the upper half:
85 */
86 if (nir->info.tess.tcs_vertices_out % 2) {
87 emit(CMP(dst_null_d(), invocation_id,
88 elk_imm_ud(nir->info.tess.tcs_vertices_out),
89 ELK_CONDITIONAL_L));
90
91 /* Matching ENDIF is in emit_thread_end() */
92 emit(IF(ELK_PREDICATE_NORMAL));
93 }
94 }
95
96
97 void
emit_thread_end()98 vec4_tcs_visitor::emit_thread_end()
99 {
100 vec4_instruction *inst;
101 current_annotation = "thread end";
102
103 if (nir->info.tess.tcs_vertices_out % 2) {
104 emit(ELK_OPCODE_ENDIF);
105 }
106
107 if (devinfo->ver == 7) {
108 struct elk_tcs_prog_data *tcs_prog_data =
109 (struct elk_tcs_prog_data *) prog_data;
110
111 current_annotation = "release input vertices";
112
113 /* Synchronize all threads, so we know that no one is still
114 * using the input URB handles.
115 */
116 if (tcs_prog_data->instances > 1) {
117 dst_reg header = dst_reg(this, glsl_uvec4_type());
118 emit(ELK_TCS_OPCODE_CREATE_BARRIER_HEADER, header);
119 emit(ELK_SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
120 }
121
122 /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
123 * We want to compare the bottom half of invocation_id with 0, but
124 * use that truth value for the top half as well. Unfortunately,
125 * we don't have stride in the vec4 world, nor UV immediates in
126 * align16, so we need an opcode to get invocation_id<0,4,0>.
127 */
128 set_condmod(ELK_CONDITIONAL_Z,
129 emit(ELK_TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
130 invocation_id));
131 emit(IF(ELK_PREDICATE_NORMAL));
132 for (unsigned i = 0; i < key->input_vertices; i += 2) {
133 /* If we have an odd number of input vertices, the last will be
134 * unpaired. We don't want to use an interleaved URB write in
135 * that case.
136 */
137 const bool is_unpaired = i == key->input_vertices - 1;
138
139 dst_reg header(this, glsl_uvec4_type());
140 emit(ELK_TCS_OPCODE_RELEASE_INPUT, header, elk_imm_ud(i),
141 elk_imm_ud(is_unpaired));
142 }
143 emit(ELK_OPCODE_ENDIF);
144 }
145
146 inst = emit(ELK_TCS_OPCODE_THREAD_END);
147 inst->base_mrf = 14;
148 inst->mlen = 2;
149 }
150
151
152 void
emit_input_urb_read(const dst_reg & dst,const src_reg & vertex_index,unsigned base_offset,unsigned first_component,const src_reg & indirect_offset)153 vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
154 const src_reg &vertex_index,
155 unsigned base_offset,
156 unsigned first_component,
157 const src_reg &indirect_offset)
158 {
159 vec4_instruction *inst;
160 dst_reg temp(this, glsl_ivec4_type());
161 temp.type = dst.type;
162
163 /* Set up the message header to reference the proper parts of the URB */
164 dst_reg header = dst_reg(this, glsl_uvec4_type());
165 inst = emit(ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
166 indirect_offset);
167 inst->force_writemask_all = true;
168
169 /* Read into a temporary, ignoring writemasking. */
170 inst = emit(ELK_VEC4_OPCODE_URB_READ, temp, src_reg(header));
171 inst->offset = base_offset;
172 inst->mlen = 1;
173 inst->base_mrf = -1;
174
175 /* Copy the temporary to the destination to deal with writemasking.
176 *
177 * Also attempt to deal with gl_PointSize being in the .w component.
178 */
179 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
180 emit(MOV(dst, swizzle(src_reg(temp), ELK_SWIZZLE_WWWW)));
181 } else {
182 src_reg src = src_reg(temp);
183 src.swizzle = ELK_SWZ_COMP_INPUT(first_component);
184 emit(MOV(dst, src));
185 }
186 }
187
188 void
emit_output_urb_read(const dst_reg & dst,unsigned base_offset,unsigned first_component,const src_reg & indirect_offset)189 vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
190 unsigned base_offset,
191 unsigned first_component,
192 const src_reg &indirect_offset)
193 {
194 vec4_instruction *inst;
195
196 /* Set up the message header to reference the proper parts of the URB */
197 dst_reg header = dst_reg(this, glsl_uvec4_type());
198 inst = emit(ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
199 elk_imm_ud(dst.writemask << first_component), indirect_offset);
200 inst->force_writemask_all = true;
201
202 vec4_instruction *read = emit(ELK_VEC4_OPCODE_URB_READ, dst, src_reg(header));
203 read->offset = base_offset;
204 read->mlen = 1;
205 read->base_mrf = -1;
206
207 if (first_component) {
208 /* Read into a temporary and copy with a swizzle and writemask. */
209 read->dst = retype(dst_reg(this, glsl_ivec4_type()), dst.type);
210 emit(MOV(dst, swizzle(src_reg(read->dst),
211 ELK_SWZ_COMP_INPUT(first_component))));
212 }
213 }
214
215 void
emit_urb_write(const src_reg & value,unsigned writemask,unsigned base_offset,const src_reg & indirect_offset)216 vec4_tcs_visitor::emit_urb_write(const src_reg &value,
217 unsigned writemask,
218 unsigned base_offset,
219 const src_reg &indirect_offset)
220 {
221 if (writemask == 0)
222 return;
223
224 src_reg message(this, glsl_uvec4_type(), 2);
225 vec4_instruction *inst;
226
227 inst = emit(ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
228 elk_imm_ud(writemask), indirect_offset);
229 inst->force_writemask_all = true;
230 inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE),
231 value));
232 inst->force_writemask_all = true;
233
234 inst = emit(ELK_VEC4_TCS_OPCODE_URB_WRITE, dst_null_f(), message);
235 inst->offset = base_offset;
236 inst->mlen = 2;
237 inst->base_mrf = -1;
238 }
239
240 void
nir_emit_intrinsic(nir_intrinsic_instr * instr)241 vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
242 {
243 switch (instr->intrinsic) {
244 case nir_intrinsic_load_invocation_id:
245 emit(MOV(get_nir_def(instr->def, ELK_REGISTER_TYPE_UD),
246 invocation_id));
247 break;
248 case nir_intrinsic_load_primitive_id:
249 emit(ELK_TCS_OPCODE_GET_PRIMITIVE_ID,
250 get_nir_def(instr->def, ELK_REGISTER_TYPE_UD));
251 break;
252 case nir_intrinsic_load_patch_vertices_in:
253 emit(MOV(get_nir_def(instr->def, ELK_REGISTER_TYPE_D),
254 elk_imm_d(key->input_vertices)));
255 break;
256 case nir_intrinsic_load_per_vertex_input: {
257 assert(instr->def.bit_size == 32);
258 src_reg indirect_offset = get_indirect_offset(instr);
259 unsigned imm_offset = nir_intrinsic_base(instr);
260
261 src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]),
262 ELK_REGISTER_TYPE_UD);
263
264 unsigned first_component = nir_intrinsic_component(instr);
265 dst_reg dst = get_nir_def(instr->def, ELK_REGISTER_TYPE_D);
266 dst.writemask = elk_writemask_for_size(instr->num_components);
267 emit_input_urb_read(dst, vertex_index, imm_offset,
268 first_component, indirect_offset);
269 break;
270 }
271 case nir_intrinsic_load_input:
272 unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
273 break;
274 case nir_intrinsic_load_output:
275 case nir_intrinsic_load_per_vertex_output: {
276 src_reg indirect_offset = get_indirect_offset(instr);
277 unsigned imm_offset = nir_intrinsic_base(instr);
278
279 dst_reg dst = get_nir_def(instr->def, ELK_REGISTER_TYPE_D);
280 dst.writemask = elk_writemask_for_size(instr->num_components);
281
282 emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
283 indirect_offset);
284 break;
285 }
286 case nir_intrinsic_store_output:
287 case nir_intrinsic_store_per_vertex_output: {
288 assert(nir_src_bit_size(instr->src[0]) == 32);
289 src_reg value = get_nir_src(instr->src[0]);
290 unsigned mask = nir_intrinsic_write_mask(instr);
291 unsigned swiz = ELK_SWIZZLE_XYZW;
292
293 src_reg indirect_offset = get_indirect_offset(instr);
294 unsigned imm_offset = nir_intrinsic_base(instr);
295
296 unsigned first_component = nir_intrinsic_component(instr);
297 if (first_component) {
298 assert(swiz == ELK_SWIZZLE_XYZW);
299 swiz = ELK_SWZ_COMP_OUTPUT(first_component);
300 mask = mask << first_component;
301 }
302
303 emit_urb_write(swizzle(value, swiz), mask,
304 imm_offset, indirect_offset);
305 break;
306 }
307
308 case nir_intrinsic_barrier:
309 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
310 vec4_visitor::nir_emit_intrinsic(instr);
311 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
312 dst_reg header = dst_reg(this, glsl_uvec4_type());
313 emit(ELK_TCS_OPCODE_CREATE_BARRIER_HEADER, header);
314 emit(ELK_SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
315 }
316 break;
317
318 default:
319 vec4_visitor::nir_emit_intrinsic(instr);
320 }
321 }
322
/**
 * Return how many patches to accumulate before a MULTI_PATCH mode thread
 * is launched.  With many input control points and a large amount of VS
 * outputs, the VS URB space needed to hold a full 8 patches worth of data
 * can be prohibitive, so launching threads early can be beneficial.
 *
 * See the 3DSTATE_HS::Patch Count Threshold documentation for the
 * recommended values.  A result of 0 "disables" early dispatch, i.e. the
 * hardware waits for a full 8 patches as normal.
 *
 * \param input_control_points  number of input control points per patch
 * \return the patch count threshold for that control point count
 */
static int
get_patch_count_threshold(int input_control_points)
{
   /* Rows are ordered by ascending upper bound; the first row whose bound
    * is not exceeded provides the threshold.
    */
   static const struct {
      int max_control_points;
      int threshold;
   } thresholds[] = {
      {  4, 0 },
      {  6, 5 },
      {  8, 4 },
      { 10, 3 },
      { 14, 2 },
   };
   const unsigned num_rows = sizeof(thresholds) / sizeof(thresholds[0]);

   for (unsigned row = 0; row < num_rows; row++) {
      if (input_control_points <= thresholds[row].max_control_points)
         return thresholds[row].threshold;
   }

   /* Patch count 1 for PATCHLIST_15 - PATCHLIST_32 */
   return 1;
}
351
352 } /* namespace elk */
353
354 extern "C" const unsigned *
elk_compile_tcs(const struct elk_compiler * compiler,struct elk_compile_tcs_params * params)355 elk_compile_tcs(const struct elk_compiler *compiler,
356 struct elk_compile_tcs_params *params)
357 {
358 const struct intel_device_info *devinfo = compiler->devinfo;
359 nir_shader *nir = params->base.nir;
360 const struct elk_tcs_prog_key *key = params->key;
361 struct elk_tcs_prog_data *prog_data = params->prog_data;
362 struct elk_vue_prog_data *vue_prog_data = &prog_data->base;
363
364 const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
365 const bool debug_enabled = elk_should_print_shader(nir, DEBUG_TCS);
366 const unsigned *assembly;
367
368 vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
369 prog_data->base.base.total_scratch = 0;
370
371 nir->info.outputs_written = key->outputs_written;
372 nir->info.patch_outputs_written = key->patch_outputs_written;
373
374 struct intel_vue_map input_vue_map;
375 elk_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
376 nir->info.separate_shader, 1);
377 elk_compute_tess_vue_map(&vue_prog_data->vue_map,
378 nir->info.outputs_written,
379 nir->info.patch_outputs_written);
380
381 elk_nir_apply_key(nir, compiler, &key->base, 8);
382 elk_nir_lower_vue_inputs(nir, &input_vue_map);
383 elk_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
384 key->_tes_primitive_mode);
385 if (key->quads_workaround)
386 intel_nir_apply_tcs_quads_workaround(nir);
387 if (key->input_vertices > 0)
388 intel_nir_lower_patch_vertices_in(nir, key->input_vertices);
389
390 elk_postprocess_nir(nir, compiler, debug_enabled,
391 key->base.robust_flags);
392
393 prog_data->patch_count_threshold = elk::get_patch_count_threshold(key->input_vertices);
394
395 unsigned verts_per_thread = is_scalar ? 8 : 2;
396 vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
397 prog_data->instances =
398 DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
399
400 /* Compute URB entry size. The maximum allowed URB entry size is 32k.
401 * That divides up as follows:
402 *
403 * 32 bytes for the patch header (tessellation factors)
404 * 480 bytes for per-patch varyings (a varying component is 4 bytes and
405 * gl_MaxTessPatchComponents = 120)
406 * 16384 bytes for per-vertex varyings (a varying component is 4 bytes,
407 * gl_MaxPatchVertices = 32 and
408 * gl_MaxTessControlOutputComponents = 128)
409 *
410 * 15808 bytes left for varying packing overhead
411 */
412 const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
413 const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
414 unsigned output_size_bytes = 0;
415 /* Note that the patch header is counted in num_per_patch_slots. */
416 output_size_bytes += num_per_patch_slots * 16;
417 output_size_bytes += nir->info.tess.tcs_vertices_out *
418 num_per_vertex_slots * 16;
419
420 assert(output_size_bytes >= 1);
421 if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES)
422 return NULL;
423
424 /* URB entry sizes are stored as a multiple of 64 bytes. */
425 vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
426
427 /* HS does not use the usual payload pushing from URB to GRFs,
428 * because we don't have enough registers for a full-size payload, and
429 * the hardware is broken on Haswell anyway.
430 */
431 vue_prog_data->urb_read_length = 0;
432
433 if (unlikely(debug_enabled)) {
434 fprintf(stderr, "TCS Input ");
435 elk_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL);
436 fprintf(stderr, "TCS Output ");
437 elk_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
438 }
439
440 if (is_scalar) {
441 const unsigned dispatch_width = 8;
442 elk_fs_visitor v(compiler, ¶ms->base, &key->base,
443 &prog_data->base.base, nir, dispatch_width,
444 params->base.stats != NULL, debug_enabled);
445 if (!v.run_tcs()) {
446 params->base.error_str =
447 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
448 return NULL;
449 }
450
451 assert(v.payload().num_regs % reg_unit(devinfo) == 0);
452 prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
453
454 elk_fs_generator g(compiler, ¶ms->base,
455 &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
456 if (unlikely(debug_enabled)) {
457 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
458 "%s tessellation control shader %s",
459 nir->info.label ? nir->info.label
460 : "unnamed",
461 nir->info.name));
462 }
463
464 g.generate_code(v.cfg, dispatch_width, v.shader_stats,
465 v.performance_analysis.require(), params->base.stats);
466
467 g.add_const_data(nir->constant_data, nir->constant_data_size);
468
469 assembly = g.get_assembly();
470 } else {
471 elk::vec4_tcs_visitor v(compiler, ¶ms->base, key, prog_data,
472 nir, debug_enabled);
473 if (!v.run()) {
474 params->base.error_str =
475 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
476 return NULL;
477 }
478
479 if (INTEL_DEBUG(DEBUG_TCS))
480 v.dump_instructions();
481
482
483 assembly = elk_vec4_generate_assembly(compiler, ¶ms->base, nir,
484 &prog_data->base, v.cfg,
485 v.performance_analysis.require(),
486 debug_enabled);
487 }
488
489 return assembly;
490 }
491