/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"

#include "ds/intel_tracepoints.h"

#include "genX_mi_builder.h"

static void
cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   VkShaderStageFlags stages = pipeline->base.base.active_stages;

   /* In order to avoid thrash, we assume that vertex and fragment stages
    * always exist.  In the rare case where one is missing *and* the other
    * uses push constants, this may be suboptimal.  However, avoiding stalls
    * seems more important.
    */
   stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
   if (anv_pipeline_is_primitive(pipeline))
      stages |= VK_SHADER_STAGE_VERTEX_BIT;

   if (stages == cmd_buffer->state.gfx.push_constant_stages)
      return;

   unsigned push_constant_kb;

   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   if (anv_pipeline_is_mesh(pipeline))
      push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
   else
      push_constant_kb = devinfo->max_constant_urb_size_kb;

   const unsigned num_stages =
      util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
   unsigned size_per_stage = push_constant_kb / num_stages;

   /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
    * units of 2KB.  Incidentally, these are the same platforms that have
    * 32KB worth of push constant space.
    */
   if (push_constant_kb == 32)
      size_per_stage &= ~1u;

   uint32_t kb_used = 0;
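   /* 3DSTATE_PUSH_CONSTANT_ALLOC_{VS,HS,DS,GS} share one packet layout and
    * differ only in _3DCommandSubOpcode (18 + stage index), so the VS pack
    * template below is reused for all four pre-rasterization stages.  The
    * "size_per_stage &= ~1u" above rounds each slice down to an even
    * (2KB-aligned) number of KB on the 32KB platforms.
    */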
   for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
      const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
         alloc.ConstantBufferSize = push_size;
      }
      kb_used += push_size;
   }

   anv_batch_emit(&cmd_buffer->batch,
                  GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
      alloc.ConstantBufferOffset = kb_used;
      alloc.ConstantBufferSize = push_constant_kb - kb_used;
   }

#if GFX_VERx10 == 125
   /* DG2: Wa_22011440098
    * MTL: Wa_18022330953
    *
    * In 3D mode, after programming push constant alloc command immediately
    * program push constant command(ZERO length) without any commit between
    * them.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      c.ShaderUpdateEnable = 0x1f;
      c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
   }
#endif

   cmd_buffer->state.gfx.push_constant_stages = stages;

   /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
    *
    *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
    *    the next 3DPRIMITIVE command after programming the
    *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
    *
    * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
    * pipeline setup, we need to dirty push constants.
    */
   cmd_buffer->state.push_constants_dirty |= stages;
}
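
/* Illustrative trace of the split above: with a 32KB budget and only
 * VS + FS active, num_stages is 2, so VS gets a 16KB slice at offset 0,
 * HS/DS/GS get zero-sized allocations, and PS gets the remaining 16KB
 * starting at offset 16.
 */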

static void
cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t stages)
{
   static const uint32_t sampler_state_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 43,
      [MESA_SHADER_TESS_CTRL] = 44, /* HS */
      [MESA_SHADER_TESS_EVAL] = 45, /* DS */
      [MESA_SHADER_GEOMETRY]  = 46,
      [MESA_SHADER_FRAGMENT]  = 47,
   };

   static const uint32_t binding_table_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 38,
      [MESA_SHADER_TESS_CTRL] = 39,
      [MESA_SHADER_TESS_EVAL] = 40,
      [MESA_SHADER_GEOMETRY]  = 41,
      [MESA_SHADER_FRAGMENT]  = 42,
   };
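
   /* Both tables hold _3DCommandSubOpcode values: 38..42 select
    * 3DSTATE_BINDING_TABLE_POINTERS_{VS,HS,DS,GS,PS} and 43..47 select
    * 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,DS,GS,PS}.  The packets share a
    * layout, so the *_VS pack templates below are reused for every stage
    * with the per-stage sub-opcode patched in.
    */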

   anv_foreach_stage(s, stages) {
      assert(s < ARRAY_SIZE(binding_table_opcodes));

      if (cmd_buffer->state.samplers[s].alloc_size > 0) {
         anv_batch_emit(&cmd_buffer->batch,
                        GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
            ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
            ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
         }
      }

      /* Always emit binding table pointers if we're asked to, since on SKL
       * this is what flushes push constants. */
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
         btp._3DCommandSubOpcode = binding_table_opcodes[s];
         btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
      }
   }
}

static struct anv_address
get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
                       const struct anv_shader_bin *shader,
                       const struct anv_push_range *range)
{
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      /* This is a descriptor set buffer so the set index is
       * actually given by binding->binding.  (Yes, that's
       * confusing.)
       */
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      return anv_descriptor_set_address(set);
   }

   case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
      return anv_address_from_u64(
         anv_cmd_buffer_descriptor_buffer_address(
            cmd_buffer,
            gfx_state->base.descriptor_buffers[range->index].buffer_index) +
         gfx_state->base.descriptor_buffers[range->index].buffer_offset);
   }

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
      if (gfx_state->base.push_constants_state.alloc_size == 0) {
         gfx_state->base.push_constants_state =
            anv_cmd_buffer_gfx_push_constants(cmd_buffer);
      }
      return anv_cmd_buffer_temporary_state_address(
         cmd_buffer, gfx_state->base.push_constants_state);
   }

   default: {
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         if (desc->buffer) {
            return anv_address_add(desc->buffer->address,
                                   desc->offset);
         }
      } else {
         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         if (desc->buffer) {
            const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
            uint32_t dynamic_offset =
               pipe_state->dynamic_offsets[
                  range->set].offsets[range->dynamic_offset_index];
            return anv_address_add(desc->buffer->address,
                                   desc->offset + dynamic_offset);
         }
      }

      /* For NULL UBOs, we just return an address in the workaround BO.  We do
       * writes to it for workarounds but always at the bottom.  The higher
       * bytes should be all zeros.
       */
      assert(range->length * 32 <= 2048);
      return cmd_buffer->device->workaround_address;
   }
   }
}


/** Returns the size in bytes of the bound buffer
 *
 * The returned size is relative to the start of the buffer, not the start
 * of the range.  It may be smaller than
 *
 *    (range->start + range->length) * 32;
 */
static uint32_t
get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_shader_bin *shader,
                          const struct anv_push_range *range)
{
   assert(shader->stage != MESA_SHADER_COMPUTE);
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      struct anv_state state = set->desc_surface_mem;
      assert(range->start * 32 < state.alloc_size);
      assert((range->start + range->length) * 32 <= state.alloc_size);
      return state.alloc_size;
   }

   case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
      return gfx_state->base.pipeline->layout.set[
         range->index].layout->descriptor_buffer_surface_size;

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
      return (range->start + range->length) * 32;

   default: {
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         /* Here we promote a UBO to a binding table entry so that we can
          * avoid a layer of indirection.  We use the descriptor set's
          * internally allocated surface state to fill the binding table
          * entry.
          */
         if (!desc->buffer)
            return 0;

         if (range->start * 32 > desc->bind_range)
            return 0;

         return desc->bind_range;
      } else {
         if (!desc->buffer)
            return 0;

         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         /* Compute the offset within the buffer */
         const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
         uint32_t dynamic_offset =
            pipe_state->dynamic_offsets[
               range->set].offsets[range->dynamic_offset_index];
         uint64_t offset = desc->offset + dynamic_offset;
         /* Clamp to the buffer size */
         offset = MIN2(offset, desc->buffer->vk.size);
         /* Clamp the range to the buffer size */
         uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);

         /* Align the range for consistency */
         bound_range = align(bound_range, ANV_UBO_ALIGNMENT);

         return bound_range;
      }
   }
   }
}

static void
cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
                              gl_shader_stage stage,
                              struct anv_address *buffers,
                              unsigned buffer_count)
{
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

   static const uint32_t push_constant_opcodes[] = {
      [MESA_SHADER_VERTEX]    = 21,
      [MESA_SHADER_TESS_CTRL] = 25, /* HS */
      [MESA_SHADER_TESS_EVAL] = 26, /* DS */
      [MESA_SHADER_GEOMETRY]  = 22,
      [MESA_SHADER_FRAGMENT]  = 23,
   };

   assert(stage < ARRAY_SIZE(push_constant_opcodes));

   UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
      c._3DCommandSubOpcode = push_constant_opcodes[stage];

      /* Set MOCS.
       *
       * We only have one MOCS field for the whole packet, not one per
       * buffer.  We could go out of our way here to walk over all of
       * the buffers and see if any of them are used externally and use
       * the external MOCS.  However, the notion that someone would use
       * the same bit of memory for both scanout and a UBO is nuts.
       *
       * Let's not bother and assume it's all internal.
       */
      c.MOCS = mocs;

      if (anv_pipeline_has_stage(pipeline, stage)) {
         const struct anv_pipeline_bind_map *bind_map =
            &pipeline->base.shaders[stage]->bind_map;

         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         assert(buffer_count <= 4);
         const unsigned shift = 4 - buffer_count;
         for (unsigned i = 0; i < buffer_count; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];

            /* At this point we only have non-empty ranges */
            assert(range->length > 0);

            c.ConstantBody.ReadLength[i + shift] = range->length;
            c.ConstantBody.Buffer[i + shift] =
               anv_address_add(buffers[i], range->start * 32);
         }
      }
   }
}
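
/* Illustrative trace of the shift logic above: with two non-empty push
 * ranges, shift is 2, so the ranges land in constant buffer slots 2 and 3.
 * Slot 0 is therefore never used unless slot 3 is, which is exactly what
 * the Skylake restriction quoted above requires.
 */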

#if GFX_VER >= 12
static void
emit_null_push_constant_tbimr_workaround(struct anv_cmd_buffer *cmd_buffer)
{
   /* Pass a single-register push constant payload for the PS
    * stage even if empty, since PS invocations with zero push
    * constant cycles have been found to cause hangs with TBIMR
    * enabled.  See HSDES #22020184996.
    *
    * XXX - Use workaround infrastructure and final workaround
    *       when provided by hardware team.
    */
   const struct anv_address null_addr = cmd_buffer->device->workaround_address;
   uint32_t *dw = anv_batch_emitn(
      &cmd_buffer->batch, 4,
      GENX(3DSTATE_CONSTANT_ALL),
      .ShaderUpdateEnable = (1 << MESA_SHADER_FRAGMENT),
      .PointerBufferMask = 1,
      .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
   GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
      &cmd_buffer->batch, dw + 2,
      &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
         .PointerToConstantBuffer = null_addr,
         .ConstantBufferReadLength = 1,
      });
}

static void
cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t shader_mask,
                                  struct anv_address *buffers,
                                  uint32_t buffer_count)
{
   if (buffer_count == 0) {
      if (cmd_buffer->device->info->needs_null_push_constant_tbimr_workaround &&
          (shader_mask & (1 << MESA_SHADER_FRAGMENT))) {
         emit_null_push_constant_tbimr_workaround(cmd_buffer);
         shader_mask &= ~(1 << MESA_SHADER_FRAGMENT);
      }

      if (shader_mask) {
         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
            c.ShaderUpdateEnable = shader_mask;
            c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
         }
      }

      return;
   }

   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

   gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);

   const struct anv_pipeline_bind_map *bind_map =
      &pipeline->base.shaders[stage]->bind_map;

   uint32_t *dw;
   const uint32_t buffer_mask = (1 << buffer_count) - 1;
   const uint32_t num_dwords = 2 + 2 * buffer_count;
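
   /* 3DSTATE_CONSTANT_ALL is two header DWords followed by one packed
    * 2-DWord 3DSTATE_CONSTANT_ALL_DATA entry per buffer, which is why the
    * entries below are written at dw + 2 + i * 2.
    */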

   dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                        GENX(3DSTATE_CONSTANT_ALL),
                        .ShaderUpdateEnable = shader_mask,
                        .PointerBufferMask = buffer_mask,
                        .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));

   for (int i = 0; i < buffer_count; i++) {
      const struct anv_push_range *range = &bind_map->push_ranges[i];
      GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
         &cmd_buffer->batch, dw + 2 + i * 2,
         &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
            .PointerToConstantBuffer =
               anv_address_add(buffers[i], range->start * 32),
            .ConstantBufferReadLength = range->length,
         });
   }
}
#endif

static void
cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
                                    VkShaderStageFlags dirty_stages)
{
   VkShaderStageFlags flushed = 0;
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

#if GFX_VER >= 12
   uint32_t nobuffer_stages = 0;
#endif

   /* Compute robust pushed register access mask for each stage. */
   anv_foreach_stage(stage, dirty_stages) {
      if (!anv_pipeline_has_stage(pipeline, stage))
         continue;

      const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
      if (shader->prog_data->zero_push_reg) {
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
         struct anv_push_constants *push = &gfx_state->base.push_constants;

         push->push_reg_mask[stage] = 0;
         /* Start of the current range in the shader, relative to the start of
          * push constants in the shader.
          */
         unsigned range_start_reg = 0;
         for (unsigned i = 0; i < 4; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               continue;

            unsigned bound_size =
               get_push_range_bound_size(cmd_buffer, shader, range);
            if (bound_size >= range->start * 32) {
               unsigned bound_regs =
                  MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
                       range->length);
               assert(range_start_reg + bound_regs <= 64);
               push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
                                                              bound_regs);
            }

            cmd_buffer->state.push_constants_dirty |=
               mesa_to_vk_shader_stage(stage);

            range_start_reg += range->length;
         }
      }
   }
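
   /* Illustrative example of the math above: a range with start = 2 and
    * length = 4 whose backing UBO is only bound for 100 bytes gives
    * bound_size = 100, so bound_regs = MIN2(DIV_ROUND_UP(100, 32) - 2, 4) = 2
    * and only the first two of the four pushed registers are marked live in
    * push_reg_mask; the mask is what lets the shader treat the unbound tail
    * as zero (the robust-access behaviour zero_push_reg asks for).
    */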

   /* Setting NULL resets the push constant state so that we allocate a new
    * one if needed.  If the push constant data is not dirty,
    * get_push_range_address can reuse the existing allocation.
    *
    * Always reallocate on gfx9 and gfx11 to fix push constant related flaky
    * tests.  See https://gitlab.freedesktop.org/mesa/mesa/-/issues/11064
    */
   if (gfx_state->base.push_constants_data_dirty || GFX_VER < 12)
      gfx_state->base.push_constants_state = ANV_STATE_NULL;

   anv_foreach_stage(stage, dirty_stages) {
      unsigned buffer_count = 0;
      flushed |= mesa_to_vk_shader_stage(stage);
      UNUSED uint32_t max_push_range = 0;

      struct anv_address buffers[4] = {};
      if (anv_pipeline_has_stage(pipeline, stage)) {
         const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;

         /* We have to gather buffer addresses as a second step because the
          * loop above puts data into the push constant area and the call to
          * get_push_range_address is what locks our push constants and copies
          * them into the actual GPU buffer.  If we did the two loops at the
          * same time, we'd risk only having some of the sizes in the push
          * constant buffer when we did the copy.
          */
         for (unsigned i = 0; i < 4; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               break;

            buffers[i] = get_push_range_address(cmd_buffer, shader, range);
            max_push_range = MAX2(max_push_range, range->length);
            buffer_count++;
         }

         /* We have at most 4 buffers but they should be tightly packed */
         for (unsigned i = buffer_count; i < 4; i++)
            assert(bind_map->push_ranges[i].length == 0);
      }

#if GFX_VER >= 12
      /* If this stage doesn't have any push constants, emit it later in a
       * single CONSTANT_ALL packet.
       */
      if (buffer_count == 0) {
         nobuffer_stages |= 1 << stage;
         continue;
      }

      /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
       * contains only 5 bits, so we can only use it for buffers smaller than
       * 32.
       *
       * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
       * in 3DSTATE_CONSTANT_ALL.  It should still be safe to use the command
       * for disabling stages, where all address bits are zero.  However, we
       * can't safely use it for general buffers with arbitrary addresses.
       * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
       * case.
       */
      if (max_push_range < 32 && GFX_VERx10 > 120) {
         cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
                                           buffers, buffer_count);
         continue;
      }
#endif

      cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
   }

#if GFX_VER >= 12
   if (nobuffer_stages)
      /* Wa_16011448509: all address bits are zero */
      cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
#endif

   cmd_buffer->state.push_constants_dirty &= ~flushed;
   gfx_state->base.push_constants_data_dirty = false;
}

#if GFX_VERx10 >= 125
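/* Task and mesh shaders do not take their push constants through
 * 3DSTATE_CONSTANT_*; they are delivered as inline data in
 * 3DSTATE_TASK_SHADER_DATA / 3DSTATE_MESH_SHADER_DATA below: the first two
 * DWords carry the address of the bound push range and the remaining DWords
 * carry the client push constant data itself.
 */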
static void
cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
                                  VkShaderStageFlags dirty_stages)
{
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

   if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
       anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {

      const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
      const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;

      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
         const struct anv_push_range *range = &bind_map->push_ranges[0];
         if (range->length > 0) {
            struct anv_address buffer =
               get_push_range_address(cmd_buffer, shader, range);

            uint64_t addr = anv_address_physical(buffer);
            data.InlineData[0] = addr & 0xffffffff;
            data.InlineData[1] = addr >> 32;

            memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
                   cmd_buffer->state.gfx.base.push_constants.client_data,
                   BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
         }
      }
   }

   if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
       anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {

      const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
      const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;

      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
         const struct anv_push_range *range = &bind_map->push_ranges[0];
         if (range->length > 0) {
            struct anv_address buffer =
               get_push_range_address(cmd_buffer, shader, range);

            uint64_t addr = anv_address_physical(buffer);
            data.InlineData[0] = addr & 0xffffffff;
            data.InlineData[1] = addr >> 32;

            memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
                   cmd_buffer->state.gfx.base.push_constants.client_data,
                   BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
         }
      }
   }

   cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
}
#endif

ALWAYS_INLINE static void
genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      return;

   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
}

ALWAYS_INLINE static void
genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
{
#if INTEL_NEEDS_WA_22018402687
   /* Wa_22018402687:
    *   In any 3D enabled context, just before any Tessellation enabled draw
    *   call (3D Primitive), re-send the last programmed 3DSTATE_DS again.
    *   This will make sure that the 3DSTATE_INT generated just before the
    *   draw call will have TDS dirty which will make sure TDS will launch the
    *   state thread before the draw call.
    *
    * This fixes a hang resulting from running anything using tessellation
    * after a switch away from the mesh pipeline.
    * We don't need to track said switch, as it matters at the HW level, and
    * can be triggered even across processes, so we apply the Wa at all times.
    */
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      return;

   const bool protected = cmd_buffer->vk.pool->flags &
                          VK_COMMAND_POOL_CREATE_PROTECTED_BIT;

   anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
                                           final.ds, protected);
#endif
}

ALWAYS_INLINE static void
genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
   uint32_t *p;

   assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);

   genX(flush_descriptor_buffers)(cmd_buffer, &cmd_buffer->state.gfx.base);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   /* Wa_14015814527
    *
    * Apply task URB workaround when switching from task to primitive.
    */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
      if (anv_pipeline_is_primitive(pipeline)) {
         genX(apply_task_urb_workaround)(cmd_buffer);
      } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
         cmd_buffer->state.gfx.used_task_shader = true;
      }
   }

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   /* Check what vertex buffers have been rebound against the set of bindings
    * being used by the current set of vertex attributes.
    */
   uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
   /* If the pipeline changed, we have to consider all the valid bindings. */
   if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
      vb_emit |= dyn->vi->bindings_valid;

   if (vb_emit) {
      const uint32_t num_buffers = __builtin_popcount(vb_emit);
      const uint32_t num_dwords = 1 + num_buffers * 4;
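
      /* 3DSTATE_VERTEX_BUFFERS is a single header DWord followed by one
       * 4-DWord VERTEX_BUFFER_STATE per buffer, hence the 1 + 4 * n sizing
       * here and the packing at &p[1 + i * 4] below.
       */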

      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                          GENX(3DSTATE_VERTEX_BUFFERS));
      uint32_t i = 0;
      u_foreach_bit(vb, vb_emit) {
         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;

         struct GENX(VERTEX_BUFFER_STATE) state;
         if (buffer) {
            uint32_t stride = dyn->vi_binding_strides[vb];
            UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;

            state = (struct GENX(VERTEX_BUFFER_STATE)) {
               .VertexBufferIndex = vb,

               .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
               .AddressModifyEnable = true,
               .BufferPitch = stride,
               .BufferStartingAddress = anv_address_add(buffer->address, offset),
               .NullVertexBuffer = offset >= buffer->vk.size,
#if GFX_VER >= 12
               .L3BypassDisable = true,
#endif

               .BufferSize = size,
            };
         } else {
            state = (struct GENX(VERTEX_BUFFER_STATE)) {
               .VertexBufferIndex = vb,
               .NullVertexBuffer = true,
               .MOCS = anv_mocs(cmd_buffer->device, NULL,
                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
            };
         }

#if GFX_VER == 9
         genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
                                                        state.BufferStartingAddress,
                                                        state.BufferSize);
#endif

         GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
         i++;
      }
   }

   cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;

   const bool any_dynamic_state_dirty =
      vk_dynamic_graphics_state_any_dirty(dyn);
   uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
                                pipeline->base.base.active_stages;

   descriptors_dirty |=
      genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                              &cmd_buffer->state.gfx.base,
                                              &pipeline->base.base);

   /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive. */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE ||
       (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343)) {
      genX(emit_hs)(cmd_buffer);
   }

   if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
       !any_dynamic_state_dirty &&
       ((cmd_buffer->state.push_constants_dirty &
         (VK_SHADER_STAGE_ALL_GRAPHICS |
          VK_SHADER_STAGE_TASK_BIT_EXT |
          VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
      return;

   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
      /* Wa_16011411144:
       *
       * SW must insert a PIPE_CONTROL cmd before and after the
       * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
       * state is not combined with other state changes.
       */
      if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_CS_STALL_BIT,
                                   "before SO_BUFFER change WA");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }

      /* We don't need any per-buffer dirty tracking because you're not
       * allowed to bind different XFB buffers while XFB is enabled.
       */
      for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
         struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
#if GFX_VER < 12
            sob.SOBufferIndex = idx;
#else
            sob._3DCommandOpcode = 0;
            sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
#endif

            if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
               sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
                                   ISL_SURF_USAGE_STREAM_OUT_BIT);
               sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
                                                        xfb->offset);
               sob.SOBufferEnable = true;
               sob.StreamOffsetWriteEnable = false;
               /* Size is in DWords - 1 */
               sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
            } else {
               sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
            }
         }
      }

      if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
         /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_CS_STALL_BIT,
                                   "after SO_BUFFER change WA");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      } else if (GFX_VER >= 10) {
         /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_CS_STALL_BIT,
                                   "after 3DSTATE_SO_BUFFER call");
      }
   }

   /* State left dirty after flushing runtime state. */
   anv_cmd_dirty_mask_t dirty_state_mask = 0;

   /* Flush the runtime state into the HW state tracking */
   if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
      dirty_state_mask = genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);

   /* Flush the HW state into the command buffer */
   if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
      genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);

   /* If the pipeline changed, we may need to re-allocate push constant space
    * in the URB.
    */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
      cmd_buffer_alloc_gfx_push_constants(cmd_buffer);

      /* Also add the relocations (scratch buffers) */
      VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
                                              pipeline->base.base.batch.relocs);
      if (result != VK_SUCCESS) {
         anv_batch_set_error(&cmd_buffer->batch, result);
         return;
      }
   }

   /* Render targets live in the same binding table as fragment descriptors */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
      descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;

   /* We emit the binding tables and sampler tables first, then emit push
    * constants and then finally emit binding table and sampler table
    * pointers.  It has to happen in this order, since emitting the binding
    * tables may change the push constants (in case of storage images).  After
    * emitting push constants, on SKL+ we have to emit the corresponding
    * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
    */
   uint32_t dirty = 0;
   if (descriptors_dirty) {
      dirty = genX(cmd_buffer_flush_descriptor_sets)(
         cmd_buffer,
         &cmd_buffer->state.gfx.base,
         descriptors_dirty,
         pipeline->base.shaders,
         ARRAY_SIZE(pipeline->base.shaders));
      cmd_buffer->state.descriptors_dirty &= ~dirty;
   }

   if (dirty || cmd_buffer->state.push_constants_dirty) {
      /* Because we're pushing UBOs, we have to push whenever either
       * descriptors or push constants is dirty.
       */
      dirty |= cmd_buffer->state.push_constants_dirty &
               pipeline->base.base.active_stages;
      cmd_buffer_flush_gfx_push_constants(cmd_buffer,
                                          dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
#if GFX_VERx10 >= 125
      cmd_buffer_flush_mesh_inline_data(
         cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
                              VK_SHADER_STAGE_MESH_BIT_EXT));
#endif
   }

   if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
      cmd_buffer_emit_descriptor_pointers(cmd_buffer,
                                          dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
   }

#if GFX_VER >= 20
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE) {
      anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BYTE_STRIDE), sb_stride) {
         sb_stride.ByteStride = cmd_buffer->state.gfx.indirect_data_stride;
         sb_stride.ByteStrideEnable = !cmd_buffer->state.gfx.indirect_data_stride_aligned;
      }
   }
#endif

   /* When we're done, the only thing left over is the possible dirty state
    * returned by cmd_buffer_flush_gfx_runtime_state.
    */
   cmd_buffer->state.gfx.dirty = dirty_state_mask;
}

ALWAYS_INLINE static bool
anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
{
   const struct anv_device *device = cmd_buffer->device;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   /* We cannot generate readable commands in protected mode. */
   if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
      return false;

   /* Limit generated draws to pipelines without HS stage.  This makes things
    * simpler for implementing Wa_1306463417, Wa_16011107343.
    */
   if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
      return false;

   return count >= device->physical->instance->generated_indirect_threshold;
}

#include "genX_cmd_draw_helpers.h"
#include "genX_cmd_draw_generated_indirect.h"

#if GFX_VER >= 11
#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
#else
#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
#endif
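
/* On Gfx11+ the extended form of 3DPRIMITIVE is used so that the draw can
 * also program the XP0-XP2 extended parameters (gl_BaseVertex,
 * gl_BaseInstance and gl_DrawID); see the register comments further below.
 */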

void genX(CmdDraw)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    vertexCount,
    uint32_t                                    instanceCount,
    uint32_t                                    firstVertex,
    uint32_t                                    firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   const uint32_t count =
      vertexCount * instanceCount * pipeline->instance_multiplier;
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw", count);
   trace_intel_begin_draw(&cmd_buffer->trace);

   /* Select pipeline here to allow
    * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
    * cmd_buffer_flush_gfx_state().
    */
   genX(flush_pipeline_select_3d)(cmd_buffer);

#if GFX_VER < 11
   cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
                                              get_vs_prog_data(pipeline),
                                              firstVertex, firstInstance, 0,
                                              false /* force_flush */);
#endif

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
   genX(emit_ds)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

   anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
#if GFX_VERx10 >= 125
      prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
      prim.VertexAccessType = SEQUENTIAL;
      prim.VertexCountPerInstance = vertexCount;
      prim.StartVertexLocation = firstVertex;
      prim.InstanceCount = instanceCount *
                           pipeline->instance_multiplier;
      prim.StartInstanceLocation = firstInstance;
      prim.BaseVertexLocation = 0;
#if GFX_VER >= 11
      prim.ExtendedParametersPresent = true;
      prim.ExtendedParameter0 = firstVertex;
      prim.ExtendedParameter1 = firstInstance;
      prim.ExtendedParameter2 = 0;
#endif
   }

   genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                         cmd_buffer->device,
                                         cmd_buffer->state.gfx.primitive_topology,
                                         vertexCount);
   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);

   trace_intel_end_draw(&cmd_buffer->trace, count);
}

void genX(CmdDrawMultiEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    drawCount,
    const VkMultiDrawInfoEXT                   *pVertexInfo,
    uint32_t                                    instanceCount,
    uint32_t                                    firstInstance,
    uint32_t                                    stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   UNUSED struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   uint32_t i = 0;
#if GFX_VER < 11
   vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
      cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
                                                 get_vs_prog_data(pipeline),
                                                 draw->firstVertex,
                                                 firstInstance, i, !i);

      const uint32_t count =
         draw->vertexCount * instanceCount * pipeline->instance_multiplier;
      anv_measure_snapshot(cmd_buffer,
                           INTEL_SNAPSHOT_DRAW,
                           "draw multi", count);
      trace_intel_begin_draw_multi(&cmd_buffer->trace);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType = SEQUENTIAL;
         prim.VertexCountPerInstance = draw->vertexCount;
         prim.StartVertexLocation = draw->firstVertex;
         prim.InstanceCount = instanceCount *
                              pipeline->instance_multiplier;
         prim.StartInstanceLocation = firstInstance;
         prim.BaseVertexLocation = 0;
      }

      genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                            cmd_buffer->device,
                                            cmd_buffer->state.gfx.primitive_topology,
                                            drawCount == 0 ? 0 :
                                            pVertexInfo[drawCount - 1].vertexCount);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
      trace_intel_end_draw_multi(&cmd_buffer->trace, count);
   }
#else
   vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {

      /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
       * first one was handled by cmd_buffer_flush_gfx_state.
       */
      if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
         genX(emit_hs)(cmd_buffer);
      genX(emit_ds)(cmd_buffer);

      const uint32_t count = draw->vertexCount * instanceCount;
      anv_measure_snapshot(cmd_buffer,
                           INTEL_SNAPSHOT_DRAW,
                           "draw multi", count);
      trace_intel_begin_draw_multi(&cmd_buffer->trace);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

      anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
#if GFX_VERx10 >= 125
         prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType = SEQUENTIAL;
         prim.VertexCountPerInstance = draw->vertexCount;
         prim.StartVertexLocation = draw->firstVertex;
         prim.InstanceCount = instanceCount;
         prim.StartInstanceLocation = firstInstance;
         prim.BaseVertexLocation = 0;
         prim.ExtendedParametersPresent = true;
         prim.ExtendedParameter0 = draw->firstVertex;
         prim.ExtendedParameter1 = firstInstance;
         prim.ExtendedParameter2 = i;
      }

      genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                            cmd_buffer->device,
                                            cmd_buffer->state.gfx.primitive_topology,
                                            drawCount == 0 ? 0 :
                                            pVertexInfo[drawCount - 1].vertexCount);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
      trace_intel_end_draw_multi(&cmd_buffer->trace, count);
   }
#endif

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
}

void genX(CmdDrawIndexed)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    indexCount,
    uint32_t                                    instanceCount,
    uint32_t                                    firstIndex,
    int32_t                                     vertexOffset,
    uint32_t                                    firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   const uint32_t count =
      indexCount * instanceCount * pipeline->instance_multiplier;
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw indexed",
                        count);
   trace_intel_begin_draw_indexed(&cmd_buffer->trace);

   /* Select pipeline here to allow
    * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
    * cmd_buffer_flush_gfx_state().
    */
   genX(flush_pipeline_select_3d)(cmd_buffer);

#if GFX_VER < 11
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
                                              vertexOffset, firstInstance,
                                              0, false /* force_flush */);
#endif

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

   anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
#if GFX_VERx10 >= 125
      prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
      prim.VertexAccessType = RANDOM;
      prim.VertexCountPerInstance = indexCount;
      prim.StartVertexLocation = firstIndex;
      prim.InstanceCount = instanceCount *
                           pipeline->instance_multiplier;
      prim.StartInstanceLocation = firstInstance;
      prim.BaseVertexLocation = vertexOffset;
#if GFX_VER >= 11
      prim.ExtendedParametersPresent = true;
      prim.ExtendedParameter0 = vertexOffset;
      prim.ExtendedParameter1 = firstInstance;
      prim.ExtendedParameter2 = 0;
#endif
   }

   genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                         cmd_buffer->device,
                                         cmd_buffer->state.gfx.primitive_topology,
                                         indexCount);
   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);

   trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
}

void genX(CmdDrawMultiIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    drawCount,
    const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
    uint32_t                                    instanceCount,
    uint32_t                                    firstInstance,
    uint32_t                                    stride,
    const int32_t                              *pVertexOffset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   uint32_t i = 0;
#if GFX_VER < 11
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   if (pVertexOffset) {
      if (vs_prog_data->uses_drawid) {
         bool emitted = true;
         if (vs_prog_data->uses_firstvertex ||
             vs_prog_data->uses_baseinstance) {
            emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
            emitted = true;
         }
         vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
            if (vs_prog_data->uses_drawid) {
               emit_draw_index(cmd_buffer, i);
               emitted = true;
            }
            /* Emitting draw index or vertex index BOs may result in needing
             * additional VF cache flushes.
             */
            if (emitted)
               genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

            const uint32_t count =
               draw->indexCount * instanceCount * pipeline->instance_multiplier;
            anv_measure_snapshot(cmd_buffer,
                                 INTEL_SNAPSHOT_DRAW,
                                 "draw indexed multi",
                                 count);
            trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
            genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
                                  true);

            anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
               prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
               prim.VertexAccessType = RANDOM;
               prim.VertexCountPerInstance = draw->indexCount;
               prim.StartVertexLocation = draw->firstIndex;
               prim.InstanceCount = instanceCount *
                                    pipeline->instance_multiplier;
               prim.StartInstanceLocation = firstInstance;
               prim.BaseVertexLocation = *pVertexOffset;
            }

            genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                                  cmd_buffer->device,
                                                  cmd_buffer->state.gfx.primitive_topology,
                                                  drawCount == 0 ? 0 :
                                                  pIndexInfo[drawCount - 1].indexCount);

            genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
                                  false);
            trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
            emitted = false;
         }
      } else {
         if (vs_prog_data->uses_firstvertex ||
             vs_prog_data->uses_baseinstance) {
            emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
            /* Emitting draw index or vertex index BOs may result in needing
             * additional VF cache flushes.
             */
            genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
         }
         vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
            const uint32_t count =
               draw->indexCount * instanceCount * pipeline->instance_multiplier;
            anv_measure_snapshot(cmd_buffer,
                                 INTEL_SNAPSHOT_DRAW,
                                 "draw indexed multi",
                                 count);
            trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
            genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
                                  true);

            anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
               prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
               prim.VertexAccessType = RANDOM;
               prim.VertexCountPerInstance = draw->indexCount;
               prim.StartVertexLocation = draw->firstIndex;
               prim.InstanceCount = instanceCount *
                                    pipeline->instance_multiplier;
               prim.StartInstanceLocation = firstInstance;
               prim.BaseVertexLocation = *pVertexOffset;
            }

            genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                                  cmd_buffer->device,
                                                  cmd_buffer->state.gfx.primitive_topology,
                                                  drawCount == 0 ? 0 :
                                                  pIndexInfo[drawCount - 1].indexCount);

            genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
                                  false);
            trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
         }
      }
   } else {
      vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
         cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
                                                    draw->vertexOffset,
                                                    firstInstance, i, i != 0);

         const uint32_t count =
            draw->indexCount * instanceCount * pipeline->instance_multiplier;
         anv_measure_snapshot(cmd_buffer,
                              INTEL_SNAPSHOT_DRAW,
                              "draw indexed multi",
                              count);
         trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
         genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

         anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
            prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
            prim.VertexAccessType = RANDOM;
            prim.VertexCountPerInstance = draw->indexCount;
            prim.StartVertexLocation = draw->firstIndex;
            prim.InstanceCount = instanceCount *
                                 pipeline->instance_multiplier;
            prim.StartInstanceLocation = firstInstance;
            prim.BaseVertexLocation = draw->vertexOffset;
         }

         genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                               cmd_buffer->device,
                                               cmd_buffer->state.gfx.primitive_topology,
                                               drawCount == 0 ? 0 :
                                               pIndexInfo[drawCount - 1].indexCount);

         genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
         trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
      }
   }
#else
   vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {

      /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
       * first one was handled by cmd_buffer_flush_gfx_state.
       */
      if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
         genX(emit_hs)(cmd_buffer);
      genX(emit_ds)(cmd_buffer);

      const uint32_t count =
         draw->indexCount * instanceCount * pipeline->instance_multiplier;
      anv_measure_snapshot(cmd_buffer,
                           INTEL_SNAPSHOT_DRAW,
                           "draw indexed multi",
                           count);
      trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
#if GFX_VERx10 >= 125
         prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
         prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType = RANDOM;
         prim.VertexCountPerInstance = draw->indexCount;
         prim.StartVertexLocation = draw->firstIndex;
         prim.InstanceCount = instanceCount *
                              pipeline->instance_multiplier;
         prim.StartInstanceLocation = firstInstance;
         prim.BaseVertexLocation = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
         prim.ExtendedParametersPresent = true;
         prim.ExtendedParameter0 = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
         prim.ExtendedParameter1 = firstInstance;
         prim.ExtendedParameter2 = i;
      }

      genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                            cmd_buffer->device,
                                            cmd_buffer->state.gfx.primitive_topology,
                                            drawCount == 0 ? 0 :
                                            pIndexInfo[drawCount - 1].indexCount);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
      trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
   }
#endif

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
}

/* Auto-Draw / Indirect Registers */
#define GFX7_3DPRIM_END_OFFSET          0x2420
#define GFX7_3DPRIM_START_VERTEX        0x2430
#define GFX7_3DPRIM_VERTEX_COUNT        0x2434
#define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
#define GFX7_3DPRIM_START_INSTANCE      0x243C
#define GFX7_3DPRIM_BASE_VERTEX         0x2440

/* On Gen11+, we have three custom "extended parameters" which we can use to
 * provide extra system-generated values to shaders.  Our assignment of these
 * is arbitrary; we choose to assign them as follows:
 *
 *    gl_BaseVertex = XP0
 *    gl_BaseInstance = XP1
 *    gl_DrawID = XP2
 *
 * For gl_BaseInstance, we never actually have to set up the value because we
 * can just program 3DSTATE_VF_SGVS_2 to load it implicitly.  We can also do
 * that for gl_BaseVertex but it does the wrong thing for indexed draws.
 */
#define GEN11_3DPRIM_XP0                0x2690
#define GEN11_3DPRIM_XP1                0x2694
#define GEN11_3DPRIM_XP2                0x2698
#define GEN11_3DPRIM_XP_BASE_VERTEX     GEN11_3DPRIM_XP0
#define GEN11_3DPRIM_XP_BASE_INSTANCE   GEN11_3DPRIM_XP1
#define GEN11_3DPRIM_XP_DRAW_ID         GEN11_3DPRIM_XP2

void genX(CmdDrawIndirectByteCountEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    instanceCount,
    uint32_t                                    firstInstance,
    VkBuffer                                    counterBuffer,
    VkDeviceSize                                counterBufferOffset,
    uint32_t                                    counterOffset,
    uint32_t                                    vertexStride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   /* firstVertex is always zero for this draw function */
   const uint32_t firstVertex = 0;

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw indirect byte count",
                        instanceCount * pipeline->instance_multiplier);
   trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);

   /* Select pipeline here to allow
    * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
    * emit_base_vertex_instance() & emit_draw_index().
    */
   genX(flush_pipeline_select_3d)(cmd_buffer);

#if GFX_VER < 11
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   if (vs_prog_data->uses_firstvertex ||
       vs_prog_data->uses_baseinstance)
      emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
   if (vs_prog_data->uses_drawid)
      emit_draw_index(cmd_buffer, 0);
#endif

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
   const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
   mi_builder_set_mocs(&b, mocs);
   struct mi_value count =
      mi_mem32(anv_address_add(counter_buffer->address,
                               counterBufferOffset));
   if (counterOffset)
      count = mi_isub(&b, count, mi_imm(counterOffset));
   count = mi_udiv32_imm(&b, count, vertexStride);
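   /* The vertex count is derived on the command streamer:
    * (counter_value - counterOffset) / vertexStride, and is stored straight
    * into the 3DPRIMITIVE vertex count register below.
    */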
   mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);

   mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
   mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
            mi_imm(instanceCount * pipeline->instance_multiplier));
   mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
   mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));

#if GFX_VER >= 11
   mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
            mi_imm(firstVertex));
   /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
   mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
#endif

   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
   anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
#if GFX_VERx10 >= 125
      prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
      prim.IndirectParameterEnable = true;
      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
      prim.VertexAccessType = SEQUENTIAL;
#if GFX_VER >= 11
      prim.ExtendedParametersPresent = true;
#endif
   }

   genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                         cmd_buffer->device,
                                         cmd_buffer->state.gfx.primitive_topology,
                                         1);
   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);

   trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
                                            instanceCount * pipeline->instance_multiplier);
}

1564 static void
1565 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
1566 struct anv_address addr,
1567 bool indexed,
1568 uint32_t draw_id)
1569 {
1570 struct anv_graphics_pipeline *pipeline =
1571 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1572
1573 struct mi_builder b;
1574 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1575 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
1576 mi_builder_set_mocs(&b, mocs);
1577
1578 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
1579 mi_mem32(anv_address_add(addr, 0)));
1580
1581 struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
1582 if (pipeline->instance_multiplier > 1) {
1583 instance_count = mi_imul_imm(&b, instance_count,
1584 pipeline->instance_multiplier);
1585 }
1586 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
1587
1588 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
1589 mi_mem32(anv_address_add(addr, 8)));
1590
1591 if (indexed) {
1592 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
1593 mi_mem32(anv_address_add(addr, 12)));
1594 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1595 mi_mem32(anv_address_add(addr, 16)));
1596 #if GFX_VER >= 11
1597 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1598 mi_mem32(anv_address_add(addr, 12)));
1599 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1600 #endif
1601 } else {
1602 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1603 mi_mem32(anv_address_add(addr, 12)));
1604 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1605 #if GFX_VER >= 11
1606 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1607 mi_mem32(anv_address_add(addr, 8)));
1608 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1609 #endif
1610 }
1611
1612 #if GFX_VER >= 11
1613 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
1614 mi_imm(draw_id));
1615 #endif
1616 }
1617
1618 static inline bool
1619 execute_indirect_draw_supported(const struct anv_cmd_buffer *cmd_buffer)
1620 {
1621 #if GFX_VERx10 >= 125
1622 const struct intel_device_info *devinfo = cmd_buffer->device->info;
1623
1624 if (!devinfo->has_indirect_unroll)
1625 return false;
1626
1627 struct anv_graphics_pipeline *pipeline =
1628 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1629 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1630 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
1631 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
1632 const bool is_multiview = pipeline->instance_multiplier > 1;
1633
1634 const bool uses_draw_id =
1635 (vs_prog_data && vs_prog_data->uses_drawid) ||
1636 (mesh_prog_data && mesh_prog_data->uses_drawid) ||
1637 (task_prog_data && task_prog_data->uses_drawid);
1638
1639 const bool uses_firstvertex =
1640 (vs_prog_data && vs_prog_data->uses_firstvertex);
1641
1642 const bool uses_baseinstance =
1643 (vs_prog_data && vs_prog_data->uses_baseinstance);
1644
1645 return !is_multiview &&
1646 !uses_draw_id &&
1647 !uses_firstvertex &&
1648 !uses_baseinstance;
1649 #else
1650 return false;
1651 #endif
1652 }
1653
1654 static void
1655 emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
1656 struct anv_address indirect_data_addr,
1657 uint32_t indirect_data_stride,
1658 uint32_t draw_count,
1659 bool indexed)
1660 {
1661 #if GFX_VER < 11
1662 struct anv_graphics_pipeline *pipeline =
1663 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1664 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1665 #endif
1666 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1667
1668 if (cmd_buffer->state.conditional_render_enabled)
1669 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1670
1671 uint32_t offset = 0;
1672 for (uint32_t i = 0; i < draw_count; i++) {
1673 struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1674
1675 #if GFX_VER < 11
1676 /* TODO: We need to stomp base vertex to 0 somehow */
1677
1678 /* With sequential draws, we're dealing with the VkDrawIndirectCommand
1679 * structure data. We want to load VkDrawIndirectCommand::firstVertex at
1680 * offset 8 in the structure.
1681 *
1682 * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
1683 * We want the VkDrawIndexedIndirectCommand::vertexOffset field at
1684 * offset 12 in the structure.
1685 */
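      /* For reference, the indirect command layouts from the Vulkan spec
       * (4-byte fields, vertexOffset being the only signed one):
       *
       *    VkDrawIndirectCommand          VkDrawIndexedIndirectCommand
       *     0: vertexCount                 0: indexCount
       *     4: instanceCount               4: instanceCount
       *     8: firstVertex                 8: firstIndex
       *    12: firstInstance              12: vertexOffset (int32_t)
       *                                   16: firstInstance
       */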
1686 if (vs_prog_data->uses_firstvertex ||
1687 vs_prog_data->uses_baseinstance) {
1688 emit_base_vertex_instance_bo(cmd_buffer,
1689 anv_address_add(draw, indexed ? 12 : 8));
1690 }
1691 if (vs_prog_data->uses_drawid)
1692 emit_draw_index(cmd_buffer, i);
1693 #endif
1694
1695 /* Emitting draw index or vertex index BOs may result in needing
1696 * additional VF cache flushes.
1697 */
1698 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1699
1700 /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1701 * first one was handled by cmd_buffer_flush_gfx_state.
1702 */
1703 if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1704 genX(emit_hs)(cmd_buffer);
1705 genX(emit_ds)(cmd_buffer);
1706
1707 load_indirect_parameters(cmd_buffer, draw, indexed, i);
1708
1709 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1710 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1711 #if GFX_VERx10 >= 125
1712 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1713 #endif
1714 prim.IndirectParameterEnable = true;
1715 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1716 prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
1717 #if GFX_VER >= 11
1718 prim.ExtendedParametersPresent = true;
1719 #endif
1720 }
1721
1722 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1723 cmd_buffer->device,
1724 cmd_buffer->state.gfx.primitive_topology,
1725 1);
1726
1727 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1728
1729 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
1730
1731 offset += indirect_data_stride;
1732 }
1733 }
1734
1735 static inline uint32_t xi_argument_format_for_vk_cmd(enum vk_cmd_type cmd)
1736 {
1737 #if GFX_VERx10 >= 125
1738 switch (cmd) {
1739 case VK_CMD_DRAW_INDIRECT:
1740 case VK_CMD_DRAW_INDIRECT_COUNT:
1741 return XI_DRAW;
1742 case VK_CMD_DRAW_INDEXED_INDIRECT:
1743 case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1744 return XI_DRAWINDEXED;
1745 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1746 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1747 return XI_MESH_3D;
1748 default:
1749 unreachable("unhandled cmd type");
1750 }
1751 #else
1752 unreachable("unsupported GFX VER");
1753 #endif
1754 }
1755
1756 static inline bool
1757 cmd_buffer_set_indirect_stride(struct anv_cmd_buffer *cmd_buffer,
1758 uint32_t stride, enum vk_cmd_type cmd)
1759 {
1760 /* Should have been sanitized by the caller */
1761 assert(stride != 0);
1762
1763 uint32_t data_stride = 0;
1764
1765 switch (cmd) {
1766 case VK_CMD_DRAW_INDIRECT:
1767 case VK_CMD_DRAW_INDIRECT_COUNT:
1768 data_stride = sizeof(VkDrawIndirectCommand);
1769 break;
1770 case VK_CMD_DRAW_INDEXED_INDIRECT:
1771 case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1772 data_stride = sizeof(VkDrawIndexedIndirectCommand);
1773 break;
1774 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1775 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1776 data_stride = sizeof(VkDrawMeshTasksIndirectCommandEXT);
1777 break;
1778 default:
1779 unreachable("unhandled cmd type");
1780 }
1781
1782 bool aligned = stride == data_stride;
1783
1784 #if GFX_VER >= 20
1785 /* As long as the stride matches the default command stride,
1786 * STATE_BYTE_STRIDE::ByteStrideEnable=false and we can just do nothing.
1787 *
1788 * Otherwise STATE_BYTE_STRIDE::ByteStrideEnable=true and any stride
1789 * change has to be signaled so the new value gets reprogrammed.
1790 */
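   /* For example, VkDrawIndexedIndirectCommand is 20 bytes: an application
    * stride of 20 keeps aligned == true, while a padded stride of 32 makes
    * aligned == false and, on Gfx20, marks ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE
    * below so the new byte stride gets reprogrammed.
    */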
1791 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
1792 if (gfx_state->indirect_data_stride_aligned != aligned) {
1793 gfx_state->indirect_data_stride = stride;
1794 gfx_state->indirect_data_stride_aligned = aligned;
1795 gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1796 } else if (!gfx_state->indirect_data_stride_aligned &&
1797 gfx_state->indirect_data_stride != stride) {
1798 gfx_state->indirect_data_stride = stride;
1799 gfx_state->indirect_data_stride_aligned = aligned;
1800 gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1801 }
1802 #endif
1803
1804 return aligned;
1805 }
1806
1807 static void
1808 genX(cmd_buffer_emit_execute_indirect_draws)(struct anv_cmd_buffer *cmd_buffer,
1809 struct anv_address indirect_data_addr,
1810 uint32_t indirect_data_stride,
1811 struct anv_address count_addr,
1812 uint32_t max_draw_count,
1813 enum vk_cmd_type cmd)
1814 {
1815 #if GFX_VERx10 >= 125
1816 bool aligned_stride =
1817 cmd_buffer_set_indirect_stride(cmd_buffer, indirect_data_stride, cmd);
1818
1819 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1820
1821 if (cmd_buffer->state.conditional_render_enabled)
1822 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1823
1824 uint32_t offset = 0;
1825 for (uint32_t i = 0; i < max_draw_count; i++) {
1826 struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1827 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1828 anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
1829 ind.ArgumentFormat = xi_argument_format_for_vk_cmd(cmd);
1830 ind.TBIMREnabled = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1831 ind.PredicateEnable =
1832 cmd_buffer->state.conditional_render_enabled;
1833 ind.MaxCount = aligned_stride ? max_draw_count : 1;
1834 ind.ArgumentBufferStartAddress = draw;
1835 ind.CountBufferAddress = count_addr;
1836 ind.CountBufferIndirectEnable = !anv_address_is_null(count_addr);
1837 ind.MOCS =
1838 anv_mocs(cmd_buffer->device, draw.bo, 0);
1839
1840 }
1841
1842 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1843 cmd_buffer->device,
1844 cmd_buffer->state.gfx.primitive_topology,
1845 1);
1846 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1847
1848 /* If all the indirect structures are aligned, then we can let the HW
1849 * do the unrolling and we only need one instruction. Otherwise we
1850 * need to emit one instruction per draw, but we're still avoiding
1851 * the register loads with MI commands.
1852 */
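      /* Example: vkCmdDrawIndirect() with drawCount == 100 and the tightly
       * packed 16-byte VkDrawIndirectCommand stride becomes a single
       * EXECUTE_INDIRECT_DRAW with MaxCount == 100, whereas a padded stride
       * (pre-Gfx20) falls back to 100 packets with MaxCount == 1 each.
       */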
1853 if (aligned_stride || GFX_VER >= 20)
1854 break;
1855
1856 offset += indirect_data_stride;
1857 }
1858 #endif // GFX_VERx10 >= 125
1859 }
1860 void genX(CmdDrawIndirect)(
1861 VkCommandBuffer commandBuffer,
1862 VkBuffer _buffer,
1863 VkDeviceSize offset,
1864 uint32_t drawCount,
1865 uint32_t stride)
1866 {
1867 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1868 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1869
1870 if (anv_batch_has_error(&cmd_buffer->batch))
1871 return;
1872
1873 anv_measure_snapshot(cmd_buffer,
1874 INTEL_SNAPSHOT_DRAW,
1875 "draw indirect",
1876 drawCount);
1877 trace_intel_begin_draw_indirect(&cmd_buffer->trace);
1878
1879 struct anv_address indirect_data_addr =
1880 anv_address_add(buffer->address, offset);
1881
1882 stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
1883
1884 if (execute_indirect_draw_supported(cmd_buffer)) {
1885 genX(cmd_buffer_emit_execute_indirect_draws)(
1886 cmd_buffer,
1887 indirect_data_addr,
1888 stride,
1889 ANV_NULL_ADDRESS /* count_addr */,
1890 drawCount,
1891 VK_CMD_DRAW_INDIRECT);
1892 } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1893 genX(cmd_buffer_emit_indirect_generated_draws)(
1894 cmd_buffer,
1895 indirect_data_addr,
1896 stride,
1897 ANV_NULL_ADDRESS /* count_addr */,
1898 drawCount,
1899 false /* indexed */);
1900 } else {
1901 emit_indirect_draws(cmd_buffer,
1902 indirect_data_addr,
1903 stride, drawCount, false /* indexed */);
1904 }
1905
1906 trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
1907 }
1908
1909 void genX(CmdDrawIndexedIndirect)(
1910 VkCommandBuffer commandBuffer,
1911 VkBuffer _buffer,
1912 VkDeviceSize offset,
1913 uint32_t drawCount,
1914 uint32_t stride)
1915 {
1916 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1917 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1918
1919 if (anv_batch_has_error(&cmd_buffer->batch))
1920 return;
1921
1922 anv_measure_snapshot(cmd_buffer,
1923 INTEL_SNAPSHOT_DRAW,
1924 "draw indexed indirect",
1925 drawCount);
1926 trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
1927
1928 struct anv_address indirect_data_addr =
1929 anv_address_add(buffer->address, offset);
1930
1931 stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
1932
1933 if (execute_indirect_draw_supported(cmd_buffer)) {
1934 genX(cmd_buffer_emit_execute_indirect_draws)(
1935 cmd_buffer,
1936 indirect_data_addr,
1937 stride,
1938 ANV_NULL_ADDRESS /* count_addr */,
1939 drawCount,
1940 VK_CMD_DRAW_INDEXED_INDIRECT);
1941 } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1942 genX(cmd_buffer_emit_indirect_generated_draws)(
1943 cmd_buffer,
1944 indirect_data_addr,
1945 stride,
1946 ANV_NULL_ADDRESS /* count_addr */,
1947 drawCount,
1948 true /* indexed */);
1949 } else {
1950 emit_indirect_draws(cmd_buffer,
1951 indirect_data_addr,
1952 stride, drawCount, true /* indexed */);
1953 }
1954
1955 trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
1956 }
1957
1958 #define MI_PREDICATE_SRC0 0x2400
1959 #define MI_PREDICATE_SRC1 0x2408
1960 #define MI_PREDICATE_RESULT 0x2418
1961
1962 static struct mi_value
1963 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1964 struct mi_builder *b,
1965 struct anv_address count_address)
1966 {
1967 struct mi_value ret = mi_imm(0);
1968
1969 if (cmd_buffer->state.conditional_render_enabled) {
1970 ret = mi_new_gpr(b);
1971 mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
1972 } else {
1973 /* Upload the current draw count from the draw parameters buffer to
1974 * MI_PREDICATE_SRC0.
1975 */
1976 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
1977 mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
1978 }
1979
1980 return ret;
1981 }
1982
1983 static void
1984 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1985 struct mi_builder *b,
1986 uint32_t draw_index)
1987 {
1988 /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
1989 mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
1990
1991 if (draw_index == 0) {
1992 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1993 mip.LoadOperation = LOAD_LOADINV;
1994 mip.CombineOperation = COMBINE_SET;
1995 mip.CompareOperation = COMPARE_SRCS_EQUAL;
1996 }
1997 } else {
1998 /* While draw_index < draw_count the compare is FALSE, so the
1999 * predicate's result is FALSE ^ TRUE = TRUE.
2000 * When draw_index == draw_count the result becomes
2001 * TRUE ^ TRUE = FALSE,
2002 * and every draw after that stays
2003 * FALSE ^ FALSE = FALSE.
2004 */
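      /* Illustrative trace with a stored draw count of 2 and
       * max_draw_count == 4:
       *   i=0: LOADINV/SET -> result = (2 != 0)      = TRUE  (draw executes)
       *   i=1: LOAD/XOR    -> TRUE  ^ (2 == 1 ? 1:0) = TRUE  (draw executes)
       *   i=2: LOAD/XOR    -> TRUE  ^ (2 == 2 ? 1:0) = FALSE (draw discarded)
       *   i=3: LOAD/XOR    -> FALSE ^ (2 == 3 ? 1:0) = FALSE (draw discarded)
       */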
2005 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
2006 mip.LoadOperation = LOAD_LOAD;
2007 mip.CombineOperation = COMBINE_XOR;
2008 mip.CompareOperation = COMPARE_SRCS_EQUAL;
2009 }
2010 }
2011 }
2012
2013 static void
2014 emit_draw_count_predicate_with_conditional_render(
2015 struct anv_cmd_buffer *cmd_buffer,
2016 struct mi_builder *b,
2017 uint32_t draw_index,
2018 struct mi_value max)
2019 {
2020 struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
2021 pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
2022
2023 mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
2024 }
2025
2026 static void
2027 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
2028 struct mi_builder *b,
2029 uint32_t draw_index,
2030 struct mi_value max)
2031 {
2032 if (cmd_buffer->state.conditional_render_enabled) {
2033 emit_draw_count_predicate_with_conditional_render(
2034 cmd_buffer, b, draw_index, mi_value_ref(b, max));
2035 } else {
2036 emit_draw_count_predicate(cmd_buffer, b, draw_index);
2037 }
2038 }
2039
2040 static void
2041 emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
2042 struct anv_address indirect_data_addr,
2043 uint64_t indirect_data_stride,
2044 struct anv_address draw_count_addr,
2045 uint32_t max_draw_count,
2046 bool indexed)
2047 {
2048 #if GFX_VER < 11
2049 struct anv_graphics_pipeline *pipeline =
2050 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2051 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
2052 #endif
2053
2054 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2055
2056 struct mi_builder b;
2057 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2058 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
2059 mi_builder_set_mocs(&b, mocs);
2060 struct mi_value max =
2061 prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
2062
2063 for (uint32_t i = 0; i < max_draw_count; i++) {
2064 struct anv_address draw =
2065 anv_address_add(indirect_data_addr, i * indirect_data_stride);
2066
2067 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2068
2069 #if GFX_VER < 11
2070 if (vs_prog_data->uses_firstvertex ||
2071 vs_prog_data->uses_baseinstance) {
2072 emit_base_vertex_instance_bo(cmd_buffer,
2073 anv_address_add(draw, indexed ? 12 : 8));
2074 }
2075 if (vs_prog_data->uses_drawid)
2076 emit_draw_index(cmd_buffer, i);
2077
2078 /* Emitting draw index or vertex index BOs may result in needing
2079 * additional VF cache flushes.
2080 */
2081 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2082 #endif
2083
2084 load_indirect_parameters(cmd_buffer, draw, indexed, i);
2085
2086 /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
2087 * first one was handled by cmd_buffer_flush_gfx_state.
2088 */
2089 if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
2090 genX(emit_hs)(cmd_buffer);
2091 genX(emit_ds)(cmd_buffer);
2092
2093 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
2094 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
2095 #if GFX_VERx10 >= 125
2096 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
2097 #endif
2098 prim.IndirectParameterEnable = true;
2099 prim.PredicateEnable = true;
2100 prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
2101 #if GFX_VER >= 11
2102 prim.ExtendedParametersPresent = true;
2103 #endif
2104 }
2105
2106 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
2107 cmd_buffer->device,
2108 cmd_buffer->state.gfx.primitive_topology,
2109 1);
2110 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
2111
2112 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
2113 }
2114
2115 mi_value_unref(&b, max);
2116 }
2117
2118 void genX(CmdDrawIndirectCount)(
2119 VkCommandBuffer commandBuffer,
2120 VkBuffer _buffer,
2121 VkDeviceSize offset,
2122 VkBuffer _countBuffer,
2123 VkDeviceSize countBufferOffset,
2124 uint32_t maxDrawCount,
2125 uint32_t stride)
2126 {
2127 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2128 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2129 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2130
2131 if (anv_batch_has_error(&cmd_buffer->batch))
2132 return;
2133
2134 anv_measure_snapshot(cmd_buffer,
2135 INTEL_SNAPSHOT_DRAW,
2136 "draw indirect count",
2137 0);
2138 trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
2139
2140 struct anv_address indirect_data_address =
2141 anv_address_add(buffer->address, offset);
2142 struct anv_address count_address =
2143 anv_address_add(count_buffer->address, countBufferOffset);
2144 stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
2145
2146 if (execute_indirect_draw_supported(cmd_buffer)) {
2147 genX(cmd_buffer_emit_execute_indirect_draws)(
2148 cmd_buffer,
2149 indirect_data_address,
2150 stride,
2151 count_address,
2152 maxDrawCount,
2153 VK_CMD_DRAW_INDIRECT_COUNT);
2154 } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2155 genX(cmd_buffer_emit_indirect_generated_draws)(
2156 cmd_buffer,
2157 indirect_data_address,
2158 stride,
2159 count_address,
2160 maxDrawCount,
2161 false /* indexed */);
2162 } else {
2163 emit_indirect_count_draws(cmd_buffer,
2164 indirect_data_address,
2165 stride,
2166 count_address,
2167 maxDrawCount,
2168 false /* indexed */);
2169 }
2170
2171 trace_intel_end_draw_indirect_count(&cmd_buffer->trace,
2172 anv_address_utrace(count_address));
2173 }
2174
2175 void genX(CmdDrawIndexedIndirectCount)(
2176 VkCommandBuffer commandBuffer,
2177 VkBuffer _buffer,
2178 VkDeviceSize offset,
2179 VkBuffer _countBuffer,
2180 VkDeviceSize countBufferOffset,
2181 uint32_t maxDrawCount,
2182 uint32_t stride)
2183 {
2184 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2185 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2186 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2187
2188 if (anv_batch_has_error(&cmd_buffer->batch))
2189 return;
2190
2191 anv_measure_snapshot(cmd_buffer,
2192 INTEL_SNAPSHOT_DRAW,
2193 "draw indexed indirect count",
2194 0);
2195 trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
2196
2197 struct anv_address indirect_data_address =
2198 anv_address_add(buffer->address, offset);
2199 struct anv_address count_address =
2200 anv_address_add(count_buffer->address, countBufferOffset);
2201 stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
2202
2203 if (execute_indirect_draw_supported(cmd_buffer)) {
2204 genX(cmd_buffer_emit_execute_indirect_draws)(
2205 cmd_buffer,
2206 indirect_data_address,
2207 stride,
2208 count_address,
2209 maxDrawCount,
2210 VK_CMD_DRAW_INDEXED_INDIRECT_COUNT);
2211 } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2212 genX(cmd_buffer_emit_indirect_generated_draws)(
2213 cmd_buffer,
2214 indirect_data_address,
2215 stride,
2216 count_address,
2217 maxDrawCount,
2218 true /* indexed */);
2219 } else {
2220 emit_indirect_count_draws(cmd_buffer,
2221 indirect_data_address,
2222 stride,
2223 count_address,
2224 maxDrawCount,
2225 true /* indexed */);
2226 }
2227
2228 trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
2229 anv_address_utrace(count_address));
2230
2231 }
2232
2233 void genX(CmdBeginTransformFeedbackEXT)(
2234 VkCommandBuffer commandBuffer,
2235 uint32_t firstCounterBuffer,
2236 uint32_t counterBufferCount,
2237 const VkBuffer* pCounterBuffers,
2238 const VkDeviceSize* pCounterBufferOffsets)
2239 {
2240 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2241
2242 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2243 assert(counterBufferCount <= MAX_XFB_BUFFERS);
2244 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2245
2246 trace_intel_begin_xfb(&cmd_buffer->trace);
2247
2248 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2249 *
2250 * "Ssoftware must ensure that no HW stream output operations can be in
2251 * process or otherwise pending at the point that the MI_LOAD/STORE
2252 * commands are processed. This will likely require a pipeline flush."
2253 */
2254 anv_add_pending_pipe_bits(cmd_buffer,
2255 ANV_PIPE_CS_STALL_BIT,
2256 "begin transform feedback");
2257 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2258
2259 for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
2260 /* If we have a counter buffer, this is a resume so we need to load the
2261 * value into the streamout offset register. Otherwise, this is a begin
2262 * and we need to reset it to zero.
2263 */
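      /* For example, a vkCmdBeginTransformFeedbackEXT() that passes the
       * counter buffer written by a previous vkCmdEndTransformFeedbackEXT()
       * reloads the saved byte offset here, so new output appends where the
       * previous capture stopped; passing no counter buffer restarts writing
       * at offset 0.
       */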
2264 if (pCounterBuffers &&
2265 idx >= firstCounterBuffer &&
2266 idx - firstCounterBuffer < counterBufferCount &&
2267 pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
2268 uint32_t cb_idx = idx - firstCounterBuffer;
2269 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2270 uint64_t offset = pCounterBufferOffsets ?
2271 pCounterBufferOffsets[cb_idx] : 0;
2272
2273 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2274 lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2275 lrm.MemoryAddress = anv_address_add(counter_buffer->address,
2276 offset);
2277 }
2278 } else {
2279 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2280 lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2281 lri.DataDWord = 0;
2282 }
2283 }
2284 }
2285
2286 cmd_buffer->state.xfb_enabled = true;
2287 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2288 }
2289
2290 void genX(CmdEndTransformFeedbackEXT)(
2291 VkCommandBuffer commandBuffer,
2292 uint32_t firstCounterBuffer,
2293 uint32_t counterBufferCount,
2294 const VkBuffer* pCounterBuffers,
2295 const VkDeviceSize* pCounterBufferOffsets)
2296 {
2297 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2298
2299 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2300 assert(counterBufferCount <= MAX_XFB_BUFFERS);
2301 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2302
2303 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2304 *
2305 * "Ssoftware must ensure that no HW stream output operations can be in
2306 * process or otherwise pending at the point that the MI_LOAD/STORE
2307 * commands are processed. This will likely require a pipeline flush."
2308 */
2309 anv_add_pending_pipe_bits(cmd_buffer,
2310 ANV_PIPE_CS_STALL_BIT,
2311 "end transform feedback");
2312 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2313
2314 for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
2315 unsigned idx = firstCounterBuffer + cb_idx;
2316
2317 /* If we have a counter buffer, this is a pause so we need to store
2318 * the current streamout offset register value into the counter buffer,
2319 * allowing a later resume to reload it. Otherwise there is nothing to save.
2320 */
2321 if (pCounterBuffers &&
2322 cb_idx < counterBufferCount &&
2323 pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
2324 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2325 uint64_t offset = pCounterBufferOffsets ?
2326 pCounterBufferOffsets[cb_idx] : 0;
2327
2328 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2329 srm.MemoryAddress = anv_address_add(counter_buffer->address,
2330 offset);
2331 srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2332 }
2333 }
2334 }
2335
2336 trace_intel_end_xfb(&cmd_buffer->trace);
2337
2338 cmd_buffer->state.xfb_enabled = false;
2339 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2340 }
2341
2342 #if GFX_VERx10 >= 125
2343
2344 void
2345 genX(CmdDrawMeshTasksEXT)(
2346 VkCommandBuffer commandBuffer,
2347 uint32_t x,
2348 uint32_t y,
2349 uint32_t z)
2350 {
2351 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2352
2353 if (anv_batch_has_error(&cmd_buffer->batch))
2354 return;
2355
2356 anv_measure_snapshot(cmd_buffer,
2357 INTEL_SNAPSHOT_DRAW,
2358 "draw mesh", x * y * z);
2359
2360 trace_intel_begin_draw_mesh(&cmd_buffer->trace);
2361
2362 /* TODO(mesh): Check if this is not emitting more packets than we need. */
2363 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2364
2365 if (cmd_buffer->state.conditional_render_enabled)
2366 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2367
2368 anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
2369 m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
2370 m.ThreadGroupCountX = x;
2371 m.ThreadGroupCountY = y;
2372 m.ThreadGroupCountZ = z;
2373 }
2374
2375 trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
2376 }
2377
2378 #define GFX125_3DMESH_TG_COUNT 0x26F0
2379 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
2380
2381 static void
2382 mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
2383 struct mi_builder *b,
2384 struct anv_address addr,
2385 bool emit_xp0,
2386 uint32_t xp0)
2387 {
2388 const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
2389 const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
2390 const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
2391
2392 mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
2393 mi_mem32(anv_address_add(addr, groupCountXOff)));
2394
2395 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
2396 mi_mem32(anv_address_add(addr, groupCountYOff)));
2397
2398 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
2399 mi_mem32(anv_address_add(addr, groupCountZOff)));
2400
2401 if (emit_xp0)
2402 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
2403 }
2404
2405 static void
2406 emit_indirect_3dmesh_3d(struct anv_batch *batch,
2407 bool predicate_enable,
2408 bool uses_drawid)
2409 {
2410 uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
2411 uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
2412 .PredicateEnable = predicate_enable,
2413 .IndirectParameterEnable = true,
2414 .ExtendedParameter0Present = uses_drawid);
2415 if (uses_drawid)
2416 dw[len - 1] = 0;
2417 }
2418
2419 void
2420 genX(CmdDrawMeshTasksIndirectEXT)(
2421 VkCommandBuffer commandBuffer,
2422 VkBuffer _buffer,
2423 VkDeviceSize offset,
2424 uint32_t drawCount,
2425 uint32_t stride)
2426 {
2427 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2428 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2429 struct anv_graphics_pipeline *pipeline =
2430 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2431 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2432 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2433 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
2434
2435 if (anv_batch_has_error(&cmd_buffer->batch))
2436 return;
2437
2438 anv_measure_snapshot(cmd_buffer,
2439 INTEL_SNAPSHOT_DRAW,
2440 "draw mesh indirect", drawCount);
2441
2442 trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
2443
2444 if (execute_indirect_draw_supported(cmd_buffer)) {
2445 genX(cmd_buffer_emit_execute_indirect_draws)(
2446 cmd_buffer,
2447 anv_address_add(buffer->address, offset),
2448 MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2449 ANV_NULL_ADDRESS /* count_addr */,
2450 drawCount,
2451 VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT);
2452
2453 trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2454 return;
2455 }
2456
2457 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2458
2459 if (cmd_state->conditional_render_enabled)
2460 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2461
2462 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2463 mesh_prog_data->uses_drawid;
2464 struct mi_builder b;
2465 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2466
2467 for (uint32_t i = 0; i < drawCount; i++) {
2468 struct anv_address draw = anv_address_add(buffer->address, offset);
2469
2470 mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2471
2472 emit_indirect_3dmesh_3d(&cmd_buffer->batch,
2473 cmd_state->conditional_render_enabled, uses_drawid);
2474
2475 offset += stride;
2476 }
2477
2478 trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2479 }
2480
2481 void
2482 genX(CmdDrawMeshTasksIndirectCountEXT)(
2483 VkCommandBuffer commandBuffer,
2484 VkBuffer _buffer,
2485 VkDeviceSize offset,
2486 VkBuffer _countBuffer,
2487 VkDeviceSize countBufferOffset,
2488 uint32_t maxDrawCount,
2489 uint32_t stride)
2490 {
2491 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2492 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2493 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2494 struct anv_graphics_pipeline *pipeline =
2495 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2496 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2497 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2498
2499 if (anv_batch_has_error(&cmd_buffer->batch))
2500 return;
2501
2502 anv_measure_snapshot(cmd_buffer,
2503 INTEL_SNAPSHOT_DRAW,
2504 "draw mesh indirect count", 0);
2505
2506 trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
2507
2508 struct anv_address count_addr =
2509 anv_address_add(count_buffer->address, countBufferOffset);
2510
2511
2512 if (execute_indirect_draw_supported(cmd_buffer)) {
2513 genX(cmd_buffer_emit_execute_indirect_draws)(
2514 cmd_buffer,
2515 anv_address_add(buffer->address, offset),
2516 MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2517 count_addr /* count_addr */,
2518 maxDrawCount,
2519 VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT);
2520
2521 trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, anv_address_utrace(count_addr));
2522 return;
2523 }
2524
2525 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2526
2527 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2528 mesh_prog_data->uses_drawid;
2529
2530 struct mi_builder b;
2531 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2532 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
2533 mi_builder_set_mocs(&b, mocs);
2534
2535 struct mi_value max =
2536 prepare_for_draw_count_predicate(
2537 cmd_buffer, &b, count_addr);
2538
2539 for (uint32_t i = 0; i < maxDrawCount; i++) {
2540 struct anv_address draw = anv_address_add(buffer->address, offset);
2541
2542 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2543
2544 mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2545
2546 emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
2547
2548 offset += stride;
2549 }
2550
2551 trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace,
2552 anv_address_utrace(count_addr));
2553 }
2554
2555 #endif /* GFX_VERx10 >= 125 */
2556