xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/genX_cmd_draw.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"

#include "ds/intel_tracepoints.h"

#include "genX_mi_builder.h"

static void
cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   VkShaderStageFlags stages = pipeline->base.base.active_stages;

   /* In order to avoid thrashing, we assume that vertex and fragment stages
    * always exist.  In the rare case where one is missing *and* the other
    * uses push constants, this may be suboptimal.  However, avoiding stalls
    * seems more important.
    */
   stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
   if (anv_pipeline_is_primitive(pipeline))
      stages |= VK_SHADER_STAGE_VERTEX_BIT;

   if (stages == cmd_buffer->state.gfx.push_constant_stages)
      return;

   unsigned push_constant_kb;

   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   if (anv_pipeline_is_mesh(pipeline))
      push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
   else
      push_constant_kb = devinfo->max_constant_urb_size_kb;

   const unsigned num_stages =
      util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
   unsigned size_per_stage = push_constant_kb / num_stages;

   /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
    * units of 2KB.  Incidentally, these are the same platforms that have
    * 32KB worth of push constant space.
    */
   if (push_constant_kb == 32)
      size_per_stage &= ~1u;

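   /* Worked example (illustrative only): on a 32KB platform with all five
    * graphics stages active, size_per_stage = 32 / 5 = 6KB, which is
    * already a multiple of 2KB.  The VS..GS loop below then consumes
    * 4 * 6 = 24KB and the PS allocation picks up the remaining 8KB.
    */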
   uint32_t kb_used = 0;
   for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
      const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
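      /* The _VS variant of the packet is reused for all four stages by
       * patching the sub-opcode below: 18 (VS), 19 (HS), 20 (DS), 21 (GS).
       * The PS allocation is emitted separately after the loop.
       */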
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         alloc._3DCommandSubOpcode  = 18 + i;
         alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
         alloc.ConstantBufferSize   = push_size;
      }
      kb_used += push_size;
   }

   anv_batch_emit(&cmd_buffer->batch,
                  GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
      alloc.ConstantBufferOffset = kb_used;
      alloc.ConstantBufferSize = push_constant_kb - kb_used;
   }

#if GFX_VERx10 == 125
   /* DG2: Wa_22011440098
    * MTL: Wa_18022330953
    *
    * In 3D mode, after programming the push constant alloc command,
    * immediately program a push constant command (zero length) without any
    * commit between them.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      c.ShaderUpdateEnable = 0x1f;
      c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
   }
#endif

   cmd_buffer->state.gfx.push_constant_stages = stages;

   /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
    *
    *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
    *    the next 3DPRIMITIVE command after programming the
    *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
    *
    * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
    * pipeline setup, we need to dirty push constants.
    */
   cmd_buffer->state.push_constants_dirty |= stages;
}

static void
cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t stages)
{
   static const uint32_t sampler_state_opcodes[] = {
      [MESA_SHADER_VERTEX]                      = 43,
      [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
      [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
      [MESA_SHADER_GEOMETRY]                    = 46,
      [MESA_SHADER_FRAGMENT]                    = 47,
   };

   static const uint32_t binding_table_opcodes[] = {
      [MESA_SHADER_VERTEX]                      = 38,
      [MESA_SHADER_TESS_CTRL]                   = 39,
      [MESA_SHADER_TESS_EVAL]                   = 40,
      [MESA_SHADER_GEOMETRY]                    = 41,
      [MESA_SHADER_FRAGMENT]                    = 42,
   };
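
   /* As with the push constant allocation packets above, the _VS variants
    * of these pointer packets are reused for every stage by patching
    * _3DCommandSubOpcode from the tables above.
    */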

   anv_foreach_stage(s, stages) {
      assert(s < ARRAY_SIZE(binding_table_opcodes));

      if (cmd_buffer->state.samplers[s].alloc_size > 0) {
         anv_batch_emit(&cmd_buffer->batch,
                        GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
            ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
            ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
         }
      }

      /* Always emit binding table pointers if we're asked to, since on SKL
       * this is what flushes push constants. */
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
         btp._3DCommandSubOpcode = binding_table_opcodes[s];
         btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
      }
   }
}

static struct anv_address
get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
                       const struct anv_shader_bin *shader,
                       const struct anv_push_range *range)
{
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      /* This is a descriptor set buffer so the set index is
       * actually given by binding->binding.  (Yes, that's
       * confusing.)
       */
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      return anv_descriptor_set_address(set);
   }

   case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
      return anv_address_from_u64(
         anv_cmd_buffer_descriptor_buffer_address(
            cmd_buffer,
            gfx_state->base.descriptor_buffers[range->index].buffer_index) +
         gfx_state->base.descriptor_buffers[range->index].buffer_offset);
   }

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
      if (gfx_state->base.push_constants_state.alloc_size == 0) {
         gfx_state->base.push_constants_state =
            anv_cmd_buffer_gfx_push_constants(cmd_buffer);
      }
      return anv_cmd_buffer_temporary_state_address(
         cmd_buffer, gfx_state->base.push_constants_state);
   }

   default: {
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         if (desc->buffer) {
            return anv_address_add(desc->buffer->address,
                                   desc->offset);
         }
      } else {
         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         if (desc->buffer) {
            const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
            uint32_t dynamic_offset =
               pipe_state->dynamic_offsets[
                  range->set].offsets[range->dynamic_offset_index];
            return anv_address_add(desc->buffer->address,
                                   desc->offset + dynamic_offset);
         }
      }

      /* For NULL UBOs, we just return an address in the workaround BO.  We do
       * writes to it for workarounds but always at the bottom.  The higher
       * bytes should be all zeros.
       */
      assert(range->length * 32 <= 2048);
      return cmd_buffer->device->workaround_address;
   }
   }
}


/** Returns the size in bytes of the bound buffer
 *
 * The range is relative to the start of the buffer, not the start of the
 * range.  The returned size may be smaller than
 *
 *    (range->start + range->length) * 32;
 */
static uint32_t
get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_shader_bin *shader,
                          const struct anv_push_range *range)
{
   assert(shader->stage != MESA_SHADER_COMPUTE);
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   switch (range->set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->index];
      struct anv_state state = set->desc_surface_mem;
      assert(range->start * 32 < state.alloc_size);
      assert((range->start + range->length) * 32 <= state.alloc_size);
      return state.alloc_size;
   }

   case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
      return gfx_state->base.pipeline->layout.set[
         range->index].layout->descriptor_buffer_surface_size;

   case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
      return (range->start + range->length) * 32;

   default: {
      assert(range->set < MAX_SETS);
      struct anv_descriptor_set *set =
         gfx_state->base.descriptors[range->set];
      const struct anv_descriptor *desc =
         &set->descriptors[range->index];

      if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         /* Here we promote a UBO to a binding table entry so that we can
          * avoid a layer of indirection.  We use the descriptor set's
          * internally allocated surface state to fill the binding table
          * entry.
          */
         if (!desc->buffer)
            return 0;

         if (range->start * 32 > desc->bind_range)
            return 0;

         return desc->bind_range;
      } else {
         if (!desc->buffer)
            return 0;

         assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
         /* Compute the offset within the buffer */
         const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
         uint32_t dynamic_offset =
            pipe_state->dynamic_offsets[
               range->set].offsets[range->dynamic_offset_index];
         uint64_t offset = desc->offset + dynamic_offset;
         /* Clamp to the buffer size */
         offset = MIN2(offset, desc->buffer->vk.size);
         /* Clamp the range to the buffer size */
         uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);

         /* Align the range for consistency */
         bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
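         /* Illustrative example (assuming ANV_UBO_ALIGNMENT == 64): a
          * descriptor with range 512 bound at dynamic offset 64 into a
          * 256-byte buffer clamps to MIN2(512, 256 - 64) = 192 bytes,
          * already 64-byte aligned, so align() leaves it unchanged.
          */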

         return bound_range;
      }
   }
   }
}

static void
cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
                              gl_shader_stage stage,
                              struct anv_address *buffers,
                              unsigned buffer_count)
{
   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

   static const uint32_t push_constant_opcodes[] = {
      [MESA_SHADER_VERTEX]                      = 21,
      [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
      [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
      [MESA_SHADER_GEOMETRY]                    = 22,
      [MESA_SHADER_FRAGMENT]                    = 23,
   };

   assert(stage < ARRAY_SIZE(push_constant_opcodes));

   UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
      c._3DCommandSubOpcode = push_constant_opcodes[stage];

      /* Set MOCS.
       *
       * We only have one MOCS field for the whole packet, not one per
       * buffer.  We could go out of our way here to walk over all of
       * the buffers and see if any of them are used externally and use
       * the external MOCS.  However, the notion that someone would use
       * the same bit of memory for both scanout and a UBO is nuts.
       *
       * Let's not bother and assume it's all internal.
       */
      c.MOCS = mocs;

      if (anv_pipeline_has_stage(pipeline, stage)) {
         const struct anv_pipeline_bind_map *bind_map =
            &pipeline->base.shaders[stage]->bind_map;

         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         assert(buffer_count <= 4);
         const unsigned shift = 4 - buffer_count;
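         /* e.g. with buffer_count == 2, shift == 2 and the two ranges land
          * in slots 2 and 3, so slot 0 is never used unless slot 3 is (a
          * sketch of the rule quoted above).
          */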
         for (unsigned i = 0; i < buffer_count; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];

            /* At this point we only have non-empty ranges */
            assert(range->length > 0);

            c.ConstantBody.ReadLength[i + shift] = range->length;
            c.ConstantBody.Buffer[i + shift] =
               anv_address_add(buffers[i], range->start * 32);
         }
      }
   }
}

#if GFX_VER >= 12
static void
emit_null_push_constant_tbimr_workaround(struct anv_cmd_buffer *cmd_buffer)
{
   /* Pass a single-register push constant payload for the PS
    * stage even if empty, since PS invocations with zero push
    * constant cycles have been found to cause hangs with TBIMR
    * enabled.  See HSDES #22020184996.
    *
    * XXX - Use workaround infrastructure and final workaround
    *       when provided by hardware team.
    */
   const struct anv_address null_addr = cmd_buffer->device->workaround_address;
   uint32_t *dw = anv_batch_emitn(
      &cmd_buffer->batch, 4,
      GENX(3DSTATE_CONSTANT_ALL),
      .ShaderUpdateEnable = (1 << MESA_SHADER_FRAGMENT),
      .PointerBufferMask = 1,
      .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
   GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
      &cmd_buffer->batch, dw + 2,
      &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
         .PointerToConstantBuffer = null_addr,
         .ConstantBufferReadLength = 1,
      });
}

static void
cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t shader_mask,
                                  struct anv_address *buffers,
                                  uint32_t buffer_count)
{
   if (buffer_count == 0) {
      if (cmd_buffer->device->info->needs_null_push_constant_tbimr_workaround &&
          (shader_mask & (1 << MESA_SHADER_FRAGMENT))) {
         emit_null_push_constant_tbimr_workaround(cmd_buffer);
         shader_mask &= ~(1 << MESA_SHADER_FRAGMENT);
      }

      if (shader_mask) {
         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
            c.ShaderUpdateEnable = shader_mask;
            c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
         }
      }

      return;
   }

   const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

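   /* When buffers are provided, the callers pass exactly one stage bit in
    * shader_mask (see the 1 << stage call site below), so converting the
    * mask to a single mesa stage is safe here.
    */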
   gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);

   const struct anv_pipeline_bind_map *bind_map =
      &pipeline->base.shaders[stage]->bind_map;

   uint32_t *dw;
   const uint32_t buffer_mask = (1 << buffer_count) - 1;
   const uint32_t num_dwords = 2 + 2 * buffer_count;

   dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                        GENX(3DSTATE_CONSTANT_ALL),
                        .ShaderUpdateEnable = shader_mask,
                        .PointerBufferMask = buffer_mask,
                        .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));

   for (int i = 0; i < buffer_count; i++) {
      const struct anv_push_range *range = &bind_map->push_ranges[i];
      GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
         &cmd_buffer->batch, dw + 2 + i * 2,
         &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
            .PointerToConstantBuffer =
               anv_address_add(buffers[i], range->start * 32),
            .ConstantBufferReadLength = range->length,
         });
   }
}
#endif

static void
cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
                                    VkShaderStageFlags dirty_stages)
{
   VkShaderStageFlags flushed = 0;
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

#if GFX_VER >= 12
   uint32_t nobuffer_stages = 0;
#endif

   /* Compute robust pushed register access mask for each stage. */
   anv_foreach_stage(stage, dirty_stages) {
      if (!anv_pipeline_has_stage(pipeline, stage))
         continue;

      const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
      if (shader->prog_data->zero_push_reg) {
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
         struct anv_push_constants *push = &gfx_state->base.push_constants;

         push->push_reg_mask[stage] = 0;
         /* Start of the current range in the shader, relative to the start of
          * push constants in the shader.
          */
         unsigned range_start_reg = 0;
         for (unsigned i = 0; i < 4; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               continue;

            unsigned bound_size =
               get_push_range_bound_size(cmd_buffer, shader, range);
            if (bound_size >= range->start * 32) {
               unsigned bound_regs =
                  MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
                       range->length);
               assert(range_start_reg + bound_regs <= 64);
               push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
                                                              bound_regs);
            }
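            /* Illustrative example: a range with start == 0 and length == 4
             * registers backed by a 72-byte buffer gives bound_regs ==
             * MIN2(DIV_ROUND_UP(72, 32), 4) == 3, so only bits 0..2 are set
             * and the unbound fourth register is left out of the mask (which
             * is what zero_push_reg appears to consume).
             */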

            cmd_buffer->state.push_constants_dirty |=
               mesa_to_vk_shader_stage(stage);

            range_start_reg += range->length;
         }
      }
   }

   /* Setting NULL resets the push constant state so that we allocate a new
    * one if needed.  If the push constant data is not dirty,
    * get_push_range_address can reuse the existing allocation.
    *
    * Always reallocate on gfx9 and gfx11 to fix push constant related flaky
    * tests.  See https://gitlab.freedesktop.org/mesa/mesa/-/issues/11064
    */
   if (gfx_state->base.push_constants_data_dirty || GFX_VER < 12)
      gfx_state->base.push_constants_state = ANV_STATE_NULL;

   anv_foreach_stage(stage, dirty_stages) {
      unsigned buffer_count = 0;
      flushed |= mesa_to_vk_shader_stage(stage);
      UNUSED uint32_t max_push_range = 0;

      struct anv_address buffers[4] = {};
      if (anv_pipeline_has_stage(pipeline, stage)) {
         const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;

         /* We have to gather buffer addresses as a second step because the
          * loop above puts data into the push constant area and the call to
          * get_push_range_address is what locks our push constants and copies
          * them into the actual GPU buffer.  If we did the two loops at the
          * same time, we'd risk only having some of the sizes in the push
          * constant buffer when we did the copy.
          */
         for (unsigned i = 0; i < 4; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               break;

            buffers[i] = get_push_range_address(cmd_buffer, shader, range);
            max_push_range = MAX2(max_push_range, range->length);
            buffer_count++;
         }

         /* We have at most 4 buffers but they should be tightly packed */
         for (unsigned i = buffer_count; i < 4; i++)
            assert(bind_map->push_ranges[i].length == 0);
      }

#if GFX_VER >= 12
      /* If this stage doesn't have any push constants, emit it later in a
       * single CONSTANT_ALL packet.
       */
      if (buffer_count == 0) {
         nobuffer_stages |= 1 << stage;
         continue;
      }

      /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
       * contains only 5 bits, so we can only use it for buffers smaller than
       * 32.
       *
       * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
       * in 3DSTATE_CONSTANT_ALL.  It should still be safe to use the command
       * for disabling stages, where all address bits are zero.  However, we
       * can't safely use it for general buffers with arbitrary addresses.
       * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
       * case.
       */
      if (max_push_range < 32 && GFX_VERx10 > 120) {
         cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
                                           buffers, buffer_count);
         continue;
      }
#endif

      cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
   }

#if GFX_VER >= 12
   if (nobuffer_stages)
      /* Wa_16011448509: all address bits are zero */
      cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
#endif

   cmd_buffer->state.push_constants_dirty &= ~flushed;
   gfx_state->base.push_constants_data_dirty = false;
}

#if GFX_VERx10 >= 125
static void
cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
                                  VkShaderStageFlags dirty_stages)
{
   struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx_state->base.pipeline);

   if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
       anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {

      const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
      const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;

      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
         const struct anv_push_range *range = &bind_map->push_ranges[0];
         if (range->length > 0) {
            struct anv_address buffer =
               get_push_range_address(cmd_buffer, shader, range);

            uint64_t addr = anv_address_physical(buffer);
            data.InlineData[0] = addr & 0xffffffff;
            data.InlineData[1] = addr >> 32;

            memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
                   cmd_buffer->state.gfx.base.push_constants.client_data,
                   BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
         }
      }
   }

   if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
       anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {

      const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
      const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;

      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
         const struct anv_push_range *range = &bind_map->push_ranges[0];
         if (range->length > 0) {
            struct anv_address buffer =
               get_push_range_address(cmd_buffer, shader, range);

            uint64_t addr = anv_address_physical(buffer);
            data.InlineData[0] = addr & 0xffffffff;
            data.InlineData[1] = addr >> 32;

            memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
                   cmd_buffer->state.gfx.base.push_constants.client_data,
                   BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
         }
      }
   }

   cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
}
#endif

ALWAYS_INLINE static void
genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      return;

   anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
}

ALWAYS_INLINE static void
genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
{
#if INTEL_NEEDS_WA_22018402687
   /* Wa_22018402687:
    *   In any 3D enabled context, just before any Tessellation enabled draw
    *   call (3D Primitive), re-send the last programmed 3DSTATE_DS again.
    *   This will make sure that the 3DSTATE_INT generated just before the
    *   draw call will have TDS dirty which will make sure TDS will launch the
    *   state thread before the draw call.
    *
    * This fixes a hang resulting from running anything using tessellation
    * after a switch away from the mesh pipeline.
    * We don't need to track said switch, as it matters at the HW level, and
    * can be triggered even across processes, so we apply the Wa at all times.
    */
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      return;

   const bool protected = cmd_buffer->vk.pool->flags &
                          VK_COMMAND_POOL_CREATE_PROTECTED_BIT;

   anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
                                           final.ds, protected);
#endif
}

ALWAYS_INLINE static void
genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
   uint32_t *p;

   assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);

   genX(cmd_buffer_update_color_aux_op)(cmd_buffer, ISL_AUX_OP_NONE);

   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);

   genX(flush_descriptor_buffers)(cmd_buffer, &cmd_buffer->state.gfx.base);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   /* Wa_14015814527
    *
    * Apply task URB workaround when switching from task to primitive.
    */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
      if (anv_pipeline_is_primitive(pipeline)) {
         genX(apply_task_urb_workaround)(cmd_buffer);
      } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
         cmd_buffer->state.gfx.used_task_shader = true;
      }
   }

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   /* Check what vertex buffers have been rebound against the set of bindings
    * being used by the current set of vertex attributes.
    */
   uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
   /* If the pipeline changed, then we have to consider all the valid bindings. */
   if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
      vb_emit |= dyn->vi->bindings_valid;

   if (vb_emit) {
      const uint32_t num_buffers = __builtin_popcount(vb_emit);
      const uint32_t num_dwords = 1 + num_buffers * 4;
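      /* Packet layout note: 3DSTATE_VERTEX_BUFFERS is a one-dword header
       * followed by one 4-dword VERTEX_BUFFER_STATE entry per buffer, hence
       * the dword count above and the &p[1 + i * 4] indexing below.
       */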

      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                          GENX(3DSTATE_VERTEX_BUFFERS));
      uint32_t i = 0;
      u_foreach_bit(vb, vb_emit) {
         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;

         struct GENX(VERTEX_BUFFER_STATE) state;
         if (buffer) {
            uint32_t stride = dyn->vi_binding_strides[vb];
            UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;

            state = (struct GENX(VERTEX_BUFFER_STATE)) {
               .VertexBufferIndex = vb,

               .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
               .AddressModifyEnable = true,
               .BufferPitch = stride,
               .BufferStartingAddress = anv_address_add(buffer->address, offset),
               .NullVertexBuffer = offset >= buffer->vk.size,
#if GFX_VER >= 12
               .L3BypassDisable = true,
#endif

               .BufferSize = size,
            };
         } else {
            state = (struct GENX(VERTEX_BUFFER_STATE)) {
               .VertexBufferIndex = vb,
               .NullVertexBuffer = true,
               .MOCS = anv_mocs(cmd_buffer->device, NULL,
                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
            };
         }

#if GFX_VER == 9
         genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
                                                        state.BufferStartingAddress,
                                                        state.BufferSize);
#endif

         GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
         i++;
      }
   }

   cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;

   const bool any_dynamic_state_dirty =
      vk_dynamic_graphics_state_any_dirty(dyn);
   uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
                                pipeline->base.base.active_stages;

   descriptors_dirty |=
      genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                              &cmd_buffer->state.gfx.base,
                                              &pipeline->base.base);

   /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive. */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE ||
       (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343)) {
      genX(emit_hs)(cmd_buffer);
   }

   if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
       !any_dynamic_state_dirty &&
       ((cmd_buffer->state.push_constants_dirty &
         (VK_SHADER_STAGE_ALL_GRAPHICS |
          VK_SHADER_STAGE_TASK_BIT_EXT |
          VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
      return;

   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
      /* Wa_16011411144:
       *
       * SW must insert a PIPE_CONTROL cmd before and after the
       * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
       * state is not combined with other state changes.
       */
      if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_CS_STALL_BIT,
                                   "before SO_BUFFER change WA");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }

      /* We don't need any per-buffer dirty tracking because you're not
       * allowed to bind different XFB buffers while XFB is enabled.
       */
      for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
         struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
#if GFX_VER < 12
            sob.SOBufferIndex = idx;
#else
            sob._3DCommandOpcode = 0;
            sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
#endif

            if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
               sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
                                   ISL_SURF_USAGE_STREAM_OUT_BIT);
               sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
                                                        xfb->offset);
               sob.SOBufferEnable = true;
               sob.StreamOffsetWriteEnable = false;
               /* Size is in DWords - 1 */
               sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
            } else {
               sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
            }
         }
      }

      if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
         /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_CS_STALL_BIT,
                                   "after SO_BUFFER change WA");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      } else if (GFX_VER >= 10) {
         /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_CS_STALL_BIT,
                                   "after 3DSTATE_SO_BUFFER call");
      }
   }

   /* State left dirty after flushing runtime state. */
   anv_cmd_dirty_mask_t dirty_state_mask = 0;

   /* Flush the runtime state into the HW state tracking */
   if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
      dirty_state_mask = genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);

   /* Flush the HW state into the command buffer */
   if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
      genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);

   /* If the pipeline changed, we may need to re-allocate push constant space
    * in the URB.
    */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
      cmd_buffer_alloc_gfx_push_constants(cmd_buffer);

      /* Also add the relocations (scratch buffers) */
      VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
                                              pipeline->base.base.batch.relocs);
      if (result != VK_SUCCESS) {
         anv_batch_set_error(&cmd_buffer->batch, result);
         return;
      }
   }

   /* Render targets live in the same binding table as fragment descriptors */
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
      descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;

   /* We emit the binding tables and sampler tables first, then emit push
    * constants and then finally emit binding table and sampler table
    * pointers.  It has to happen in this order, since emitting the binding
    * tables may change the push constants (in case of storage images). After
    * emitting push constants, on SKL+ we have to emit the corresponding
    * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
    */
   uint32_t dirty = 0;
   if (descriptors_dirty) {
      dirty = genX(cmd_buffer_flush_descriptor_sets)(
         cmd_buffer,
         &cmd_buffer->state.gfx.base,
         descriptors_dirty,
         pipeline->base.shaders,
         ARRAY_SIZE(pipeline->base.shaders));
      cmd_buffer->state.descriptors_dirty &= ~dirty;
   }

   if (dirty || cmd_buffer->state.push_constants_dirty) {
      /* Because we're pushing UBOs, we have to push whenever either
       * descriptors or push constants is dirty.
       */
      dirty |= cmd_buffer->state.push_constants_dirty &
               pipeline->base.base.active_stages;
      cmd_buffer_flush_gfx_push_constants(cmd_buffer,
                                          dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
#if GFX_VERx10 >= 125
      cmd_buffer_flush_mesh_inline_data(
         cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
                              VK_SHADER_STAGE_MESH_BIT_EXT));
#endif
   }

   if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
      cmd_buffer_emit_descriptor_pointers(cmd_buffer,
                                          dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
   }

#if GFX_VER >= 20
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE) {
      anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BYTE_STRIDE), sb_stride) {
         sb_stride.ByteStride = cmd_buffer->state.gfx.indirect_data_stride;
         sb_stride.ByteStrideEnable = !cmd_buffer->state.gfx.indirect_data_stride_aligned;
      }
   }
#endif

   /* When we're done, the only thing left dirty is the state possibly
    * returned by cmd_buffer_flush_gfx_runtime_state.
    */
   cmd_buffer->state.gfx.dirty = dirty_state_mask;
}

ALWAYS_INLINE static bool
anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
{
   const struct anv_device *device = cmd_buffer->device;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   /* We cannot generate readable commands in protected mode. */
   if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
      return false;

   /* Limit generated draws to pipelines without HS stage. This makes things
    * simpler for implementing Wa_1306463417, Wa_16011107343.
    */
   if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
      return false;

   return count >= device->physical->instance->generated_indirect_threshold;
}

#include "genX_cmd_draw_helpers.h"
#include "genX_cmd_draw_generated_indirect.h"

#if GFX_VER >= 11
#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
#else
#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
#endif

void genX(CmdDraw)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    vertexCount,
    uint32_t                                    instanceCount,
    uint32_t                                    firstVertex,
    uint32_t                                    firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   const uint32_t count =
      vertexCount * instanceCount * pipeline->instance_multiplier;
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw", count);
   trace_intel_begin_draw(&cmd_buffer->trace);

   /* Select pipeline here to allow
    * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
    * cmd_buffer_flush_gfx_state().
    */
   genX(flush_pipeline_select_3d)(cmd_buffer);

#if GFX_VER < 11
   cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
                                              get_vs_prog_data(pipeline),
                                              firstVertex, firstInstance, 0,
                                              false /* force_flush */);
#endif

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
   genX(emit_ds)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

   anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
      prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
#if GFX_VERx10 >= 125
      prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
      prim.VertexAccessType         = SEQUENTIAL;
      prim.VertexCountPerInstance   = vertexCount;
      prim.StartVertexLocation      = firstVertex;
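      /* InstanceCount is scaled by instance_multiplier, which anv appears to
       * use to replicate instances when multiview is implemented via
       * instancing (an explanatory note inferred from its use here, not
       * upstream documentation).
       */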
      prim.InstanceCount            = instanceCount *
                                      pipeline->instance_multiplier;
      prim.StartInstanceLocation    = firstInstance;
      prim.BaseVertexLocation       = 0;
#if GFX_VER >= 11
      prim.ExtendedParametersPresent = true;
      prim.ExtendedParameter0       = firstVertex;
      prim.ExtendedParameter1       = firstInstance;
      prim.ExtendedParameter2       = 0;
#endif
   }

   genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                         cmd_buffer->device,
                                         cmd_buffer->state.gfx.primitive_topology,
                                         vertexCount);
   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);

   trace_intel_end_draw(&cmd_buffer->trace, count);
}

void genX(CmdDrawMultiEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    drawCount,
    const VkMultiDrawInfoEXT                   *pVertexInfo,
    uint32_t                                    instanceCount,
    uint32_t                                    firstInstance,
    uint32_t                                    stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   UNUSED struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   uint32_t i = 0;
#if GFX_VER < 11
   vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
      cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
                                                 get_vs_prog_data(pipeline),
                                                 draw->firstVertex,
                                                 firstInstance, i, !i);

      const uint32_t count =
         draw->vertexCount * instanceCount * pipeline->instance_multiplier;
      anv_measure_snapshot(cmd_buffer,
                           INTEL_SNAPSHOT_DRAW,
                           "draw multi", count);
      trace_intel_begin_draw_multi(&cmd_buffer->trace);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType         = SEQUENTIAL;
         prim.VertexCountPerInstance   = draw->vertexCount;
         prim.StartVertexLocation      = draw->firstVertex;
         prim.InstanceCount            = instanceCount *
                                         pipeline->instance_multiplier;
         prim.StartInstanceLocation    = firstInstance;
         prim.BaseVertexLocation       = 0;
      }

      genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                            cmd_buffer->device,
                                            cmd_buffer->state.gfx.primitive_topology,
                                            drawCount == 0 ? 0 :
                                            pVertexInfo[drawCount - 1].vertexCount);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
      trace_intel_end_draw_multi(&cmd_buffer->trace, count);
   }
#else
   vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {

      /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
       * first one was handled by cmd_buffer_flush_gfx_state.
       */
      if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
         genX(emit_hs)(cmd_buffer);
      genX(emit_ds)(cmd_buffer);

      const uint32_t count = draw->vertexCount * instanceCount;
      anv_measure_snapshot(cmd_buffer,
                           INTEL_SNAPSHOT_DRAW,
                           "draw multi", count);
      trace_intel_begin_draw_multi(&cmd_buffer->trace);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

      anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
#if GFX_VERx10 >= 125
         prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
         prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
         prim.VertexAccessType         = SEQUENTIAL;
         prim.VertexCountPerInstance   = draw->vertexCount;
         prim.StartVertexLocation      = draw->firstVertex;
         prim.InstanceCount            = instanceCount;
         prim.StartInstanceLocation    = firstInstance;
         prim.BaseVertexLocation       = 0;
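         /* The extended parameters carry the firstVertex / firstInstance /
          * draw-index values consumed by the shader on Gfx11+, which is why
          * the Gfx < 11 vertex-constant path above is not needed here.
          */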
         prim.ExtendedParametersPresent = true;
         prim.ExtendedParameter0       = draw->firstVertex;
         prim.ExtendedParameter1       = firstInstance;
         prim.ExtendedParameter2       = i;
      }

      genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                            cmd_buffer->device,
                                            cmd_buffer->state.gfx.primitive_topology,
                                            drawCount == 0 ? 0 :
                                            pVertexInfo[drawCount - 1].vertexCount);

      genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
      trace_intel_end_draw_multi(&cmd_buffer->trace, count);
   }
#endif

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
}

void genX(CmdDrawIndexed)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    indexCount,
    uint32_t                                    instanceCount,
    uint32_t                                    firstIndex,
    int32_t                                     vertexOffset,
    uint32_t                                    firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   const uint32_t count =
      indexCount * instanceCount * pipeline->instance_multiplier;
   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_DRAW,
                        "draw indexed",
                        count);
   trace_intel_begin_draw_indexed(&cmd_buffer->trace);

   /* Select pipeline here to allow
    * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
    * cmd_buffer_flush_gfx_state().
    */
   genX(flush_pipeline_select_3d)(cmd_buffer);

#if GFX_VER < 11
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
                                              vertexOffset, firstInstance,
                                              0, false /* force_flush */);
#endif

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);

   anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
      prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
#if GFX_VERx10 >= 125
      prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
#endif
      prim.VertexAccessType         = RANDOM;
      prim.VertexCountPerInstance   = indexCount;
      prim.StartVertexLocation      = firstIndex;
      prim.InstanceCount            = instanceCount *
                                      pipeline->instance_multiplier;
      prim.StartInstanceLocation    = firstInstance;
      prim.BaseVertexLocation       = vertexOffset;
#if GFX_VER >= 11
      prim.ExtendedParametersPresent = true;
      prim.ExtendedParameter0       = vertexOffset;
      prim.ExtendedParameter1       = firstInstance;
      prim.ExtendedParameter2       = 0;
#endif
   }

   genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                         cmd_buffer->device,
                                         cmd_buffer->state.gfx.primitive_topology,
                                         indexCount);
   genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);

   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);

   trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
}

void genX(CmdDrawMultiIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    drawCount,
    const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
    uint32_t                                    instanceCount,
    uint32_t                                    firstInstance,
    uint32_t                                    stride,
    const int32_t                              *pVertexOffset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   uint32_t i = 0;
#if GFX_VER < 11
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   if (pVertexOffset) {
      if (vs_prog_data->uses_drawid) {
         bool emitted = true;
         if (vs_prog_data->uses_firstvertex ||
             vs_prog_data->uses_baseinstance) {
            emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
            emitted = true;
         }
         vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
            if (vs_prog_data->uses_drawid) {
               emit_draw_index(cmd_buffer, i);
               emitted = true;
            }
            /* Emitting draw index or vertex index BOs may result in needing
             * additional VF cache flushes.
             */
            if (emitted)
               genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

            const uint32_t count =
               draw->indexCount * instanceCount * pipeline->instance_multiplier;
            anv_measure_snapshot(cmd_buffer,
                                 INTEL_SNAPSHOT_DRAW,
                                 "draw indexed multi",
                                 count);
            trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
            genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
                                  true);

            anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
               prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
               prim.VertexAccessType         = RANDOM;
               prim.VertexCountPerInstance   = draw->indexCount;
               prim.StartVertexLocation      = draw->firstIndex;
               prim.InstanceCount            = instanceCount *
                                               pipeline->instance_multiplier;
               prim.StartInstanceLocation    = firstInstance;
               prim.BaseVertexLocation       = *pVertexOffset;
            }

            genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
                                                  cmd_buffer->device,
                                                  cmd_buffer->state.gfx.primitive_topology,
                                                  drawCount == 0 ? 0 :
                                                  pIndexInfo[drawCount - 1].indexCount);

            genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
                                  false);
            trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
            emitted = false;
         }
      } else {
         if (vs_prog_data->uses_firstvertex ||
             vs_prog_data->uses_baseinstance) {
            emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
            /* Emitting draw index or vertex index BOs may result in needing
             * additional VF cache flushes.
             */
            genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
         }
         vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
            const uint32_t count =
               draw->indexCount * instanceCount * pipeline->instance_multiplier;
            anv_measure_snapshot(cmd_buffer,
                                 INTEL_SNAPSHOT_DRAW,
                                 "draw indexed multi",
                                 count);
            trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
            genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
                                  true);

            anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
               prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
               prim.VertexAccessType         = RANDOM;
               prim.VertexCountPerInstance   = draw->indexCount;
1336                prim.StartVertexLocation      = draw->firstIndex;
1337                prim.InstanceCount            = instanceCount *
1338                                                pipeline->instance_multiplier;
1339                prim.StartInstanceLocation    = firstInstance;
1340                prim.BaseVertexLocation       = *pVertexOffset;
1341             }
1342 
1343             genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1344                                                   cmd_buffer->device,
1345                                                   cmd_buffer->state.gfx.primitive_topology,
1346                                                   drawCount == 0 ? 0 :
1347                                                   pIndexInfo[drawCount - 1].indexCount);
1348 
1349             genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1350                                   false);
1351             trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1352          }
1353       }
1354    } else {
1355       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1356          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1357                                                     draw->vertexOffset,
1358                                                     firstInstance, i, i != 0);
1359 
1360          const uint32_t count =
1361             draw->indexCount * instanceCount * pipeline->instance_multiplier;
1362          anv_measure_snapshot(cmd_buffer,
1363                               INTEL_SNAPSHOT_DRAW,
1364                               "draw indexed multi",
1365                               count);
1366          trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1367          genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1368 
1369          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1370             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1371             prim.VertexAccessType         = RANDOM;
1372             prim.VertexCountPerInstance   = draw->indexCount;
1373             prim.StartVertexLocation      = draw->firstIndex;
1374             prim.InstanceCount            = instanceCount *
1375                                             pipeline->instance_multiplier;
1376             prim.StartInstanceLocation    = firstInstance;
1377             prim.BaseVertexLocation       = draw->vertexOffset;
1378          }
1379 
1380          genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1381                                                cmd_buffer->device,
1382                                                cmd_buffer->state.gfx.primitive_topology,
1383                                                drawCount == 0 ? 0 :
1384                                                pIndexInfo[drawCount - 1].indexCount);
1385 
1386          genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1387          trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1388       }
1389    }
1390 #else
1391    vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1392 
1393       /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1394        * first one was handled by cmd_buffer_flush_gfx_state.
1395        */
1396       if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1397          genX(emit_hs)(cmd_buffer);
1398       genX(emit_ds)(cmd_buffer);
1399 
1400       const uint32_t count =
1401          draw->indexCount * instanceCount * pipeline->instance_multiplier;
1402       anv_measure_snapshot(cmd_buffer,
1403                            INTEL_SNAPSHOT_DRAW,
1404                            "draw indexed multi",
1405                            count);
1406       trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1407       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1408 
1409       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
1410 #if GFX_VERx10 >= 125
1411          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1412 #endif
1413          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1414          prim.VertexAccessType         = RANDOM;
1415          prim.VertexCountPerInstance   = draw->indexCount;
1416          prim.StartVertexLocation      = draw->firstIndex;
1417          prim.InstanceCount            = instanceCount *
1418                                          pipeline->instance_multiplier;
1419          prim.StartInstanceLocation    = firstInstance;
1420          prim.BaseVertexLocation       = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1421          prim.ExtendedParametersPresent = true;
1422          prim.ExtendedParameter0       = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1423          prim.ExtendedParameter1       = firstInstance;
1424          prim.ExtendedParameter2       = i;
1425       }
1426 
1427       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1428                                             cmd_buffer->device,
1429                                             cmd_buffer->state.gfx.primitive_topology,
1430                                             drawCount == 0 ? 0 :
1431                                             pIndexInfo[drawCount - 1].indexCount);
1432 
1433       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1434       trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1435    }
1436 #endif
1437 
1438    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
1439 }
1440 
1441 /* Auto-Draw / Indirect Registers */
1442 #define GFX7_3DPRIM_END_OFFSET          0x2420
1443 #define GFX7_3DPRIM_START_VERTEX        0x2430
1444 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
1445 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
1446 #define GFX7_3DPRIM_START_INSTANCE      0x243C
1447 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
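
/* These MMIO registers hold the arguments of an indirect 3DPRIMITIVE: a
 * draw with IndirectParameterEnable=true consumes whatever was last loaded
 * into them.  A minimal sketch of the usual sequence, using the mi_builder
 * helpers already used throughout this file:
 *
 *    struct mi_builder b;
 *    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
 *    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),   mi_imm(3));
 *    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(1));
 *    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),   mi_imm(0));
 *    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(0));
 *    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),    mi_imm(0));
 *    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 *       prim.IndirectParameterEnable = true;
 *       prim.VertexAccessType        = SEQUENTIAL;
 *    }
 */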
1448 
1449 /* On Gen11+, we have three custom "extended parameters" which we can use to
1450  * provide extra system-generated values to shaders.  Our assignment of these
1451  * is arbitrary; we choose to assign them as follows:
1452  *
1453  *    gl_BaseVertex = XP0
1454  *    gl_BaseInstance = XP1
1455  *    gl_DrawID = XP2
1456  *
1457  * For gl_BaseInstance, we never actually have to set up the value because we
1458  * can just program 3DSTATE_VF_SGVS_2 to load it implicitly.  We can also do
1459  * that for gl_BaseVertex but it does the wrong thing for indexed draws.
1460  */
1461 #define GEN11_3DPRIM_XP0                0x2690
1462 #define GEN11_3DPRIM_XP1                0x2694
1463 #define GEN11_3DPRIM_XP2                0x2698
1464 #define GEN11_3DPRIM_XP_BASE_VERTEX     GEN11_3DPRIM_XP0
1465 #define GEN11_3DPRIM_XP_BASE_INSTANCE   GEN11_3DPRIM_XP1
1466 #define GEN11_3DPRIM_XP_DRAW_ID         GEN11_3DPRIM_XP2
1467 
1468 void genX(CmdDrawIndirectByteCountEXT)(
1469     VkCommandBuffer                             commandBuffer,
1470     uint32_t                                    instanceCount,
1471     uint32_t                                    firstInstance,
1472     VkBuffer                                    counterBuffer,
1473     VkDeviceSize                                counterBufferOffset,
1474     uint32_t                                    counterOffset,
1475     uint32_t                                    vertexStride)
1476 {
1477    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1478    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
1479    struct anv_graphics_pipeline *pipeline =
1480       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1481 
1482    /* firstVertex is always zero for this draw function */
1483    const uint32_t firstVertex = 0;
1484 
1485    if (anv_batch_has_error(&cmd_buffer->batch))
1486       return;
1487 
1488    anv_measure_snapshot(cmd_buffer,
1489                         INTEL_SNAPSHOT_DRAW,
1490                         "draw indirect byte count",
1491                         instanceCount * pipeline->instance_multiplier);
1492    trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
1493 
1494    /* Select the pipeline here so that emit_base_vertex_instance() &
1495     * emit_draw_index() can be emitted without needing a flush first, as
1496     * cmd_buffer_emit_vertex_constants_and_flush() otherwise would.
1497     */
1498    genX(flush_pipeline_select_3d)(cmd_buffer);
1499 
1500 #if GFX_VER < 11
1501    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1502    if (vs_prog_data->uses_firstvertex ||
1503        vs_prog_data->uses_baseinstance)
1504       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1505    if (vs_prog_data->uses_drawid)
1506       emit_draw_index(cmd_buffer, 0);
1507 #endif
1508 
1509    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1510 
1511    if (cmd_buffer->state.conditional_render_enabled)
1512       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1513 
1514    struct mi_builder b;
1515    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1516    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
1517    mi_builder_set_mocs(&b, mocs);
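   /* Per VK_EXT_transform_feedback, the vertex count is recovered from the
    * byte counter a previous transform feedback pass wrote:
    *
    *    vertexCount = (counterBuffer[counterBufferOffset] - counterOffset)
    *                  / vertexStride
    */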
1518    struct mi_value count =
1519       mi_mem32(anv_address_add(counter_buffer->address,
1520                                    counterBufferOffset));
1521    if (counterOffset)
1522       count = mi_isub(&b, count, mi_imm(counterOffset));
1523    count = mi_udiv32_imm(&b, count, vertexStride);
1524    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
1525 
1526    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
1527    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
1528             mi_imm(instanceCount * pipeline->instance_multiplier));
1529    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
1530    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1531 
1532 #if GFX_VER >= 11
1533    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1534                 mi_imm(firstVertex));
1535    /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1536    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
1537 #endif
1538 
1539    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1540    anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1541 #if GFX_VERx10 >= 125
1542       prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1543 #endif
1544       prim.IndirectParameterEnable  = true;
1545       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1546       prim.VertexAccessType         = SEQUENTIAL;
1547 #if GFX_VER >= 11
1548       prim.ExtendedParametersPresent = true;
1549 #endif
1550    }
1551 
1552    genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1553                                          cmd_buffer->device,
1554                                          cmd_buffer->state.gfx.primitive_topology,
1555                                          1);
1556    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1557 
1558    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1559 
1560    trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
1561       instanceCount * pipeline->instance_multiplier);
1562 }
1563 
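/* Load the 3DPRIMITIVE argument registers from an indirect command
 * structure in memory.  The hardcoded offsets below follow the Vulkan
 * command layouts:
 *
 *    VkDrawIndirectCommand         VkDrawIndexedIndirectCommand
 *     0: vertexCount                0: indexCount
 *     4: instanceCount              4: instanceCount
 *     8: firstVertex                8: firstIndex
 *    12: firstInstance             12: vertexOffset
 *                                  16: firstInstance
 */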
1564 static void
1565 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
1566                          struct anv_address addr,
1567                          bool indexed,
1568                          uint32_t draw_id)
1569 {
1570    struct anv_graphics_pipeline *pipeline =
1571       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1572 
1573    struct mi_builder b;
1574    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1575    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
1576    mi_builder_set_mocs(&b, mocs);
1577 
1578    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
1579                 mi_mem32(anv_address_add(addr, 0)));
1580 
1581    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
1582    if (pipeline->instance_multiplier > 1) {
1583       instance_count = mi_imul_imm(&b, instance_count,
1584                                    pipeline->instance_multiplier);
1585    }
1586    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
1587 
1588    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
1589                 mi_mem32(anv_address_add(addr, 8)));
1590 
1591    if (indexed) {
1592       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
1593                    mi_mem32(anv_address_add(addr, 12)));
1594       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1595                    mi_mem32(anv_address_add(addr, 16)));
1596 #if GFX_VER >= 11
1597       mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1598                    mi_mem32(anv_address_add(addr, 12)));
1599       /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1600 #endif
1601    } else {
1602       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1603                    mi_mem32(anv_address_add(addr, 12)));
1604       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1605 #if GFX_VER >= 11
1606       mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1607                    mi_mem32(anv_address_add(addr, 8)));
1608       /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1609 #endif
1610    }
1611 
1612 #if GFX_VER >= 11
1613    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
1614                 mi_imm(draw_id));
1615 #endif
1616 }
1617 
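/* Whether the command streamer can unroll this draw with
 * EXECUTE_INDIRECT_DRAW.  We refuse whenever the shaders consume the
 * draw-id, first-vertex or base-instance system values, or when multiview
 * requires an instance multiplier, since those are only handled on the
 * MI-based 3DPRIMITIVE paths.
 */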
1618 static inline bool
1619 execute_indirect_draw_supported(const struct anv_cmd_buffer *cmd_buffer)
1620 {
1621 #if GFX_VERx10 >= 125
1622    const struct intel_device_info *devinfo = cmd_buffer->device->info;
1623 
1624    if (!devinfo->has_indirect_unroll)
1625       return false;
1626 
1627    struct anv_graphics_pipeline *pipeline =
1628       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1629    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1630    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
1631    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
1632    const bool is_multiview = pipeline->instance_multiplier > 1;
1633 
1634    const bool uses_draw_id =
1635       (vs_prog_data && vs_prog_data->uses_drawid) ||
1636       (mesh_prog_data && mesh_prog_data->uses_drawid) ||
1637       (task_prog_data && task_prog_data->uses_drawid);
1638 
1639    const bool uses_firstvertex =
1640       (vs_prog_data && vs_prog_data->uses_firstvertex);
1641 
1642    const bool uses_baseinstance =
1643       (vs_prog_data && vs_prog_data->uses_baseinstance);
1644 
1645    return !is_multiview &&
1646           !uses_draw_id &&
1647           !uses_firstvertex &&
1648           !uses_baseinstance;
1649 #else
1650    return false;
1651 #endif
1652 }
1653 
1654 static void
1655 emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
1656                     struct anv_address indirect_data_addr,
1657                     uint32_t indirect_data_stride,
1658                     uint32_t draw_count,
1659                     bool indexed)
1660 {
1661 #if GFX_VER < 11
1662    struct anv_graphics_pipeline *pipeline =
1663       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1664    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1665 #endif
1666    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1667 
1668    if (cmd_buffer->state.conditional_render_enabled)
1669       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1670 
1671    uint32_t offset = 0;
1672    for (uint32_t i = 0; i < draw_count; i++) {
1673       struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1674 
1675 #if GFX_VER < 11
1676       /* TODO: We need to stomp base vertex to 0 somehow */
1677 
1678       /* With sequential draws, we're dealing with the VkDrawIndirectCommand
1679        * structure data. We want to load VkDrawIndirectCommand::firstVertex at
1680        * offset 8 in the structure.
1681        *
1682        * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
1683        * We want the VkDrawIndexedIndirectCommand::vertexOffset field at offset 12 in
1684        * the structure.
1685        */
1686       if (vs_prog_data->uses_firstvertex ||
1687           vs_prog_data->uses_baseinstance) {
1688          emit_base_vertex_instance_bo(cmd_buffer,
1689                                       anv_address_add(draw, indexed ? 12 : 8));
1690       }
1691       if (vs_prog_data->uses_drawid)
1692          emit_draw_index(cmd_buffer, i);
1693 #endif
1694 
1695       /* Emitting draw index or vertex index BOs may result in needing
1696        * additional VF cache flushes.
1697        */
1698       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1699 
1700       /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1701        * first one was handled by cmd_buffer_flush_gfx_state.
1702        */
1703       if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1704          genX(emit_hs)(cmd_buffer);
1705       genX(emit_ds)(cmd_buffer);
1706 
1707       load_indirect_parameters(cmd_buffer, draw, indexed, i);
1708 
1709       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1710       anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1711 #if GFX_VERx10 >= 125
1712          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1713 #endif
1714          prim.IndirectParameterEnable  = true;
1715          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1716          prim.VertexAccessType         = indexed ? RANDOM : SEQUENTIAL;
1717 #if GFX_VER >= 11
1718          prim.ExtendedParametersPresent = true;
1719 #endif
1720       }
1721 
1722       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1723                                             cmd_buffer->device,
1724                                             cmd_buffer->state.gfx.primitive_topology,
1725                                             1);
1726 
1727       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1728 
1729       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
1730 
1731       offset += indirect_data_stride;
1732    }
1733 }
1734 
1735 static inline uint32_t xi_argument_format_for_vk_cmd(enum vk_cmd_type cmd)
1736 {
1737 #if GFX_VERx10 >= 125
1738    switch (cmd) {
1739       case VK_CMD_DRAW_INDIRECT:
1740       case VK_CMD_DRAW_INDIRECT_COUNT:
1741          return XI_DRAW;
1742       case VK_CMD_DRAW_INDEXED_INDIRECT:
1743       case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1744          return XI_DRAWINDEXED;
1745       case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1746       case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1747          return XI_MESH_3D;
1748       default:
1749          unreachable("unhandled cmd type");
1750    }
1751 #else
1752    unreachable("unsupported GFX VER");
1753 #endif
1754 }
1755 
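/* Check whether the application stride is tightly packed against the API
 * command layout (16B VkDrawIndirectCommand, 20B VkDrawIndexedIndirectCommand,
 * 12B VkDrawMeshTasksIndirectCommandEXT).  On Gfx20+ this also flags
 * ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE when STATE_BYTE_STRIDE needs to be
 * re-emitted.  Returns true when the stride matches.
 */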
1756 static inline bool
1757 cmd_buffer_set_indirect_stride(struct anv_cmd_buffer *cmd_buffer,
1758                                uint32_t stride, enum vk_cmd_type cmd)
1759 {
1760    /* Should have been sanitized by the caller */
1761    assert(stride != 0);
1762 
1763    uint32_t data_stride = 0;
1764 
1765    switch (cmd) {
1766    case VK_CMD_DRAW_INDIRECT:
1767    case VK_CMD_DRAW_INDIRECT_COUNT:
1768       data_stride = sizeof(VkDrawIndirectCommand);
1769       break;
1770    case VK_CMD_DRAW_INDEXED_INDIRECT:
1771    case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1772       data_stride = sizeof(VkDrawIndexedIndirectCommand);
1773       break;
1774    case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1775    case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1776       data_stride = sizeof(VkDrawMeshTasksIndirectCommandEXT);
1777       break;
1778    default:
1779       unreachable("unhandled cmd type");
1780    }
1781 
1782    bool aligned = stride == data_stride;
1783 
1784 #if GFX_VER >= 20
1785    /* If the stride matches the default command stride, then
1786     * STATE_BYTE_STRIDE::ByteStrideEnable=false and we can just do nothing.
1787     *
1788     * Otherwise STATE_BYTE_STRIDE::ByteStrideEnable=true and any stride
1789     * change must be signaled.
1790     */
1791    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
1792    if (gfx_state->indirect_data_stride_aligned != aligned) {
1793       gfx_state->indirect_data_stride = stride;
1794       gfx_state->indirect_data_stride_aligned = aligned;
1795       gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1796    } else if (!gfx_state->indirect_data_stride_aligned &&
1797               gfx_state->indirect_data_stride != stride) {
1798       gfx_state->indirect_data_stride = stride;
1799       gfx_state->indirect_data_stride_aligned = aligned;
1800       gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1801    }
1802 #endif
1803 
1804    return aligned;
1805 }
1806 
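/* Emit EXECUTE_INDIRECT_DRAW for up to max_draw_count draws.  With a
 * tightly packed stride, a single instruction with MaxCount=max_draw_count
 * lets the hardware unroll every draw; otherwise we emit one instruction
 * per draw, which still avoids loading the argument registers with MI
 * commands.
 */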
1807 static void
1808 genX(cmd_buffer_emit_execute_indirect_draws)(struct anv_cmd_buffer *cmd_buffer,
1809                                              struct anv_address indirect_data_addr,
1810                                              uint32_t indirect_data_stride,
1811                                              struct anv_address count_addr,
1812                                              uint32_t max_draw_count,
1813                                              enum vk_cmd_type cmd)
1814 {
1815 #if GFX_VERx10 >= 125
1816    bool aligned_stride =
1817       cmd_buffer_set_indirect_stride(cmd_buffer, indirect_data_stride, cmd);
1818 
1819    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1820 
1821    if (cmd_buffer->state.conditional_render_enabled)
1822       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1823 
1824    uint32_t offset = 0;
1825    for (uint32_t i = 0; i < max_draw_count; i++) {
1826       struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1827       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1828       anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
1829          ind.ArgumentFormat             = xi_argument_format_for_vk_cmd(cmd);
1830          ind.TBIMREnabled               = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1831          ind.PredicateEnable            =
1832             cmd_buffer->state.conditional_render_enabled;
1833          ind.MaxCount                   = aligned_stride ? max_draw_count : 1;
1834          ind.ArgumentBufferStartAddress = draw;
1835          ind.CountBufferAddress         = count_addr;
1836          ind.CountBufferIndirectEnable  = !anv_address_is_null(count_addr);
1837          ind.MOCS                       =
1838             anv_mocs(cmd_buffer->device, draw.bo, 0);
1839 
1840       }
1841 
1842       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1843                                             cmd_buffer->device,
1844                                             cmd_buffer->state.gfx.primitive_topology,
1845                                             1);
1846       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1847 
1848       /* If all the indirect structures are aligned, then we can let the HW
1849        * do the unrolling and we only need one instruction. Otherwise we
1850        * need to emit one instruction per draw, but we're still avoiding
1851        * the register loads with MI commands.
1852        */
1853       if (aligned_stride || GFX_VER >= 20)
1854          break;
1855 
1856       offset += indirect_data_stride;
1857    }
1858 #endif // GFX_VERx10 >= 125
1859 }
1860 void genX(CmdDrawIndirect)(
1861     VkCommandBuffer                             commandBuffer,
1862     VkBuffer                                    _buffer,
1863     VkDeviceSize                                offset,
1864     uint32_t                                    drawCount,
1865     uint32_t                                    stride)
1866 {
1867    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1868    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1869 
1870    if (anv_batch_has_error(&cmd_buffer->batch))
1871       return;
1872 
1873    anv_measure_snapshot(cmd_buffer,
1874                         INTEL_SNAPSHOT_DRAW,
1875                         "draw indirect",
1876                         drawCount);
1877    trace_intel_begin_draw_indirect(&cmd_buffer->trace);
1878 
1879    struct anv_address indirect_data_addr =
1880       anv_address_add(buffer->address, offset);
1881 
1882    stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
1883 
1884    if (execute_indirect_draw_supported(cmd_buffer)) {
1885       genX(cmd_buffer_emit_execute_indirect_draws)(
1886          cmd_buffer,
1887          indirect_data_addr,
1888          stride,
1889          ANV_NULL_ADDRESS /* count_addr */,
1890          drawCount,
1891          VK_CMD_DRAW_INDIRECT);
1892    } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1893       genX(cmd_buffer_emit_indirect_generated_draws)(
1894          cmd_buffer,
1895          indirect_data_addr,
1896          stride,
1897          ANV_NULL_ADDRESS /* count_addr */,
1898          drawCount,
1899          false /* indexed */);
1900    } else {
1901       emit_indirect_draws(cmd_buffer,
1902                           indirect_data_addr,
1903                           stride, drawCount, false /* indexed */);
1904    }
1905 
1906    trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
1907 }
1908 
1909 void genX(CmdDrawIndexedIndirect)(
1910     VkCommandBuffer                             commandBuffer,
1911     VkBuffer                                    _buffer,
1912     VkDeviceSize                                offset,
1913     uint32_t                                    drawCount,
1914     uint32_t                                    stride)
1915 {
1916    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1917    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1918 
1919    if (anv_batch_has_error(&cmd_buffer->batch))
1920       return;
1921 
1922    anv_measure_snapshot(cmd_buffer,
1923                         INTEL_SNAPSHOT_DRAW,
1924                         "draw indexed indirect",
1925                         drawCount);
1926    trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
1927 
1928    struct anv_address indirect_data_addr =
1929       anv_address_add(buffer->address, offset);
1930 
1931    stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
1932 
1933    if (execute_indirect_draw_supported(cmd_buffer)) {
1934       genX(cmd_buffer_emit_execute_indirect_draws)(
1935          cmd_buffer,
1936          indirect_data_addr,
1937          stride,
1938          ANV_NULL_ADDRESS /* count_addr */,
1939          drawCount,
1940          VK_CMD_DRAW_INDEXED_INDIRECT);
1941    } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1942       genX(cmd_buffer_emit_indirect_generated_draws)(
1943          cmd_buffer,
1944          indirect_data_addr,
1945          stride,
1946          ANV_NULL_ADDRESS /* count_addr */,
1947          drawCount,
1948          true /* indexed */);
1949    } else {
1950       emit_indirect_draws(cmd_buffer,
1951                           indirect_data_addr,
1952                           stride, drawCount, true /* indexed */);
1953    }
1954 
1955    trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
1956 }
1957 
1958 #define MI_PREDICATE_SRC0    0x2400
1959 #define MI_PREDICATE_SRC1    0x2408
1960 #define MI_PREDICATE_RESULT  0x2418
1961 
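/* Two predication strategies are used for draw-count draws below: without
 * conditional rendering, the draw count is loaded into MI_PREDICATE_SRC0
 * and each draw compares its index (in MI_PREDICATE_SRC1) against it.
 * With conditional rendering enabled, the count lives in a GPR so it can
 * be combined with ANV_PREDICATE_RESULT_REG.
 */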
1962 static struct mi_value
1963 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1964                                  struct mi_builder *b,
1965                                  struct anv_address count_address)
1966 {
1967    struct mi_value ret = mi_imm(0);
1968 
1969    if (cmd_buffer->state.conditional_render_enabled) {
1970       ret = mi_new_gpr(b);
1971       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
1972    } else {
1973       /* Upload the current draw count from the draw parameters buffer to
1974        * MI_PREDICATE_SRC0.
1975        */
1976       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
1977       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
1978    }
1979 
1980    return ret;
1981 }
1982 
1983 static void
1984 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1985                           struct mi_builder *b,
1986                           uint32_t draw_index)
1987 {
1988    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
1989    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
1990 
1991    if (draw_index == 0) {
1992       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1993          mip.LoadOperation    = LOAD_LOADINV;
1994          mip.CombineOperation = COMBINE_SET;
1995          mip.CompareOperation = COMPARE_SRCS_EQUAL;
1996       }
1997    } else {
1998       /* While draw_index < draw_count the predicate's result will be
1999        *  (draw_index == draw_count) ^ TRUE = TRUE
2000        * When draw_index == draw_count the result is
2001        *  (TRUE) ^ TRUE = FALSE
2002        * After this all results will be:
2003        *  (FALSE) ^ FALSE = FALSE
2004        */
2005       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
2006          mip.LoadOperation    = LOAD_LOAD;
2007          mip.CombineOperation = COMBINE_XOR;
2008          mip.CompareOperation = COMPARE_SRCS_EQUAL;
2009       }
2010    }
2011 }
2012 
2013 static void
2014 emit_draw_count_predicate_with_conditional_render(
2015                           struct anv_cmd_buffer *cmd_buffer,
2016                           struct mi_builder *b,
2017                           uint32_t draw_index,
2018                           struct mi_value max)
2019 {
2020    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
2021    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
2022 
2023    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
2024 }
2025 
2026 static void
2027 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
2028                                struct mi_builder *b,
2029                                uint32_t draw_index,
2030                                struct mi_value max)
2031 {
2032    if (cmd_buffer->state.conditional_render_enabled) {
2033       emit_draw_count_predicate_with_conditional_render(
2034             cmd_buffer, b, draw_index, mi_value_ref(b, max));
2035    } else {
2036       emit_draw_count_predicate(cmd_buffer, b, draw_index);
2037    }
2038 }
2039 
2040 static void
2041 emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
2042                           struct anv_address indirect_data_addr,
2043                           uint64_t indirect_data_stride,
2044                           struct anv_address draw_count_addr,
2045                           uint32_t max_draw_count,
2046                           bool indexed)
2047 {
2048 #if GFX_VER < 11
2049    struct anv_graphics_pipeline *pipeline =
2050       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2051    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
2052 #endif
2053 
2054    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2055 
2056    struct mi_builder b;
2057    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2058    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
2059    mi_builder_set_mocs(&b, mocs);
2060    struct mi_value max =
2061       prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
2062 
2063    for (uint32_t i = 0; i < max_draw_count; i++) {
2064       struct anv_address draw =
2065          anv_address_add(indirect_data_addr, i * indirect_data_stride);
2066 
2067       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2068 
2069 #if GFX_VER < 11
2070       if (vs_prog_data->uses_firstvertex ||
2071           vs_prog_data->uses_baseinstance) {
2072          emit_base_vertex_instance_bo(cmd_buffer,
2073                                       anv_address_add(draw, indexed ? 12 : 8));
2074       }
2075       if (vs_prog_data->uses_drawid)
2076          emit_draw_index(cmd_buffer, i);
2077 
2078       /* Emitting draw index or vertex index BOs may result in needing
2079        * additional VF cache flushes.
2080        */
2081       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2082 #endif
2083 
2084       load_indirect_parameters(cmd_buffer, draw, indexed, i);
2085 
2086       /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
2087        * first one was handled by cmd_buffer_flush_gfx_state.
2088        */
2089       if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
2090          genX(emit_hs)(cmd_buffer);
2091       genX(emit_ds)(cmd_buffer);
2092 
2093       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
2094       anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
2095 #if GFX_VERx10 >= 125
2096          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
2097 #endif
2098          prim.IndirectParameterEnable  = true;
2099          prim.PredicateEnable          = true;
2100          prim.VertexAccessType         = indexed ? RANDOM : SEQUENTIAL;
2101 #if GFX_VER >= 11
2102          prim.ExtendedParametersPresent = true;
2103 #endif
2104       }
2105 
2106       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
2107                                             cmd_buffer->device,
2108                                             cmd_buffer->state.gfx.primitive_topology,
2109                                             1);
2110       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
2111 
2112       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
2113    }
2114 
2115    mi_value_unref(&b, max);
2116 }
2117 
2118 void genX(CmdDrawIndirectCount)(
2119     VkCommandBuffer                             commandBuffer,
2120     VkBuffer                                    _buffer,
2121     VkDeviceSize                                offset,
2122     VkBuffer                                    _countBuffer,
2123     VkDeviceSize                                countBufferOffset,
2124     uint32_t                                    maxDrawCount,
2125     uint32_t                                    stride)
2126 {
2127    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2128    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2129    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2130 
2131    if (anv_batch_has_error(&cmd_buffer->batch))
2132       return;
2133 
2134    anv_measure_snapshot(cmd_buffer,
2135                         INTEL_SNAPSHOT_DRAW,
2136                         "draw indirect count",
2137                         0);
2138    trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
2139 
2140    struct anv_address indirect_data_address =
2141       anv_address_add(buffer->address, offset);
2142    struct anv_address count_address =
2143       anv_address_add(count_buffer->address, countBufferOffset);
2144    stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
2145 
2146    if (execute_indirect_draw_supported(cmd_buffer)) {
2147       genX(cmd_buffer_emit_execute_indirect_draws)(
2148          cmd_buffer,
2149          indirect_data_address,
2150          stride,
2151          count_address,
2152          maxDrawCount,
2153          VK_CMD_DRAW_INDIRECT_COUNT);
2154    } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2155       genX(cmd_buffer_emit_indirect_generated_draws)(
2156          cmd_buffer,
2157          indirect_data_address,
2158          stride,
2159          count_address,
2160          maxDrawCount,
2161          false /* indexed */);
2162    } else {
2163       emit_indirect_count_draws(cmd_buffer,
2164                                 indirect_data_address,
2165                                 stride,
2166                                 count_address,
2167                                 maxDrawCount,
2168                                 false /* indexed */);
2169    }
2170 
2171    trace_intel_end_draw_indirect_count(&cmd_buffer->trace,
2172                                        anv_address_utrace(count_address));
2173 }
2174 
2175 void genX(CmdDrawIndexedIndirectCount)(
2176     VkCommandBuffer                             commandBuffer,
2177     VkBuffer                                    _buffer,
2178     VkDeviceSize                                offset,
2179     VkBuffer                                    _countBuffer,
2180     VkDeviceSize                                countBufferOffset,
2181     uint32_t                                    maxDrawCount,
2182     uint32_t                                    stride)
2183 {
2184    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2185    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2186    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2187 
2188    if (anv_batch_has_error(&cmd_buffer->batch))
2189       return;
2190 
2191    anv_measure_snapshot(cmd_buffer,
2192                         INTEL_SNAPSHOT_DRAW,
2193                         "draw indexed indirect count",
2194                         0);
2195    trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
2196 
2197    struct anv_address indirect_data_address =
2198       anv_address_add(buffer->address, offset);
2199    struct anv_address count_address =
2200       anv_address_add(count_buffer->address, countBufferOffset);
2201    stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
2202 
2203    if (execute_indirect_draw_supported(cmd_buffer)) {
2204       genX(cmd_buffer_emit_execute_indirect_draws)(
2205          cmd_buffer,
2206          indirect_data_address,
2207          stride,
2208          count_address,
2209          maxDrawCount,
2210          VK_CMD_DRAW_INDEXED_INDIRECT_COUNT);
2211    } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2212       genX(cmd_buffer_emit_indirect_generated_draws)(
2213          cmd_buffer,
2214          indirect_data_address,
2215          stride,
2216          count_address,
2217          maxDrawCount,
2218          true /* indexed */);
2219    } else {
2220       emit_indirect_count_draws(cmd_buffer,
2221                                 indirect_data_address,
2222                                 stride,
2223                                 count_address,
2224                                 maxDrawCount,
2225                                 true /* indexed */);
2226    }
2227 
2228    trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
2229                                                anv_address_utrace(count_address));
2230 
2231 }
2232 
2233 void genX(CmdBeginTransformFeedbackEXT)(
2234     VkCommandBuffer                             commandBuffer,
2235     uint32_t                                    firstCounterBuffer,
2236     uint32_t                                    counterBufferCount,
2237     const VkBuffer*                             pCounterBuffers,
2238     const VkDeviceSize*                         pCounterBufferOffsets)
2239 {
2240    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2241 
2242    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2243    assert(counterBufferCount <= MAX_XFB_BUFFERS);
2244    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2245 
2246    trace_intel_begin_xfb(&cmd_buffer->trace);
2247 
2248    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2249     *
2250     *    "Software must ensure that no HW stream output operations can be in
2251     *    process or otherwise pending at the point that the MI_LOAD/STORE
2252     *    commands are processed. This will likely require a pipeline flush."
2253     */
2254    anv_add_pending_pipe_bits(cmd_buffer,
2255                              ANV_PIPE_CS_STALL_BIT,
2256                              "begin transform feedback");
2257    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2258 
2259    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
2260       /* If we have a counter buffer, this is a resume so we need to load the
2261        * value into the streamout offset register.  Otherwise, this is a begin
2262        * and we need to reset it to zero.
2263        */
2264       if (pCounterBuffers &&
2265           idx >= firstCounterBuffer &&
2266           idx - firstCounterBuffer < counterBufferCount &&
2267           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
2268          uint32_t cb_idx = idx - firstCounterBuffer;
2269          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2270          uint64_t offset = pCounterBufferOffsets ?
2271                            pCounterBufferOffsets[cb_idx] : 0;
2272 
2273          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2274             lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2275             lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
2276                                                    offset);
2277          }
2278       } else {
2279          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2280             lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2281             lri.DataDWord        = 0;
2282          }
2283       }
2284    }
2285 
2286    cmd_buffer->state.xfb_enabled = true;
2287    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2288 }
2289 
2290 void genX(CmdEndTransformFeedbackEXT)(
2291     VkCommandBuffer                             commandBuffer,
2292     uint32_t                                    firstCounterBuffer,
2293     uint32_t                                    counterBufferCount,
2294     const VkBuffer*                             pCounterBuffers,
2295     const VkDeviceSize*                         pCounterBufferOffsets)
2296 {
2297    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2298 
2299    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2300    assert(counterBufferCount <= MAX_XFB_BUFFERS);
2301    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2302 
2303    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2304     *
2305     *    "Software must ensure that no HW stream output operations can be in
2306     *    process or otherwise pending at the point that the MI_LOAD/STORE
2307     *    commands are processed. This will likely require a pipeline flush."
2308     */
2309    anv_add_pending_pipe_bits(cmd_buffer,
2310                              ANV_PIPE_CS_STALL_BIT,
2311                              "end transform feedback");
2312    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2313 
2314    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
2315       unsigned idx = firstCounterBuffer + cb_idx;
2316 
2317       /* If we have a counter buffer, this is a pause so we need to store
2318        * the current value of the streamout offset register into it, so a
2319        * later resume can reload it.  Otherwise, there is nothing to save.
2320        */
2321       if (pCounterBuffers &&
2322           cb_idx < counterBufferCount &&
2323           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
2324          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2325          uint64_t offset = pCounterBufferOffsets ?
2326                            pCounterBufferOffsets[cb_idx] : 0;
2327 
2328          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2329             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
2330                                                    offset);
2331             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2332          }
2333       }
2334    }
2335 
2336    trace_intel_end_xfb(&cmd_buffer->trace);
2337 
2338    cmd_buffer->state.xfb_enabled = false;
2339    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2340 }
2341 
2342 #if GFX_VERx10 >= 125
2343 
2344 void
2345 genX(CmdDrawMeshTasksEXT)(
2346       VkCommandBuffer commandBuffer,
2347       uint32_t x,
2348       uint32_t y,
2349       uint32_t z)
2350 {
2351    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2352 
2353    if (anv_batch_has_error(&cmd_buffer->batch))
2354       return;
2355 
2356    anv_measure_snapshot(cmd_buffer,
2357                         INTEL_SNAPSHOT_DRAW,
2358                         "draw mesh", x * y * z);
2359 
2360    trace_intel_begin_draw_mesh(&cmd_buffer->trace);
2361 
2362    /* TODO(mesh): Check if this is not emitting more packets than we need. */
2363    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2364 
2365    if (cmd_buffer->state.conditional_render_enabled)
2366       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2367 
2368    anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
2369       m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
2370       m.ThreadGroupCountX = x;
2371       m.ThreadGroupCountY = y;
2372       m.ThreadGroupCountZ = z;
2373    }
2374 
2375    trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
2376 }
2377 
2378 #define GFX125_3DMESH_TG_COUNT 0x26F0
2379 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
2380 
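/* 3DMESH_3D indirect arguments: the X thread-group count has a dedicated
 * register, while Y, Z and the draw id reuse the extended-parameter
 * registers XP1, XP2 and XP0 respectively, as loaded below.
 */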
2381 static void
2382 mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
2383                                         struct mi_builder *b,
2384                                         struct anv_address addr,
2385                                         bool emit_xp0,
2386                                         uint32_t xp0)
2387 {
2388    const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
2389    const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
2390    const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
2391 
2392    mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
2393                mi_mem32(anv_address_add(addr, groupCountXOff)));
2394 
2395    mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
2396                mi_mem32(anv_address_add(addr, groupCountYOff)));
2397 
2398    mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
2399                mi_mem32(anv_address_add(addr, groupCountZOff)));
2400 
2401    if (emit_xp0)
2402       mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
2403 }
2404 
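/* When the task/mesh shaders consume gl_DrawID, the 3DMESH_3D packet grows
 * by one DWORD to carry ExtendedParameter0.  That DWORD is zeroed here
 * since the actual draw id comes from the GFX10_3DPRIM_XP(0) register
 * loaded by mesh_load_indirect_parameters_3dmesh_3d().
 */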
2405 static void
2406 emit_indirect_3dmesh_3d(struct anv_batch *batch,
2407                         bool predicate_enable,
2408                         bool uses_drawid)
2409 {
2410    uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
2411    uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
2412                    .PredicateEnable           = predicate_enable,
2413                    .IndirectParameterEnable   = true,
2414                    .ExtendedParameter0Present = uses_drawid);
2415    if (uses_drawid)
2416       dw[len - 1] = 0;
2417 }
2418 
2419 void
2420 genX(CmdDrawMeshTasksIndirectEXT)(
2421     VkCommandBuffer                             commandBuffer,
2422     VkBuffer                                    _buffer,
2423     VkDeviceSize                                offset,
2424     uint32_t                                    drawCount,
2425     uint32_t                                    stride)
2426 {
2427    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2428    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2429    struct anv_graphics_pipeline *pipeline =
2430       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2431    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2432    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2433    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
2434 
2435    if (anv_batch_has_error(&cmd_buffer->batch))
2436       return;
2437 
2438    anv_measure_snapshot(cmd_buffer,
2439                         INTEL_SNAPSHOT_DRAW,
2440                         "draw mesh indirect", drawCount);
2441 
2442    trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
2443 
2444    if (execute_indirect_draw_supported(cmd_buffer)) {
2445       genX(cmd_buffer_emit_execute_indirect_draws)(
2446          cmd_buffer,
2447          anv_address_add(buffer->address, offset),
2448          MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2449          ANV_NULL_ADDRESS /* count_addr */,
2450          drawCount,
2451          VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT);
2452 
2453       trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2454       return;
2455    }
2456 
2457    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2458 
2459    if (cmd_state->conditional_render_enabled)
2460       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2461 
2462    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2463                        mesh_prog_data->uses_drawid;
2464    struct mi_builder b;
2465    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2466 
2467    for (uint32_t i = 0; i < drawCount; i++) {
2468       struct anv_address draw = anv_address_add(buffer->address, offset);
2469 
2470       mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2471 
2472       emit_indirect_3dmesh_3d(&cmd_buffer->batch,
2473             cmd_state->conditional_render_enabled, uses_drawid);
2474 
2475       offset += stride;
2476    }
2477 
2478    trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2479 }
2480 
2481 void
2482 genX(CmdDrawMeshTasksIndirectCountEXT)(
2483     VkCommandBuffer                             commandBuffer,
2484     VkBuffer                                    _buffer,
2485     VkDeviceSize                                offset,
2486     VkBuffer                                    _countBuffer,
2487     VkDeviceSize                                countBufferOffset,
2488     uint32_t                                    maxDrawCount,
2489     uint32_t                                    stride)
2490 {
2491    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2492    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2493    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2494    struct anv_graphics_pipeline *pipeline =
2495       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2496    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2497    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2498 
2499    if (anv_batch_has_error(&cmd_buffer->batch))
2500       return;
2501 
2502    anv_measure_snapshot(cmd_buffer,
2503                         INTEL_SNAPSHOT_DRAW,
2504                         "draw mesh indirect count", 0);
2505 
2506    trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
2507 
2508    struct anv_address count_addr =
2509       anv_address_add(count_buffer->address, countBufferOffset);
2510 
2511 
2512    if (execute_indirect_draw_supported(cmd_buffer)) {
2513       genX(cmd_buffer_emit_execute_indirect_draws)(
2514          cmd_buffer,
2515          anv_address_add(buffer->address, offset),
2516          MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2517          count_addr /* count_addr */,
2518          maxDrawCount,
2519          VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT);
2520 
2521       trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, anv_address_utrace(count_addr));
2522       return;
2523    }
2524 
2525    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2526 
2527    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2528                        mesh_prog_data->uses_drawid;
2529 
2530    struct mi_builder b;
2531    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2532    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
2533    mi_builder_set_mocs(&b, mocs);
2534 
2535    struct mi_value max =
2536          prepare_for_draw_count_predicate(
2537             cmd_buffer, &b, count_addr);
2538 
2539    for (uint32_t i = 0; i < maxDrawCount; i++) {
2540       struct anv_address draw = anv_address_add(buffer->address, offset);
2541 
2542       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2543 
2544       mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2545 
2546       emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
2547 
2548       offset += stride;
2549    }
2550 
2551    trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace,
2552                                             anv_address_utrace(count_addr));
2553 }
2554 
2555 #endif /* GFX_VERx10 >= 125 */
2556