xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan_hasvk/genX_cmd_buffer.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_format.h"
30 #include "vk_render_pass.h"
31 #include "vk_util.h"
32 #include "util/fast_idiv_by_const.h"
33 
34 #include "common/intel_l3_config.h"
35 #include "genxml/gen_macros.h"
36 #include "genxml/genX_pack.h"
37 #include "common/intel_guardband.h"
38 #include "compiler/elk/elk_prim.h"
39 
40 #include "nir/nir_xfb_info.h"
41 
42 #include "ds/intel_tracepoints.h"
43 
44 /* We reserve:
45  *    - GPR 14 for secondary command buffer returns
46  *    - GPR 15 for conditional rendering
47  */
48 #define MI_BUILDER_NUM_ALLOC_GPRS 14
49 #define __gen_get_batch_dwords anv_batch_emit_dwords
50 #define __gen_address_offset anv_address_add
51 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
52 #include "common/mi_builder.h"
53 
54 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
55                                         uint32_t pipeline);
56 
57 static enum anv_pipe_bits
58 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
59    enum anv_pipe_bits bits = 0;
60    bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
61    bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
62    bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
63    bits |= (pc->VFCacheInvalidationEnable) ?  ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
64    bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
65    bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
66    bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
67    bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
68    bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
69    bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
70    bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
71    return bits;
72 }
73 
74 #define anv_debug_dump_pc(pc) \
75    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
76       fputs("pc: emit PC=( ", stderr); \
77       anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
78       fprintf(stderr, ") reason: %s\n", __func__); \
79    }
80 
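/* Returns true if this command buffer was allocated from a queue family that
 * advertises VK_QUEUE_GRAPHICS_BIT, i.e. work recorded here will run on the
 * render engine.
 */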
81 static bool
82 is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
83 {
84    struct anv_queue_family *queue_family = cmd_buffer->queue_family;
85    return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
86 }
87 
88 void
89 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
90 {
91    struct anv_device *device = cmd_buffer->device;
92    uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
93 
94    /* If we are emitting a new state base address we probably need to re-emit
95     * binding tables.
96     */
97    cmd_buffer->state.descriptors_dirty |= ~0;
98 
99    /* Emit a render target cache flush.
100     *
101     * This isn't documented anywhere in the PRM.  However, it seems to be
102     * necessary prior to changing the surface state base address.  Without
103     * this, we get GPU hangs when using multi-level command buffers which
104     * clear depth, reset state base address, and then go render stuff.
105     */
106    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
107       pc.DCFlushEnable = true;
108       pc.RenderTargetCacheFlushEnable = true;
109       pc.CommandStreamerStallEnable = true;
110       anv_debug_dump_pc(pc);
111    }
112 
113    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
114       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
115       sba.GeneralStateMOCS = mocs;
116       sba.GeneralStateBaseAddressModifyEnable = true;
117 
118       sba.StatelessDataPortAccessMOCS = mocs;
119 
120       sba.SurfaceStateBaseAddress =
121          anv_cmd_buffer_surface_base_address(cmd_buffer);
122       sba.SurfaceStateMOCS = mocs;
123       sba.SurfaceStateBaseAddressModifyEnable = true;
124 
125       sba.DynamicStateBaseAddress =
126          (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
127       sba.DynamicStateMOCS = mocs;
128       sba.DynamicStateBaseAddressModifyEnable = true;
129 
130       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
131       sba.IndirectObjectMOCS = mocs;
132       sba.IndirectObjectBaseAddressModifyEnable = true;
133 
134       sba.InstructionBaseAddress =
135          (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
136       sba.InstructionMOCS = mocs;
137       sba.InstructionBaseAddressModifyEnable = true;
138 
139 #  if (GFX_VER >= 8)
140       /* Broadwell requires that we specify a buffer size for a bunch of
141        * these fields.  However, since we will be growing the BOs live, we
142        * just set them all to the maximum.
143        */
144       sba.GeneralStateBufferSize       = 0xfffff;
145       sba.IndirectObjectBufferSize     = 0xfffff;
146       if (anv_use_relocations(device->physical)) {
147          sba.DynamicStateBufferSize    = 0xfffff;
148          sba.InstructionBufferSize     = 0xfffff;
149       } else {
150          /* With softpin, we use fixed addresses so we actually know how big
151           * our base addresses are.
152           */
153          sba.DynamicStateBufferSize    = DYNAMIC_STATE_POOL_SIZE / 4096;
154          sba.InstructionBufferSize     = INSTRUCTION_STATE_POOL_SIZE / 4096;
155       }
156       sba.GeneralStateBufferSizeModifyEnable    = true;
157       sba.IndirectObjectBufferSizeModifyEnable  = true;
158       sba.DynamicStateBufferSizeModifyEnable    = true;
159       sba.InstructionBuffersizeModifyEnable     = true;
160 #  else
161       /* On gfx7, we have upper bounds instead.  According to the docs,
162        * setting an upper bound of zero means that no bounds checking is
163        * performed so, in theory, we should be able to leave them zero.
164        * However, border color is broken and the GPU bounds-checks anyway.
165        * To avoid this and other potential problems, we may as well set it
166        * for everything.
167        */
168       sba.GeneralStateAccessUpperBound =
169          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
170       sba.GeneralStateAccessUpperBoundModifyEnable = true;
171       sba.DynamicStateAccessUpperBound =
172          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
173       sba.DynamicStateAccessUpperBoundModifyEnable = true;
174       sba.InstructionAccessUpperBound =
175          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
176       sba.InstructionAccessUpperBoundModifyEnable = true;
177 #  endif
178    }
179 
180    /* After re-setting the surface state base address, we have to do some
181     * cache flushing so that the sampler engine will pick up the new
182     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
183     * Shared Function > 3D Sampler > State > State Caching (page 96):
184     *
185     *    Coherency with system memory in the state cache, like the texture
186     *    cache is handled partially by software. It is expected that the
187     *    command stream or shader will issue Cache Flush operation or
188     *    Cache_Flush sampler message to ensure that the L1 cache remains
189     *    coherent with system memory.
190     *
191     *    [...]
192     *
193     *    Whenever the value of the Dynamic_State_Base_Addr,
194     *    Surface_State_Base_Addr are altered, the L1 state cache must be
195     *    invalidated to ensure the new surface or sampler state is fetched
196     *    from system memory.
197     *
198     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
199     * which, according the PIPE_CONTROL instruction documentation in the
200     * Broadwell PRM:
201     *
202     *    Setting this bit is independent of any other bit in this packet.
203     *    This bit controls the invalidation of the L1 and L2 state caches
204     *    at the top of the pipe i.e. at the parsing time.
205     *
206     * Unfortunately, experimentation seems to indicate that state cache
207     * invalidation through a PIPE_CONTROL does nothing whatsoever in
208     * regard to surface state and binding tables.  Instead, it seems that
209     * invalidating the texture cache is what is actually needed.
210     *
211     * XXX:  As far as we have been able to determine through
212     * experimentation, flushing the texture cache appears to be
213     * sufficient.  The theory here is that all of the sampling/rendering
214     * units cache the binding table in the texture cache.  However, we have
215     * yet to be able to actually confirm this.
216     *
217     * Wa_14013910100:
218     *
219     *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
220     *   or program pipe control with Instruction cache invalidate post
221     *   STATE_BASE_ADDRESS command"
222     */
223    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
224       pc.TextureCacheInvalidationEnable = true;
225       pc.ConstantCacheInvalidationEnable = true;
226       pc.StateCacheInvalidationEnable = true;
227       anv_debug_dump_pc(pc);
228    }
229 }
230 
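/* Record the BO referenced by a surface state.  With relocations, the address
 * baked into the surface state is patched at submit time; with softpin we only
 * need to track the BO so it ends up in the execbuf object list.
 */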
231 static void
232 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
233                   struct anv_state state, struct anv_address addr)
234 {
235    VkResult result;
236 
237    if (anv_use_relocations(cmd_buffer->device->physical)) {
238       const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
239       result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
240                                   &cmd_buffer->vk.pool->alloc,
241                                   state.offset + isl_dev->ss.addr_offset,
242                                   addr.bo, addr.offset, NULL);
243    } else {
244       result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
245                                      &cmd_buffer->vk.pool->alloc,
246                                      addr.bo);
247    }
248 
249    if (unlikely(result != VK_SUCCESS))
250       anv_batch_set_error(&cmd_buffer->batch, result);
251 }
252 
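/* Like add_surface_reloc(), but also records the auxiliary surface and clear
 * color addresses embedded in the surface state, when present.
 */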
253 static void
254 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
255                          struct anv_surface_state state)
256 {
257    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
258 
259    assert(!anv_address_is_null(state.address));
260    add_surface_reloc(cmd_buffer, state.state, state.address);
261 
262    if (!anv_address_is_null(state.aux_address)) {
263       VkResult result =
264          anv_reloc_list_add(&cmd_buffer->surface_relocs,
265                             &cmd_buffer->vk.pool->alloc,
266                             state.state.offset + isl_dev->ss.aux_addr_offset,
267                             state.aux_address.bo,
268                             state.aux_address.offset,
269                             NULL);
270       if (result != VK_SUCCESS)
271          anv_batch_set_error(&cmd_buffer->batch, result);
272    }
273 
274    if (!anv_address_is_null(state.clear_address)) {
275       VkResult result =
276          anv_reloc_list_add(&cmd_buffer->surface_relocs,
277                             &cmd_buffer->vk.pool->alloc,
278                             state.state.offset +
279                             isl_dev->ss.clear_color_state_offset,
280                             state.clear_address.bo,
281                             state.clear_address.offset,
282                             NULL);
283       if (result != VK_SUCCESS)
284          anv_batch_set_error(&cmd_buffer->batch, result);
285    }
286 }
287 
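/* Returns true if packing the clear color through the view's format/swizzle
 * would produce different bits than packing it through the surface's own
 * format, i.e. a resolve would need a real format conversion.
 */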
288 static bool
289 isl_color_value_requires_conversion(union isl_color_value color,
290                                     const struct isl_surf *surf,
291                                     const struct isl_view *view)
292 {
293    if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
294       return false;
295 
296    uint32_t surf_pack[4] = { 0, 0, 0, 0 };
297    isl_color_value_pack(&color, surf->format, surf_pack);
298 
299    uint32_t view_pack[4] = { 0, 0, 0, 0 };
300    union isl_color_value swiz_color =
301       isl_color_value_swizzle_inv(color, view->swizzle);
302    isl_color_value_pack(&swiz_color, view->format, view_pack);
303 
304    return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
305 }
306 
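/* Returns true if a LOAD_OP_CLEAR of this color attachment can be implemented
 * as a fast clear: the layout must allow it, the render area must cover the
 * whole first slice, and the clear color must survive any format
 * reinterpretation done at resolve time.
 */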
307 static bool
308 anv_can_fast_clear_color_view(struct anv_device * device,
309                               struct anv_image_view *iview,
310                               VkImageLayout layout,
311                               union isl_color_value clear_color,
312                               uint32_t num_layers,
313                               VkRect2D render_area)
314 {
315    if (iview->planes[0].isl.base_array_layer >=
316        anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
317                             iview->planes[0].isl.base_level))
318       return false;
319 
320    /* Start by getting the fast clear type.  We use the first subpass
321     * layout here because we don't want to fast-clear if the first subpass
322     * to use the attachment can't handle fast-clears.
323     */
324    enum anv_fast_clear_type fast_clear_type =
325       anv_layout_to_fast_clear_type(device->info, iview->image,
326                                     VK_IMAGE_ASPECT_COLOR_BIT,
327                                     layout);
328    switch (fast_clear_type) {
329    case ANV_FAST_CLEAR_NONE:
330       return false;
331    case ANV_FAST_CLEAR_DEFAULT_VALUE:
332       if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
333          return false;
334       break;
335    case ANV_FAST_CLEAR_ANY:
336       break;
337    }
338 
339    /* Potentially, we could do partial fast-clears but doing so has crazy
340     * alignment restrictions.  It's easier to just restrict to full size
341     * fast clears for now.
342     */
343    if (render_area.offset.x != 0 ||
344        render_area.offset.y != 0 ||
345        render_area.extent.width != iview->vk.extent.width ||
346        render_area.extent.height != iview->vk.extent.height)
347       return false;
348 
349    /* On Broadwell and earlier, we can only handle 0/1 clear colors */
350    if (!isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
351       return false;
352 
353    /* If the clear color is one that would require non-trivial format
354     * conversion on resolve, we don't bother with the fast clear.  This
355     * shouldn't be common as most clear colors are 0/1 and the most common
356     * format re-interpretation is for sRGB.
357     */
358    if (isl_color_value_requires_conversion(clear_color,
359                                            &iview->image->planes[0].primary_surface.isl,
360                                            &iview->planes[0].isl)) {
361       anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
362                     "Cannot fast-clear to colors which would require "
363                     "format conversion on resolve");
364       return false;
365    }
366 
367    /* We only allow fast clears to the first slice of an image (level 0,
368     * layer 0) and only for the entire slice.  This guarantees that, at any
369     * given time, there is only one clear color on any given image.  At the
370     * time of our testing (Jan 17, 2018), there
371     * were no known applications which would benefit from fast-clearing
372     * more than just the first slice.
373     */
374    if (iview->planes[0].isl.base_level > 0 ||
375        iview->planes[0].isl.base_array_layer > 0) {
376       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
377                     "Rendering with multi-lod or multi-layer framebuffer "
378                     "with LOAD_OP_LOAD and baseMipLevel > 0 or "
379                     "baseArrayLayer > 0.  Not fast clearing.");
380       return false;
381    }
382 
383    if (num_layers > 1) {
384       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
385                     "Rendering to a multi-layer framebuffer with "
386                     "LOAD_OP_CLEAR.  Only fast-clearing the first slice");
387    }
388 
389    return true;
390 }
391 
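/* Returns true if a depth/stencil clear can go through the HiZ fast-clear
 * path instead of a regular depth clear for the given layout and render area.
 */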
392 static bool
393 anv_can_hiz_clear_ds_view(struct anv_device *device,
394                           const struct anv_image_view *iview,
395                           VkImageLayout layout,
396                           VkImageAspectFlags clear_aspects,
397                           float depth_clear_value,
398                           VkRect2D render_area)
399 {
400    /* We don't do any HiZ or depth fast-clears on gfx7 yet */
401    if (GFX_VER == 7)
402       return false;
403 
404    /* If we're just clearing stencil, we can always HiZ clear */
405    if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
406       return true;
407 
408    /* We must have depth in order to have HiZ */
409    if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
410       return false;
411 
412    const enum isl_aux_usage clear_aux_usage =
413       anv_layout_to_aux_usage(device->info, iview->image,
414                               VK_IMAGE_ASPECT_DEPTH_BIT,
415                               VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
416                               layout);
417    if (!isl_aux_usage_has_fast_clears(clear_aux_usage))
418       return false;
419 
420    assert(GFX_VER == 8);
421    assert(iview->vk.format != VK_FORMAT_D16_UNORM_S8_UINT);
422    if (iview->vk.format == VK_FORMAT_D16_UNORM) {
423       /* From the BDW PRM, Vol 7, "Depth Buffer Clear":
424        *
425        *   The following restrictions apply only if the depth buffer surface
426        *   type is D16_UNORM and software does not use the “full surf clear”:
427        *
428        *   If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
429        *   aligned to an 8x4 pixel block relative to the upper left corner of
430        *   the depth buffer, and contain an integer number of these pixel
431        *   blocks, and all 8x4 pixels must be lit.
432        *
433        * Simply disable partial clears for D16 on BDW.
434        */
435       if (render_area.offset.x > 0 ||
436           render_area.offset.y > 0 ||
437           render_area.extent.width !=
438           u_minify(iview->vk.extent.width, iview->vk.base_mip_level) ||
439           render_area.extent.height !=
440           u_minify(iview->vk.extent.height, iview->vk.base_mip_level)) {
441          return false;
442       }
443    }
444 
445    if (depth_clear_value != ANV_HZ_FC_VAL)
446       return false;
447 
448    /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
449     * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
450     * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
451     */
452    if (GFX_VER == 8 && anv_can_sample_with_hiz(device->info, iview->image))
453       return false;
454 
455    /* If we got here, then we can fast clear */
456    return true;
457 }
458 
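/* Force a single load of x through a volatile pointer so the compiler cannot
 * cache the value in a register across reads.
 */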
459 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
460 
461 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
462  * the initial layout is undefined, the HiZ buffer and depth buffer will
463  * represent the same data at the end of this operation.
464  */
465 static void
466 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
467                         const struct anv_image *image,
468                         uint32_t base_layer, uint32_t layer_count,
469                         VkImageLayout initial_layout,
470                         VkImageLayout final_layout,
471                         bool will_full_fast_clear)
472 {
473    const uint32_t depth_plane =
474       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
475    if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
476       return;
477 
478    /* If will_full_fast_clear is set, the caller promises to fast-clear the
479     * largest portion of the specified range as it can.  For depth images,
480     * that means the entire image because we don't support multi-LOD HiZ.
481     */
482    assert(image->planes[0].primary_surface.isl.levels == 1);
483    if (will_full_fast_clear)
484       return;
485 
486    const enum isl_aux_state initial_state =
487       anv_layout_to_aux_state(cmd_buffer->device->info, image,
488                               VK_IMAGE_ASPECT_DEPTH_BIT,
489                               initial_layout);
490    const enum isl_aux_state final_state =
491       anv_layout_to_aux_state(cmd_buffer->device->info, image,
492                               VK_IMAGE_ASPECT_DEPTH_BIT,
493                               final_layout);
494 
495    const bool initial_depth_valid =
496       isl_aux_state_has_valid_primary(initial_state);
497    const bool initial_hiz_valid =
498       isl_aux_state_has_valid_aux(initial_state);
499    const bool final_needs_depth =
500       isl_aux_state_has_valid_primary(final_state);
501    const bool final_needs_hiz =
502       isl_aux_state_has_valid_aux(final_state);
503 
504    /* Getting into the pass-through state for Depth is tricky and involves
505     * both a resolve and an ambiguate.  We don't handle that state right now
506     * as anv_layout_to_aux_state never returns it.
507     */
508    assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
509 
510    if (final_needs_depth && !initial_depth_valid) {
511       assert(initial_hiz_valid);
512       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
513                        0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
514    } else if (final_needs_hiz && !initial_hiz_valid) {
515       assert(initial_depth_valid);
516       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
517                        0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
518    }
519 }
520 
521 #if GFX_VER == 7
522 static inline bool
523 vk_image_layout_stencil_write_optimal(VkImageLayout layout)
524 {
525    return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
526           layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
527           layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL ||
528           layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL;
529 }
530 #endif
531 
532 /* Transitions a stencil buffer from one layout to another.  On gfx7, this
533  * keeps the texturable shadow copy of the stencil buffer up to date when
534  * leaving a layout in which stencil writes are allowed.
535  */
536 static void
537 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
538                           const struct anv_image *image,
539                           uint32_t base_level, uint32_t level_count,
540                           uint32_t base_layer, uint32_t layer_count,
541                           VkImageLayout initial_layout,
542                           VkImageLayout final_layout,
543                           bool will_full_fast_clear)
544 {
545 #if GFX_VER == 7
546    const uint32_t plane =
547       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
548 
549    /* On gfx7, we have to store a texturable version of the stencil buffer in
550     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
551     * forth at strategic points. Stencil writes are only allowed in the following
552     * layouts:
553     *
554     *  - VK_IMAGE_LAYOUT_GENERAL
555     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
556     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
557     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
558     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
559     *  - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
560     *
561     * For general, we have no nice opportunity to transition so we do the copy
562     * to the shadow unconditionally at the end of the subpass. For transfer
563     * destinations, we can update it as part of the transfer op. For the other
564     * layouts, we delay the copy until a transition into some other layout.
565     */
566    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
567        vk_image_layout_stencil_write_optimal(initial_layout) &&
568        !vk_image_layout_stencil_write_optimal(final_layout)) {
569       anv_image_copy_to_shadow(cmd_buffer, image,
570                                VK_IMAGE_ASPECT_STENCIL_BIT,
571                                base_level, level_count,
572                                base_layer, layer_count);
573    }
574 #endif
575 }
576 
577 #define MI_PREDICATE_SRC0    0x2400
578 #define MI_PREDICATE_SRC1    0x2408
579 #define MI_PREDICATE_RESULT  0x2418
580 
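/* Write the given fast-clear type into the image's fast clear state buffer
 * with MI_STORE_DATA_IMM so that later predicated resolves can read it back
 * from the command streamer.
 */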
581 static void
582 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
583                            const struct anv_image *image,
584                            VkImageAspectFlagBits aspect,
585                            enum anv_fast_clear_type fast_clear)
586 {
587    anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
588       sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
589                                                        image, aspect);
590       sdi.ImmediateData = fast_clear;
591    }
592 }
593 
594 /* This is only really practical on haswell and above because it requires
595  * MI math in order to get it correct.
596  */
597 #if GFX_VERx10 >= 75
598 static void
599 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
600                                   const struct anv_image *image,
601                                   VkImageAspectFlagBits aspect,
602                                   uint32_t level, uint32_t array_layer,
603                                   enum isl_aux_op resolve_op,
604                                   enum anv_fast_clear_type fast_clear_supported)
605 {
606    struct mi_builder b;
607    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
608 
609    const struct mi_value fast_clear_type =
610       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
611                                                   image, aspect));
612 
613    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
614    if (level == 0 && array_layer == 0) {
615       /* In this case, we are doing a partial resolve to get rid of fast-clear
616        * colors.  We don't care about the compression state but we do care
617        * about how much fast clear is allowed by the final layout.
618        */
619       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
620       assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
621 
622       /* We need to compute (fast_clear_supported < image->fast_clear) */
623       struct mi_value pred =
624          mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
625       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
626 
627       /* If the predicate is true, we want to write 0 to the fast clear type
628        * and, if it's false, leave it alone.  We can do this by writing
629        *
630        * clear_type = clear_type & ~predicate;
631        */
632       struct mi_value new_fast_clear_type =
633          mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
634       mi_store(&b, fast_clear_type, new_fast_clear_type);
635    } else {
636       /* In this case, we're trying to do a partial resolve on a slice that
637        * doesn't have clear color.  There's nothing to do.
638        */
639       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
640       return;
641    }
642 
643    /* Set src1 to 0 and use a != condition */
644    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
645 
646    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
647       mip.LoadOperation    = LOAD_LOADINV;
648       mip.CombineOperation = COMBINE_SET;
649       mip.CompareOperation = COMPARE_SRCS_EQUAL;
650    }
651 }
652 #endif /* GFX_VERx10 >= 75 */
653 
654 static void
655 anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
656                                  const struct anv_image *image,
657                                  VkImageAspectFlagBits aspect,
658                                  uint32_t level, uint32_t array_layer,
659                                  enum isl_aux_op resolve_op,
660                                  enum anv_fast_clear_type fast_clear_supported)
661 {
662    struct mi_builder b;
663    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
664 
665    struct mi_value fast_clear_type_mem =
666       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
667                                                       image, aspect));
668 
669    /* This only works for partial resolves and only when the clear color is
670     * all or nothing.  On the upside, this emits less command streamer code
671     * and works on Ivybridge and Bay Trail.
672     */
673    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
674    assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
675 
676    /* We don't support fast clears on anything other than the first slice. */
677    if (level > 0 || array_layer > 0)
678       return;
679 
680    /* On gfx8, we don't have a concept of default clear colors because we
681     * can't sample from CCS surfaces.  It's enough to just load the fast clear
682     * state into the predicate register.
683     */
684    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
685    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
686    mi_store(&b, fast_clear_type_mem, mi_imm(0));
687 
688    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
689       mip.LoadOperation    = LOAD_LOADINV;
690       mip.CombineOperation = COMBINE_SET;
691       mip.CompareOperation = COMPARE_SRCS_EQUAL;
692    }
693 }
694 
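/* Emit a CCS resolve of a single slice, predicated so that the BLORP
 * operation is skipped when the slice has no fast-clear color to resolve.
 */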
695 static void
696 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
697                                const struct anv_image *image,
698                                enum isl_format format,
699                                struct isl_swizzle swizzle,
700                                VkImageAspectFlagBits aspect,
701                                uint32_t level, uint32_t array_layer,
702                                enum isl_aux_op resolve_op,
703                                enum anv_fast_clear_type fast_clear_supported)
704 {
705    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
706 
707    anv_cmd_simple_resolve_predicate(cmd_buffer, image,
708                                     aspect, level, array_layer,
709                                     resolve_op, fast_clear_supported);
710 
711    /* CCS_D only supports full resolves and BLORP will assert on us if we try
712     * to do a partial resolve on a CCS_D surface.
713     */
714    if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
715        image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
716       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
717 
718    anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
719                     level, array_layer, 1, resolve_op, NULL, true);
720 }
721 
722 static void
723 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
724                                const struct anv_image *image,
725                                enum isl_format format,
726                                struct isl_swizzle swizzle,
727                                VkImageAspectFlagBits aspect,
728                                uint32_t array_layer,
729                                enum isl_aux_op resolve_op,
730                                enum anv_fast_clear_type fast_clear_supported)
731 {
732    assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
733    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
734 
735 #if GFX_VERx10 >= 75
736    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
737                                      aspect, 0, array_layer,
738                                      resolve_op, fast_clear_supported);
739 
740    anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
741                     array_layer, 1, resolve_op, NULL, true);
742 #else
743    unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
744 #endif
745 }
746 
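/* Nothing is tracked here on gfx7/8; the function only checks that exactly
 * one aspect of the image was specified.
 */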
747 void
748 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
749                                     const struct anv_image *image,
750                                     VkImageAspectFlagBits aspect,
751                                     enum isl_aux_usage aux_usage,
752                                     uint32_t level,
753                                     uint32_t base_layer,
754                                     uint32_t layer_count)
755 {
756    /* The aspect must be exactly one of the image aspects. */
757    assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
758 }
759 
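/* Write the packed clear-color dword for the image with MI_STORE_DATA_IMM.
 * The asserts below document that only 0/1 channel values are supported,
 * matching the fast-clear restrictions on these gens.
 */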
760 static void
761 set_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
762                       const struct anv_image *image,
763                       const VkImageAspectFlags aspect,
764                       const union isl_color_value clear_color)
765 {
766    uint32_t plane = anv_image_aspect_to_plane(image, aspect);
767    enum isl_format format = image->planes[plane].primary_surface.isl.format;
768 
769    struct anv_address addr =
770       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
771    assert(!anv_address_is_null(addr));
772 
773    anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
774       sdi.Address = addr;
775       if (GFX_VERx10 >= 75) {
776          /* On HSW+, the RENDER_SURFACE_STATE dword containing the clear
777           * values also contains other fields. The dword constructed here
778           * will later be copied onto a surface state as-is. So, initialize
779           * those fields to match the values that we typically expect in a
780           * surface.
781           *
782           * XXX: Handle other values for ShaderChannelSelect and
783           *      ResourceMinLOD.
784           */
785          sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
786                              ISL_CHANNEL_SELECT_GREEN << 22 |
787                              ISL_CHANNEL_SELECT_BLUE  << 19 |
788                              ISL_CHANNEL_SELECT_ALPHA << 16;
789       }
790       if (isl_format_has_int_channel(format)) {
791          for (unsigned i = 0; i < 4; i++) {
792             assert(clear_color.u32[i] == 0 ||
793                    clear_color.u32[i] == 1);
794          }
795          sdi.ImmediateData |= (clear_color.u32[0] != 0) << 31;
796          sdi.ImmediateData |= (clear_color.u32[1] != 0) << 30;
797          sdi.ImmediateData |= (clear_color.u32[2] != 0) << 29;
798          sdi.ImmediateData |= (clear_color.u32[3] != 0) << 28;
799       } else {
800          for (unsigned i = 0; i < 4; i++) {
801             assert(clear_color.f32[i] == 0.0f ||
802                    clear_color.f32[i] == 1.0f);
803          }
804          sdi.ImmediateData |= (clear_color.f32[0] != 0.0f) << 31;
805          sdi.ImmediateData |= (clear_color.f32[1] != 0.0f) << 30;
806          sdi.ImmediateData |= (clear_color.f32[2] != 0.0f) << 29;
807          sdi.ImmediateData |= (clear_color.f32[3] != 0.0f) << 28;
808       }
809    }
810 }
811 
812 /* Copy the fast-clear value dword(s) between a surface state object and an
813  * image's fast clear state buffer.
814  */
815 static void
816 genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
817                              struct anv_state surface_state,
818                              const struct anv_image *image,
819                              VkImageAspectFlagBits aspect,
820                              bool copy_from_surface_state)
821 {
822    assert(cmd_buffer && image);
823    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
824 
825    struct anv_address ss_clear_addr = {
826       .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
827       .offset = surface_state.offset +
828                 cmd_buffer->device->isl_dev.ss.clear_value_offset,
829    };
830    const struct anv_address entry_addr =
831       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
832    unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
833 
834 #if GFX_VER == 7
835    /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
836     * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
837     * in-flight when they are issued even if the memory touched is not
838     * currently active for rendering.  The weird bit is that it is not the
839     * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
840     * rendering hangs such that the next stalling command after the
841     * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
842     *
843     * It is unclear exactly why this hang occurs.  Both MI commands come with
844     * warnings about the 3D pipeline but that doesn't seem to fully explain
845     * it.  My (Faith's) best theory is that it has something to do with the
846     * fact that we're using a GPU state register as our temporary and that
847     * something with reading/writing it is causing problems.
848     *
849     * In order to work around this issue, we emit a PIPE_CONTROL with the
850     * command streamer stall bit set.
851     */
852    anv_add_pending_pipe_bits(cmd_buffer,
853                              ANV_PIPE_CS_STALL_BIT,
854                              "after copy_fast_clear_dwords. Avoid potential hang");
855    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
856 #endif
857 
858    struct mi_builder b;
859    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
860 
861    if (copy_from_surface_state) {
862       mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
863    } else {
864       mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
865 
866       /* Updating a surface state object may require that the state cache be
867        * invalidated. From the SKL PRM, Shared Functions -> State -> State
868        * Caching:
869        *
870        *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
871        *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
872        *    modified [...], the L1 state cache must be invalidated to ensure
873        *    the new surface or sampler state is fetched from system memory.
874        *
875        * In testing, SKL doesn't actually seem to need this, but HSW does.
876        */
877       anv_add_pending_pipe_bits(cmd_buffer,
878                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
879                                 "after copy_fast_clear_dwords surface state update");
880    }
881 }
882 
883 /**
884  * @brief Transitions a color buffer from one layout to another.
885  *
886  * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
887  * more information.
888  *
889  * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
890  * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
891  *                    this represents the maximum layers to transition at each
892  *                    specified miplevel.
893  */
894 static void
895 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
896                         const struct anv_image *image,
897                         VkImageAspectFlagBits aspect,
898                         const uint32_t base_level, uint32_t level_count,
899                         uint32_t base_layer, uint32_t layer_count,
900                         VkImageLayout initial_layout,
901                         VkImageLayout final_layout,
902                         uint32_t src_queue_family,
903                         uint32_t dst_queue_family,
904                         bool will_full_fast_clear)
905 {
906    struct anv_device *device = cmd_buffer->device;
907    const struct intel_device_info *devinfo = device->info;
908    /* Validate the inputs. */
909    assert(cmd_buffer);
910    assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
911    /* These values aren't supported for simplicity's sake. */
912    assert(level_count != VK_REMAINING_MIP_LEVELS &&
913           layer_count != VK_REMAINING_ARRAY_LAYERS);
914    /* Ensure the subresource range is valid. */
915    UNUSED uint64_t last_level_num = base_level + level_count;
916    const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
917    UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
918    assert((uint64_t)base_layer + layer_count  <= image_layers);
919    assert(last_level_num <= image->vk.mip_levels);
920    /* If there is a layout transition, the final layout cannot be undefined or
921     * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
922     */
923    assert(initial_layout == final_layout ||
924           (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
925            final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
926    const struct isl_drm_modifier_info *isl_mod_info =
927       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
928       ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
929       : NULL;
930 
931    const bool src_queue_external =
932       src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
933       src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
934 
935    const bool dst_queue_external =
936       dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
937       dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
938 
939    /* Simultaneous acquire and release on external queues is illegal. */
940    assert(!src_queue_external || !dst_queue_external);
941 
942    /* Ownership transition on an external queue requires special action if the
943     * image has a DRM format modifier because we store image data in
944     * a driver-private bo which is inaccessible to the external queue.
945     */
946    const bool private_binding_acquire =
947       src_queue_external &&
948       anv_image_is_externally_shared(image) &&
949       anv_image_has_private_binding(image);
950 
951    const bool private_binding_release =
952       dst_queue_external &&
953       anv_image_is_externally_shared(image) &&
954       anv_image_has_private_binding(image);
955 
956    if (initial_layout == final_layout &&
957        !private_binding_acquire && !private_binding_release) {
958       /* No work is needed. */
959        return;
960    }
961 
962    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
963 
964    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
965        final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
966       /* This surface is a linear compressed image with a tiled shadow surface
967        * for texturing.  The client is about to use it in READ_ONLY_OPTIMAL so
968        * we need to ensure the shadow copy is up-to-date.
969        */
970       assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
971       assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
972       assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
973       assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
974       assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
975       assert(plane == 0);
976       anv_image_copy_to_shadow(cmd_buffer, image,
977                                VK_IMAGE_ASPECT_COLOR_BIT,
978                                base_level, level_count,
979                                base_layer, layer_count);
980    }
981 
982    if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
983       return;
984 
985    assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
986 
987    /* The following layouts are equivalent for non-linear images. */
988    const bool initial_layout_undefined =
989       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
990       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
991 
992    bool must_init_fast_clear_state = false;
993    bool must_init_aux_surface = false;
994 
995    if (initial_layout_undefined) {
996       /* The subresource may have been aliased and populated with arbitrary
997        * data.
998        */
999       must_init_fast_clear_state = true;
1000       must_init_aux_surface = true;
1001    } else if (private_binding_acquire) {
1002       /* The fast clear state lives in a driver-private bo, and therefore the
1003        * external/foreign queue is unaware of it.
1004        *
1005        * If this is the first time we are accessing the image, then the fast
1006        * clear state is uninitialized.
1007        *
1008        * If this is NOT the first time we are accessing the image, then the fast
1009        * clear state may still be valid and correct due to the resolve during
1010        * our most recent ownership release.  However, we do not track the aux
1011        * state with MI stores, and therefore must assume the worst-case: that
1012        * this is the first time we are accessing the image.
1013        */
1014       assert(image->planes[plane].fast_clear_memory_range.binding ==
1015               ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1016       must_init_fast_clear_state = true;
1017 
1018       /* The aux surface, like the fast clear state, lives in
1019        * a driver-private bo.  We must initialize the aux surface for the
1020        * same reasons we must initialize the fast clear state.
1021        */
1022       assert(image->planes[plane].aux_surface.memory_range.binding ==
1023              ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1024       must_init_aux_surface = true;
1025    }
1026 
1027    if (must_init_fast_clear_state) {
1028       if (base_level == 0 && base_layer == 0) {
1029          const union isl_color_value zero_color = {};
1030          set_image_clear_color(cmd_buffer, image, aspect, zero_color);
1031          set_image_fast_clear_state(cmd_buffer, image, aspect,
1032                                     ANV_FAST_CLEAR_NONE);
1033       }
1034    }
1035 
1036    if (must_init_aux_surface) {
1037       assert(must_init_fast_clear_state);
1038 
1039       /* Initialize the aux buffers to enable correct rendering.  In order to
1040        * ensure that things such as storage images work correctly, aux buffers
1041        * need to be initialized to valid data.
1042        *
1043        * Having an aux buffer with invalid data is a problem for two reasons:
1044        *
1045        *  1) Having an invalid value in the buffer can confuse the hardware.
1046        *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1047        *     invalid and leads to the hardware doing strange things.  It
1048        *     doesn't hang as far as we can tell but rendering corruption can
1049        *     occur.
1050        *
1051        *  2) If this transition is into the GENERAL layout and we then use the
1052        *     image as a storage image, then we must have the aux buffer in the
1053        *     pass-through state so that, if we then go to texture from the
1054        *     image, we get the results of our storage image writes and not the
1055        *     fast clear color or other random data.
1056        *
1057        * For CCS both of the problems above are real demonstrable issues.  In
1058        * that case, the only thing we can do is to perform an ambiguate to
1059        * transition the aux surface into the pass-through state.
1060        *
1061        * For MCS, (2) is never an issue because we don't support multisampled
1062        * storage images.  In theory, issue (1) is a problem with MCS but we've
1063        * never seen it in the wild.  For 4x and 16x, all bit patterns could, in
1064        * theory, be interpreted as something but we don't know that all bit
1065        * patterns are actually valid.  For 2x and 8x, you could easily end up
1066        * with the MCS referring to an invalid plane because not all bits of
1067        * the MCS value are actually used.  Even though we've never seen issues
1068        * in the wild, it's best to play it safe and initialize the MCS.  We
1069        * can use a fast-clear for MCS because we only ever touch from render
1070        * and texture (no image load store).
1071        */
1072       if (image->vk.samples == 1) {
1073          for (uint32_t l = 0; l < level_count; l++) {
1074             const uint32_t level = base_level + l;
1075 
1076             uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1077             if (base_layer >= aux_layers)
1078                break; /* We will only get fewer layers as level increases */
1079             uint32_t level_layer_count =
1080                MIN2(layer_count, aux_layers - base_layer);
1081 
1082             /* If will_full_fast_clear is set, the caller promises to
1083              * fast-clear the largest portion of the specified range as it can.
1084              * For color images, that means only the first LOD and array slice.
1085              */
1086             if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1087                base_layer++;
1088                level_layer_count--;
1089                if (level_layer_count == 0)
1090                   continue;
1091             }
1092 
1093             anv_image_ccs_op(cmd_buffer, image,
1094                              image->planes[plane].primary_surface.isl.format,
1095                              ISL_SWIZZLE_IDENTITY,
1096                              aspect, level, base_layer, level_layer_count,
1097                              ISL_AUX_OP_AMBIGUATE, NULL, false);
1098          }
1099       } else {
1100          if (image->vk.samples == 4 || image->vk.samples == 16) {
1101             anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1102                           "Doing a potentially unnecessary fast-clear to "
1103                           "define an MCS buffer.");
1104          }
1105 
1106          /* If will_full_fast_clear is set, the caller promises to fast-clear
1107           * the largest portion of the specified range as it can.
1108           */
1109          if (will_full_fast_clear)
1110             return;
1111 
1112          assert(base_level == 0 && level_count == 1);
1113          anv_image_mcs_op(cmd_buffer, image,
1114                           image->planes[plane].primary_surface.isl.format,
1115                           ISL_SWIZZLE_IDENTITY,
1116                           aspect, base_layer, layer_count,
1117                           ISL_AUX_OP_FAST_CLEAR, NULL, false);
1118       }
1119       return;
1120    }
1121 
1122    enum isl_aux_usage initial_aux_usage =
1123       anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1124    enum isl_aux_usage final_aux_usage =
1125       anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1126    enum anv_fast_clear_type initial_fast_clear =
1127       anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1128    enum anv_fast_clear_type final_fast_clear =
1129       anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1130 
1131    /* We must override the anv_layout_to_* functions because they are unaware of
1132     * acquire/release direction.
1133     */
1134    if (private_binding_acquire) {
1135       assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
1136       initial_aux_usage = ISL_AUX_USAGE_NONE;
1137       initial_fast_clear = ANV_FAST_CLEAR_NONE;
1138    } else if (private_binding_release) {
1139       assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
1140       final_aux_usage = ISL_AUX_USAGE_NONE;
1141       final_fast_clear = ANV_FAST_CLEAR_NONE;
1142    }
1143 
1144    /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1145     * We can handle transitions between CCS_D/E to and from NONE.  What we
1146     * don't yet handle is switching between CCS_E and CCS_D within a given
1147     * image.  Doing so in a performant way requires more detailed aux state
1148     * tracking such as what is done in i965.  For now, just assume that we
1149     * only have one type of compression.
1150     */
1151    assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1152           final_aux_usage == ISL_AUX_USAGE_NONE ||
1153           initial_aux_usage == final_aux_usage);
1154 
1155    /* If initial aux usage is NONE, there is nothing to resolve */
1156    if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1157       return;
1158 
1159    enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1160 
1161    /* If the initial layout supports more fast-clear modes than the final
1162     * layout, then we need at least a partial resolve.
1163     */
1164    if (final_fast_clear < initial_fast_clear)
1165       resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1166 
1167    if (resolve_op == ISL_AUX_OP_NONE)
1168       return;
1169 
1170    /* Perform a resolve to synchronize data between the main and aux buffer.
1171     * Before we begin, we must satisfy the cache flushing requirement specified
1172     * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1173     *
1174     *    Any transition from any value in {Clear, Render, Resolve} to a
1175     *    different value in {Clear, Render, Resolve} requires end of pipe
1176     *    synchronization.
1177     *
1178     * We perform a flush of the write cache before and after the clear and
1179     * resolve operations to meet this requirement.
1180     *
1181     * Unlike other drawing, fast clear operations are not properly
1182     * synchronized. The first PIPE_CONTROL here likely ensures that the
1183     * contents of the previous render or clear hit the render target before we
1184     * resolve and the second likely ensures that the resolve is complete before
1185     * we do any more rendering or clearing.
1186     */
1187    anv_add_pending_pipe_bits(cmd_buffer,
1188                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1189                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1190                              "after transition RT");
1191 
1192    for (uint32_t l = 0; l < level_count; l++) {
1193       uint32_t level = base_level + l;
1194 
1195       uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1196       if (base_layer >= aux_layers)
1197          break; /* We will only get fewer layers as level increases */
1198       uint32_t level_layer_count =
1199          MIN2(layer_count, aux_layers - base_layer);
1200 
1201       for (uint32_t a = 0; a < level_layer_count; a++) {
1202          uint32_t array_layer = base_layer + a;
1203 
1204          /* If will_full_fast_clear is set, the caller promises to fast-clear
1205           * the largest portion of the specified range that it can.  For color
1206           * images, that means only the first LOD and array slice.
1207           */
1208          if (level == 0 && array_layer == 0 && will_full_fast_clear)
1209             continue;
1210 
1211          if (image->vk.samples == 1) {
1212             anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1213                                            image->planes[plane].primary_surface.isl.format,
1214                                            ISL_SWIZZLE_IDENTITY,
1215                                            aspect, level, array_layer, resolve_op,
1216                                            final_fast_clear);
1217          } else {
1218             /* We only support fast-clear on the first layer, so partial
1219              * resolves should not be used on other layers as they would use
1220              * the clear color stored in memory, which is only valid for layer 0.
1221              */
1222             if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1223                 array_layer != 0)
1224                continue;
1225 
1226             anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1227                                            image->planes[plane].primary_surface.isl.format,
1228                                            ISL_SWIZZLE_IDENTITY,
1229                                            aspect, array_layer, resolve_op,
1230                                            final_fast_clear);
1231          }
1232       }
1233    }
1234 
1235    anv_add_pending_pipe_bits(cmd_buffer,
1236                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1237                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1238                              "after transition RT");
1239 }
1240 
1241 static MUST_CHECK VkResult
1242 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1243                                 uint32_t color_att_count)
1244 {
1245    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1246 
1247    /* Reserve one for the NULL state. */
1248    unsigned num_states = 1 + color_att_count;
1249    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1250    const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
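   /* For illustration only (the numbers are an assumption, not guaranteed by
    * this code): with isl_dev->ss.size == 64 and isl_dev->ss.align == 64,
    * ss_stride is 64 bytes, so two color attachments allocate 3 * 64 = 192
    * bytes: one NULL surface state followed by one state per attachment.
    */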
1251    gfx->att_states =
1252       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1253                              num_states * ss_stride, isl_dev->ss.align);
1254    if (gfx->att_states.map == NULL) {
1255       return anv_batch_set_error(&cmd_buffer->batch,
1256                                  VK_ERROR_OUT_OF_DEVICE_MEMORY);
1257    }
1258 
1259    struct anv_state next_state = gfx->att_states;
1260    next_state.alloc_size = isl_dev->ss.size;
1261 
1262    gfx->null_surface_state = next_state;
1263    next_state.offset += ss_stride;
1264    next_state.map += ss_stride;
1265 
1266    gfx->color_att_count = color_att_count;
1267    for (uint32_t i = 0; i < color_att_count; i++) {
1268       gfx->color_att[i] = (struct anv_attachment) {
1269          .surface_state.state = next_state,
1270       };
1271       next_state.offset += ss_stride;
1272       next_state.map += ss_stride;
1273    }
1274    gfx->depth_att = (struct anv_attachment) { };
1275    gfx->stencil_att = (struct anv_attachment) { };
1276 
1277    return VK_SUCCESS;
1278 }
1279 
1280 static void
1281 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1282 {
1283    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1284 
1285    gfx->render_area = (VkRect2D) { };
1286    gfx->layer_count = 0;
1287    gfx->samples = 0;
1288 
1289    gfx->color_att_count = 0;
1290    gfx->depth_att = (struct anv_attachment) { };
1291    gfx->stencil_att = (struct anv_attachment) { };
1292    gfx->null_surface_state = ANV_STATE_NULL;
1293 }
1294 
1295 VkResult
1296 genX(BeginCommandBuffer)(
1297     VkCommandBuffer                             commandBuffer,
1298     const VkCommandBufferBeginInfo*             pBeginInfo)
1299 {
1300    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1301    VkResult result;
1302 
1303    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1304     * command buffer's state. Otherwise, we must *reset* its state. In
1305     * practice, an unconditional reset covers both cases.
1306     *
1307     * From the Vulkan 1.0 spec:
1308     *
1309     *    If a command buffer is in the executable state and the command buffer
1310     *    was allocated from a command pool with the
1311     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1312     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
1313     *    as if vkResetCommandBuffer had been called with
1314     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1315     *    the command buffer in the recording state.
1316     */
1317    anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
1318    anv_cmd_buffer_reset_rendering(cmd_buffer);
1319 
1320    cmd_buffer->usage_flags = pBeginInfo->flags;
1321 
1322    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1323     * primary level command buffers.
1324     *
1325     * From the Vulkan 1.0 spec:
1326     *
1327     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1328     *    secondary command buffer is considered to be entirely inside a render
1329     *    pass. If this is a primary command buffer, then this bit is ignored.
1330     */
1331    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1332       cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1333 
1334    trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
1335 
1336    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1337 
1338    /* We sometimes store vertex data in the dynamic state buffer for blorp
1339     * operations and our dynamic state stream may re-use data from previous
1340     * command buffers.  In order to prevent stale cache data, we flush the VF
1341     * cache.  We could do this on every blorp call but that's not really
1342     * needed as all of the data will get written by the CPU prior to the GPU
1343     * executing anything.  The chances are fairly high that blorp will be
1344     * used at least once per primary command buffer, so the invalidation
1345     * shouldn't be wasted.
1346     *
1347     * There is also a workaround on gfx8 which requires us to invalidate the
1348     * VF cache occasionally.  It's easier if we can assume we start with a
1349     * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1350     */
1351    anv_add_pending_pipe_bits(cmd_buffer,
1352                              ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1353                              "new cmd buffer");
1354 
1355    /* We send an "Indirect State Pointers Disable" packet at
1356     * EndCommandBuffer, so all push constant packets are ignored during a
1357     * context restore. Documentation says after that command, we need to
1358     * emit push constants again before any rendering operation. So we
1359     * flag them dirty here to make sure they get emitted.
1360     */
1361    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1362 
1363    if (cmd_buffer->usage_flags &
1364        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1365       struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1366 
1367       char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
1368       const VkRenderingInfo *resume_info =
1369          vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
1370                                                                pBeginInfo,
1371                                                                gcbiar_data);
1372       if (resume_info != NULL) {
1373          genX(CmdBeginRendering)(commandBuffer, resume_info);
1374       } else {
1375          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
1376             vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1377                                                              pBeginInfo);
1378          assert(inheritance_info);
1379 
1380          gfx->rendering_flags = inheritance_info->flags;
1381          gfx->render_area = (VkRect2D) { };
1382          gfx->layer_count = 0;
1383          gfx->samples = inheritance_info->rasterizationSamples;
1384          gfx->view_mask = inheritance_info->viewMask;
1385 
1386          uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1387          result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
1388          if (result != VK_SUCCESS)
1389             return result;
1390 
1391          for (uint32_t i = 0; i < color_att_count; i++) {
1392             gfx->color_att[i].vk_format =
1393                inheritance_info->pColorAttachmentFormats[i];
1394          }
1395          gfx->depth_att.vk_format =
1396             inheritance_info->depthAttachmentFormat;
1397          gfx->stencil_att.vk_format =
1398             inheritance_info->stencilAttachmentFormat;
1399 
1400          cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1401 
1402          anv_cmd_graphic_state_update_has_uint_rt(gfx);
1403       }
1404    }
1405 
1406 #if GFX_VER >= 8
1407    /* Emit the sample pattern at the beginning of the batch because the
1408     * default locations emitted at device initialization might have been
1409     * changed by a previous command buffer.
1410     *
1411     * Do not change that when we're continuing a previous renderpass.
1412     */
1413    if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
1414        !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1415       genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
1416 #endif
1417 
1418 #if GFX_VERx10 >= 75
1419    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1420       const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1421          vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1422 
1423       /* If the secondary buffer supports conditional rendering,
1424        * we should emit commands as if conditional rendering were enabled.
1425        */
1426       cmd_buffer->state.conditional_render_enabled =
1427          conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1428    }
1429 #endif
1430 
1431    return VK_SUCCESS;
1432 }
1433 
1434 /* From the PRM, Volume 2a:
1435  *
1436  *    "Indirect State Pointers Disable
1437  *
1438  *    At the completion of the post-sync operation associated with this pipe
1439  *    control packet, the indirect state pointers in the hardware are
1440  *    considered invalid; the indirect pointers are not saved in the context.
1441  *    If any new indirect state commands are executed in the command stream
1442  *    while the pipe control is pending, the new indirect state commands are
1443  *    preserved.
1444  *
1445  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1446  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1447  *    commands are only considered as Indirect State Pointers. Once ISP is
1448  *    issued in a context, SW must initialize by programming push constant
1449  *    commands for all the shaders (at least to zero length) before attempting
1450  *    any rendering operation for the same context."
1451  *
1452  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1453  * even though they point to a BO that has been already unreferenced at
1454  * the end of the previous batch buffer. This has been fine so far since
1455     * we are protected by the scratch page (every address not covered by
1456     * a BO should point to the scratch page). But on CNL, this causes
1457     * a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1458  * instruction.
1459  *
1460  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1461  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1462  * context restore, so the mentioned hang doesn't happen. However,
1463  * software must program push constant commands for all stages prior to
1464  * rendering anything. So we flag them dirty in BeginCommandBuffer.
1465  *
1466  * Finally, we also stall at the pixel scoreboard to make sure the
1467  * constants have been loaded into the EUs prior to disabling the push
1468  * constants, so that doing so doesn't hang a previous 3DPRIMITIVE.
1469  */
1470 static void
1471 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1472 {
1473    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1474          pc.StallAtPixelScoreboard = true;
1475          pc.CommandStreamerStallEnable = true;
1476          anv_debug_dump_pc(pc);
1477    }
1478    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1479          pc.IndirectStatePointersDisable = true;
1480          pc.CommandStreamerStallEnable = true;
1481          anv_debug_dump_pc(pc);
1482    }
1483 }
1484 
1485 VkResult
1486 genX(EndCommandBuffer)(
1487     VkCommandBuffer                             commandBuffer)
1488 {
1489    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1490 
1491    if (anv_batch_has_error(&cmd_buffer->batch))
1492       return cmd_buffer->batch.status;
1493 
1494    anv_measure_endcommandbuffer(cmd_buffer);
1495 
1496    /* We want every command buffer to start with the PMA fix in a known state,
1497     * so we disable it at the end of the command buffer.
1498     */
1499    genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1500 
1501    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1502 
1503    emit_isp_disable(cmd_buffer);
1504 
1505    trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
1506 
1507    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1508 
1509    return VK_SUCCESS;
1510 }
1511 
1512 void
1513 genX(CmdExecuteCommands)(
1514     VkCommandBuffer                             commandBuffer,
1515     uint32_t                                    commandBufferCount,
1516     const VkCommandBuffer*                      pCmdBuffers)
1517 {
1518    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1519 
1520    assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1521 
1522    if (anv_batch_has_error(&primary->batch))
1523       return;
1524 
1525    /* The secondary command buffers will assume that the PMA fix is disabled
1526     * when they begin executing.  Make sure this is true.
1527     */
1528    genX(cmd_buffer_enable_pma_fix)(primary, false);
1529 
1530    /* The secondary command buffers don't know which textures etc. have been
1531     * flushed prior to their execution.  Apply those flushes now.
1532     */
1533    genX(cmd_buffer_apply_pipe_flushes)(primary);
1534 
1535    for (uint32_t i = 0; i < commandBufferCount; i++) {
1536       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1537 
1538       assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1539       assert(!anv_batch_has_error(&secondary->batch));
1540 
1541 #if GFX_VERx10 >= 75
1542       if (secondary->state.conditional_render_enabled) {
1543          if (!primary->state.conditional_render_enabled) {
1544             /* The secondary buffer is constructed as if it will be executed
1545              * with conditional rendering, so we must satisfy this dependency
1546              * regardless of whether conditional rendering is enabled in the primary.
1547              */
1548             struct mi_builder b;
1549             mi_builder_init(&b, primary->device->info, &primary->batch);
1550             mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1551                          mi_imm(UINT64_MAX));
1552          }
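         /* Writing all-ones into ANV_PREDICATE_RESULT_REG is intended to make
          * the inherited predicate read as "render", so the secondary's
          * conditionally-emitted commands execute unconditionally.  (This is
          * an interpretation of the code above, not a statement from the HW
          * documentation.)
          */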
1553       }
1554 #endif
1555 
1556       if (secondary->usage_flags &
1557           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1558          /* If we're continuing a render pass from the primary, we need to
1559           * copy the surface states for the current subpass into the storage
1560           * we allocated for them in BeginCommandBuffer.
1561           */
1562          struct anv_bo *ss_bo =
1563             primary->device->surface_state_pool.block_pool.bo;
1564          struct anv_state src_state = primary->state.gfx.att_states;
1565          struct anv_state dst_state = secondary->state.gfx.att_states;
1566          assert(src_state.alloc_size == dst_state.alloc_size);
1567 
1568          genX(cmd_buffer_so_memcpy)(primary,
1569                                     (struct anv_address) {
1570                                        .bo = ss_bo,
1571                                        .offset = dst_state.offset,
1572                                     },
1573                                     (struct anv_address) {
1574                                        .bo = ss_bo,
1575                                        .offset = src_state.offset,
1576                                     },
1577                                     src_state.alloc_size);
1578       }
1579 
1580       anv_cmd_buffer_add_secondary(primary, secondary);
1581 
1582       assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1583              secondary->perf_query_pool == primary->perf_query_pool);
1584       if (secondary->perf_query_pool)
1585          primary->perf_query_pool = secondary->perf_query_pool;
1586    }
1587 
1588    /* The secondary isn't counted in our VF cache tracking so we need to
1589     * invalidate the whole thing.
1590     */
1591    if (GFX_VER == 8) {
1592       anv_add_pending_pipe_bits(primary,
1593                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1594                                 "Secondary cmd buffer not tracked in VF cache");
1595    }
1596 
1597    /* The secondary may have selected a different pipeline (3D or compute) and
1598     * may have changed the current L3$ configuration.  Reset our tracking
1599     * variables to invalid values to ensure that we re-emit these in the case
1600     * where we do any draws or compute dispatches from the primary after the
1601     * secondary has returned.
1602     */
1603    primary->state.current_pipeline = UINT32_MAX;
1604    primary->state.current_l3_config = NULL;
1605    primary->state.current_hash_scale = 0;
1606    primary->state.gfx.push_constant_stages = 0;
1607    vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
1608 
1609    /* Each of the secondary command buffers will use its own state base
1610     * address.  We need to re-emit state base address for the primary after
1611     * all of the secondaries are done.
1612     *
1613     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1614     * address calls?
1615     */
1616    genX(cmd_buffer_emit_state_base_address)(primary);
1617 }
1618 
1619 /**
1620  * Program the hardware to use the specified L3 configuration.
1621  */
1622 void
1623 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1624                            const struct intel_l3_config *cfg)
1625 {
1626    assert(cfg);
1627    if (cfg == cmd_buffer->state.current_l3_config)
1628       return;
1629 
1630    if (INTEL_DEBUG(DEBUG_L3)) {
1631       mesa_logd("L3 config transition: ");
1632       intel_dump_l3_config(cfg, stderr);
1633    }
1634 
1635    /* According to the hardware docs, the L3 partitioning can only be changed
1636     * while the pipeline is completely drained and the caches are flushed,
1637     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1638     */
1639    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1640       pc.DCFlushEnable = true;
1641       pc.PostSyncOperation = NoWrite;
1642       pc.CommandStreamerStallEnable = true;
1643       anv_debug_dump_pc(pc);
1644    }
1645 
1646    /* ...followed by a second pipelined PIPE_CONTROL that initiates
1647     * invalidation of the relevant caches.  Note that because RO invalidation
1648     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1649     * command is processed by the CS) we cannot combine it with the previous
1650     * stalling flush as the hardware documentation suggests, because that
1651     * would cause the CS to stall on previous rendering *after* RO
1652     * invalidation and wouldn't prevent the RO caches from being polluted by
1653     * concurrent rendering before the stall completes.  This intentionally
1654     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1655     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1656     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1657     * already guarantee that there is no concurrent GPGPU kernel execution
1658     * (see SKL HSD 2132585).
1659     */
1660    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1661       pc.TextureCacheInvalidationEnable = true;
1662       pc.ConstantCacheInvalidationEnable = true;
1663       pc.InstructionCacheInvalidateEnable = true;
1664       pc.StateCacheInvalidationEnable = true;
1665       pc.PostSyncOperation = NoWrite;
1666       anv_debug_dump_pc(pc);
1667    }
1668 
1669    /* Now send a third stalling flush to make sure that invalidation is
1670     * complete when the L3 configuration registers are modified.
1671     */
1672    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1673       pc.DCFlushEnable = true;
1674       pc.PostSyncOperation = NoWrite;
1675       pc.CommandStreamerStallEnable = true;
1676       anv_debug_dump_pc(pc);
1677    }
1678 
1679    genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1680    cmd_buffer->state.current_l3_config = cfg;
1681 }
1682 
1683 ALWAYS_INLINE enum anv_pipe_bits
1684 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1685                               struct anv_device *device,
1686                               uint32_t current_pipeline,
1687                               enum anv_pipe_bits bits)
1688 {
1689    /*
1690     * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1691     *
1692     *    Write synchronization is a special case of end-of-pipe
1693     *    synchronization that requires that the render cache and/or depth
1694     *    related caches are flushed to memory, where the data will become
1695     *    globally visible. This type of synchronization is required prior to
1696     *    SW (CPU) actually reading the result data from memory, or initiating
1697     *    an operation that will use as a read surface (such as a texture
1698     *    surface) a previous render target and/or depth/stencil buffer
1699     *
1700     *
1701     * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1702     *
1703     *    Exercising the write cache flush bits (Render Target Cache Flush
1704     *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1705     *    ensures the write caches are flushed and doesn't guarantee the data
1706     *    is globally visible.
1707     *
1708     *    SW can track the completion of the end-of-pipe-synchronization by
1709     *    using "Notify Enable" and "PostSync Operation - Write Immediate
1710     *    Data" in the PIPE_CONTROL command.
1711     *
1712     * In other words, flushes are pipelined while invalidations are handled
1713     * immediately.  Therefore, if we're flushing anything then we need to
1714     * schedule an end-of-pipe sync before any invalidations can happen.
1715     */
1716    if (bits & ANV_PIPE_FLUSH_BITS)
1717       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1718 
1719    /* If we're going to do an invalidate and we have a pending end-of-pipe
1720     * sync that has yet to be resolved, we do the end-of-pipe sync now.
1721     */
1722    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1723        (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1724       bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1725       bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1726    }
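   /* Worked example (a sketch of one common case, not an exhaustive list):
    * if the caller passed ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT together
    * with ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, the flush bit sets
    * NEEDS_END_OF_PIPE_SYNC above and this block then promotes it to a real
    * END_OF_PIPE_SYNC, so the invalidation below cannot overtake the flush.
    */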
1727 
1728    /* Project: SKL / Argument: LRI Post Sync Operation [23]
1729     *
1730     * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1731     *  programmed prior to programming a PIPECONTROL command with "LRI
1732     *  Post Sync Operation" in GPGPU mode of operation (i.e when
1733     *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
1734     *
1735     * The same text exists a few rows below for Post Sync Op.
1736     */
1737    if (bits & ANV_PIPE_POST_SYNC_BIT)
1738       bits &= ~ANV_PIPE_POST_SYNC_BIT;
1739 
1740    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1741                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1742       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1743          /* Flushing HDC pipeline requires DC Flush on earlier HW. */
1744          pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
1745          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
1746          pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
1747          pipe.RenderTargetCacheFlushEnable =
1748             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
1749 
1750          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
1751 #if GFX_VER == 8
1752          /* From Broadwell PRM, volume 2a:
1753           *    PIPE_CONTROL: Command Streamer Stall Enable:
1754           *
1755           *    "This bit must be always set when PIPE_CONTROL command is
1756           *     programmed by GPGPU and MEDIA workloads, except for the cases
1757           *     when only Read Only Cache Invalidation bits are set (State
1758           *     Cache Invalidation Enable, Instruction cache Invalidation
1759           *     Enable, Texture Cache Invalidation Enable, Constant Cache
1760           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
1761           *     need not implemented when FF_DOP_CG is disabled."
1762           *
1763           *    Since we do all the invalidation in the following PIPE_CONTROL,
1764           *    if we got here, we need a stall.
1765           */
1766          pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU;
1767 #endif
1768 
1769          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
1770 
1771          /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1772           *
1773           *    "The most common action to perform upon reaching a
1774           *    synchronization point is to write a value out to memory. An
1775           *    immediate value (included with the synchronization command) may
1776           *    be written."
1777           *
1778           *
1779           * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1780           *
1781           *    "In case the data flushed out by the render engine is to be
1782           *    read back in to the render engine in coherent manner, then the
1783           *    render engine has to wait for the fence completion before
1784           *    accessing the flushed data. This can be achieved by following
1785           *    means on various products: PIPE_CONTROL command with CS Stall
1786           *    and the required write caches flushed with Post-Sync-Operation
1787           *    as Write Immediate Data.
1788           *
1789           *    Example:
1790           *       - Workload-1 (3D/GPGPU/MEDIA)
1791           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1792           *         Immediate Data, Required Write Cache Flush bits set)
1793           *       - Workload-2 (Can use the data produce or output by
1794           *         Workload-1)
1795           */
1796          if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1797             pipe.CommandStreamerStallEnable = true;
1798             pipe.PostSyncOperation = WriteImmediateData;
1799             pipe.Address = device->workaround_address;
1800          }
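         /* Note: device->workaround_address points at a driver-allocated
          * scratch location.  The immediate value written there is throwaway;
          * the write only acts as the end-of-pipe fence (and on Haswell it is
          * read back below via MI_LOAD_REGISTER_MEM purely to make the CS
          * wait for it).
          */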
1801 
1802          /*
1803           * According to the Broadwell documentation, any PIPE_CONTROL with the
1804           * "Command Streamer Stall" bit set must also have another bit set,
1805           * with six different options:
1806           *
1807           *  - Render Target Cache Flush
1808           *  - Depth Cache Flush
1809           *  - Stall at Pixel Scoreboard
1810           *  - Post-Sync Operation
1811           *  - Depth Stall
1812           *  - DC Flush Enable
1813           *
1814           * I chose "Stall at Pixel Scoreboard" since that's what we use in
1815           * mesa and it seems to work fine. The choice is fairly arbitrary.
1816           */
1817          if (pipe.CommandStreamerStallEnable &&
1818              !pipe.RenderTargetCacheFlushEnable &&
1819              !pipe.DepthCacheFlushEnable &&
1820              !pipe.StallAtPixelScoreboard &&
1821              !pipe.PostSyncOperation &&
1822              !pipe.DepthStallEnable &&
1823              !pipe.DCFlushEnable)
1824             pipe.StallAtPixelScoreboard = true;
1825          anv_debug_dump_pc(pipe);
1826       }
1827 
1828       /* If a render target flush was emitted, then we can toggle off the bit
1829        * saying that render target writes are ongoing.
1830        */
1831       if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
1832          bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
1833 
1834       if (GFX_VERx10 == 75) {
1835          /* Haswell needs additional work-arounds:
1836           *
1837           * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1838           *
1839           *    Option 1:
1840           *    PIPE_CONTROL command with the CS Stall and the required write
1841           *    caches flushed with Post-SyncOperation as Write Immediate Data
1842           *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
1843           *    space) commands.
1844           *
1845           *    Example:
1846           *       - Workload-1
1847           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1848           *         Immediate Data, Required Write Cache Flush bits set)
1849           *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
1850           *       - Workload-2 (Can use the data produce or output by
1851           *         Workload-1)
1852           *
1853           * Unfortunately, both the PRMs and the internal docs are a bit
1854           * out-of-date in this regard.  What the windows driver does (and
1855           * out-of-date in this regard.  What the Windows driver does (and
1856           * memory address written by the pipe control above.
1857           *
1858           * What register we load into doesn't matter.  We choose an indirect
1859           * rendering register because we know it always exists and it's one
1860           * of the first registers the command parser allows us to write.  If
1861           * you don't have command parser support in your kernel (pre-4.2),
1862           * this will get turned into MI_NOOP and you won't get the
1863           * workaround.  Unfortunately, there's just not much we can do in
1864           * that case.  This register is perfectly safe to write since we
1865           * always re-load all of the indirect draw registers right before
1866           * 3DPRIMITIVE when needed anyway.
1867           */
1868          anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1869             lrm.RegisterAddress  = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
1870             lrm.MemoryAddress = device->workaround_address;
1871          }
1872       }
1873 
1874       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1875                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1876    }
1877 
1878    if (bits & ANV_PIPE_INVALIDATE_BITS) {
1879       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1880          pipe.StateCacheInvalidationEnable =
1881             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
1882          pipe.ConstantCacheInvalidationEnable =
1883             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
1884          pipe.VFCacheInvalidationEnable =
1885             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
1886          pipe.TextureCacheInvalidationEnable =
1887             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
1888          pipe.InstructionCacheInvalidateEnable =
1889             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
1890 
1891          anv_debug_dump_pc(pipe);
1892       }
1893 
1894       bits &= ~ANV_PIPE_INVALIDATE_BITS;
1895    }
1896 
1897    return bits;
1898 }
1899 
1900 ALWAYS_INLINE void
1901 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1902 {
1903    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1904 
1905    if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1906       bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1907    else if (bits == 0)
1908       return;
1909 
1910    bool trace_flush =
1911       (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
1912    if (trace_flush)
1913       trace_intel_begin_stall(&cmd_buffer->trace);
1914 
1915    if (GFX_VER == 8 &&
1916        (bits & ANV_PIPE_CS_STALL_BIT) &&
1917        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1918       /* If we are doing a VF cache invalidate AND a CS stall (it must be
1919        * both) then we can reset our vertex cache tracking.
1920        */
1921       memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1922              sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1923       memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1924              sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1925    }
1926 
1927    cmd_buffer->state.pending_pipe_bits =
1928       genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1929                                     cmd_buffer->device,
1930                                     cmd_buffer->state.current_pipeline,
1931                                     bits);
1932 
1933    if (trace_flush) {
1934       trace_intel_end_stall(&cmd_buffer->trace, bits,
1935                             anv_pipe_flush_bit_to_ds_stall_flag,
1936                             NULL, NULL, NULL, NULL);
1937    }
1938 }
1939 
1940 static void
1941 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
1942                    const VkDependencyInfo *dep_info,
1943                    const char *reason)
1944 {
1945    /* XXX: Right now, we're really dumb and just flush whatever categories
1946     * the app asks for.  One of these days we may make this a bit better
1947     * but right now that's all the hardware allows for in most areas.
1948     */
1949    VkAccessFlags2 src_flags = 0;
1950    VkAccessFlags2 dst_flags = 0;
1951 
1952    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
1953       src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
1954       dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
1955    }
1956 
1957    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
1958       src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
1959       dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
1960    }
1961 
1962    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
1963       const VkImageMemoryBarrier2 *img_barrier =
1964          &dep_info->pImageMemoryBarriers[i];
1965 
1966       src_flags |= img_barrier->srcAccessMask;
1967       dst_flags |= img_barrier->dstAccessMask;
1968 
1969       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
1970       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
1971 
1972       uint32_t base_layer, layer_count;
1973       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
1974          base_layer = 0;
1975          layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
1976       } else {
1977          base_layer = range->baseArrayLayer;
1978          layer_count = vk_image_subresource_layer_count(&image->vk, range);
1979       }
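      /* Example with made-up numbers: a 3D image whose extent.depth is 32,
       * transitioned at baseMipLevel == 2, covers u_minify(32, 2) == 8 depth
       * slices, whereas an array image simply uses the barrier's
       * baseArrayLayer/layerCount range.
       */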
1980       const uint32_t level_count =
1981          vk_image_subresource_level_count(&image->vk, range);
1982 
1983       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1984          transition_depth_buffer(cmd_buffer, image,
1985                                  base_layer, layer_count,
1986                                  img_barrier->oldLayout,
1987                                  img_barrier->newLayout,
1988                                  false /* will_full_fast_clear */);
1989       }
1990 
1991       if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1992          transition_stencil_buffer(cmd_buffer, image,
1993                                    range->baseMipLevel, level_count,
1994                                    base_layer, layer_count,
1995                                    img_barrier->oldLayout,
1996                                    img_barrier->newLayout,
1997                                    false /* will_full_fast_clear */);
1998 
1999          /* If we are in a renderpass, the gfx7 stencil shadow may need to be
2000           * updated even if the layout doesn't change
2001           */
2002          if (cmd_buffer->state.gfx.samples &&
2003               (img_barrier->dstAccessMask & (VK_ACCESS_2_SHADER_READ_BIT |
2004                                              VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2005                                              VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))) {
2006             const uint32_t plane =
2007                anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
2008             if (anv_surface_is_valid(&image->planes[plane].shadow_surface))
2009                anv_image_copy_to_shadow(cmd_buffer, image,
2010                                         VK_IMAGE_ASPECT_STENCIL_BIT,
2011                                         range->baseMipLevel, level_count,
2012                                         base_layer, layer_count);
2013          }
2014       }
2015 
2016       if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
2017          VkImageAspectFlags color_aspects =
2018             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
2019          anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
2020             transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
2021                                     range->baseMipLevel, level_count,
2022                                     base_layer, layer_count,
2023                                     img_barrier->oldLayout,
2024                                     img_barrier->newLayout,
2025                                     img_barrier->srcQueueFamilyIndex,
2026                                     img_barrier->dstQueueFamilyIndex,
2027                                     false /* will_full_fast_clear */);
2028          }
2029       }
2030    }
2031 
2032    enum anv_pipe_bits bits =
2033       anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
2034       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
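   /* Roughly speaking (the exact mapping lives in the
    * anv_pipe_*_bits_for_access_flags helpers): a srcAccessMask of
    * COLOR_ATTACHMENT_WRITE becomes a render-target cache flush, while a
    * dstAccessMask of SHADER_READ becomes texture/constant cache
    * invalidations.  The bits are only accumulated here; they are emitted by
    * the next genX(cmd_buffer_apply_pipe_flushes).
    */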
2035 
2036    anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2037 }
2038 
2039 void genX(CmdPipelineBarrier2)(
2040     VkCommandBuffer                             commandBuffer,
2041     const VkDependencyInfo*                     pDependencyInfo)
2042 {
2043    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2044 
2045    cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2046 }
2047 
2048 static void
2049 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2050 {
2051    VkShaderStageFlags stages =
2052       cmd_buffer->state.gfx.pipeline->active_stages;
2053 
2054    /* In order to avoid thrashing, we assume that vertex and fragment stages
2055     * always exist.  In the rare case where one is missing *and* the other
2056     * uses push constants, this may be suboptimal.  However, avoiding stalls
2057     * seems more important.
2058     */
2059    stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2060    if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2061       stages |= VK_SHADER_STAGE_VERTEX_BIT;
2062 
2063    if (stages == cmd_buffer->state.gfx.push_constant_stages)
2064       return;
2065 
2066    const unsigned push_constant_kb =
2067       cmd_buffer->device->info->max_constant_urb_size_kb;
2068 
2069    const unsigned num_stages =
2070       util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2071    unsigned size_per_stage = push_constant_kb / num_stages;
2072 
2073    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2074     * units of 2KB.  Incidentally, these are the same platforms that have
2075     * 32KB worth of push constant space.
2076     */
2077    if (push_constant_kb == 32)
2078       size_per_stage &= ~1u;
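   /* Example of the partitioning math (assuming 32KB of push constant space
    * and VS+GS+FS active; purely illustrative, nothing below depends on it):
    *
    *    size_per_stage = 32 / 3 = 10KB   (already even, so &= ~1u is a no-op)
    *    VS: offset  0KB, size 10KB
    *    GS: offset 10KB, size 10KB
    *    FS: offset 20KB, size 32 - 20 = 12KB   (absorbs the leftover 2KB)
    */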
2079 
2080    uint32_t kb_used = 0;
2081    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2082       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2083       anv_batch_emit(&cmd_buffer->batch,
2084                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
2085          alloc._3DCommandSubOpcode  = 18 + i;
2086          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2087          alloc.ConstantBufferSize   = push_size;
2088       }
2089       kb_used += push_size;
2090    }
2091 
2092    anv_batch_emit(&cmd_buffer->batch,
2093                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2094       alloc.ConstantBufferOffset = kb_used;
2095       alloc.ConstantBufferSize = push_constant_kb - kb_used;
2096    }
2097 
2098    cmd_buffer->state.gfx.push_constant_stages = stages;
2099 
2100    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2101     *
2102     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2103     *    the next 3DPRIMITIVE command after programming the
2104     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2105     *
2106     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2107     * pipeline setup, we need to dirty push constants.
2108     */
2109    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2110 }
2111 
2112 static VkResult
2113 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2114                    struct anv_cmd_pipeline_state *pipe_state,
2115                    struct anv_shader_bin *shader,
2116                    struct anv_state *bt_state)
2117 {
2118    uint32_t state_offset;
2119 
2120    struct anv_pipeline_bind_map *map = &shader->bind_map;
2121    if (map->surface_count == 0) {
2122       *bt_state = (struct anv_state) { 0, };
2123       return VK_SUCCESS;
2124    }
2125 
2126    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2127                                                   map->surface_count,
2128                                                   &state_offset);
2129    uint32_t *bt_map = bt_state->map;
2130 
2131    if (bt_state->map == NULL)
2132       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2133 
2134    /* We only need to emit relocs if we're not using softpin.  If we are using
2135     * softpin then we always keep all user-allocated memory objects resident.
2136     */
2137    const bool need_client_mem_relocs =
2138       anv_use_relocations(cmd_buffer->device->physical);
2139    struct anv_push_constants *push = &pipe_state->push_constants;
2140 
2141    for (uint32_t s = 0; s < map->surface_count; s++) {
2142       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2143 
2144       struct anv_state surface_state;
2145 
2146       switch (binding->set) {
2147       case ANV_DESCRIPTOR_SET_NULL:
2148          bt_map[s] = 0;
2149          break;
2150 
2151       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2152          /* Color attachment binding */
2153          assert(shader->stage == MESA_SHADER_FRAGMENT);
2154          if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2155             const struct anv_attachment *att =
2156                &cmd_buffer->state.gfx.color_att[binding->index];
2157             surface_state = att->surface_state.state;
2158          } else {
2159             surface_state = cmd_buffer->state.gfx.null_surface_state;
2160          }
2161          assert(surface_state.map);
2162          bt_map[s] = surface_state.offset + state_offset;
2163          break;
2164 
2165       case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2166          struct anv_state surface_state =
2167             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2168 
2169          struct anv_address constant_data = {
2170             .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2171             .offset = shader->kernel.offset +
2172                       shader->prog_data->const_data_offset,
2173          };
2174          unsigned constant_data_size = shader->prog_data->const_data_size;
2175 
2176          const enum isl_format format =
2177             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2178                                                VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2179          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2180                                        format, ISL_SWIZZLE_IDENTITY,
2181                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2182                                        constant_data, constant_data_size, 1);
2183 
2184          assert(surface_state.map);
2185          bt_map[s] = surface_state.offset + state_offset;
2186          add_surface_reloc(cmd_buffer, surface_state, constant_data);
2187          break;
2188       }
2189 
2190       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2191          /* This is always the first binding for compute shaders */
2192          assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2193 
2194          struct anv_state surface_state =
2195             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2196 
2197          const enum isl_format format =
2198             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2199                                                VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2200          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2201                                        format, ISL_SWIZZLE_IDENTITY,
2202                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2203                                        cmd_buffer->state.compute.num_workgroups,
2204                                        12, 1);
2205 
2206          assert(surface_state.map);
2207          bt_map[s] = surface_state.offset + state_offset;
2208          if (need_client_mem_relocs) {
2209             add_surface_reloc(cmd_buffer, surface_state,
2210                               cmd_buffer->state.compute.num_workgroups);
2211          }
2212          break;
2213       }
2214 
2215       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2216          /* This is a descriptor set buffer so the set index is actually
2217           * given by binding->index.  (Yes, that's confusing.)
2218           */
2219          struct anv_descriptor_set *set =
2220             pipe_state->descriptors[binding->index];
2221          assert(set->desc_mem.alloc_size);
2222          assert(set->desc_surface_state.alloc_size);
2223          bt_map[s] = set->desc_surface_state.offset + state_offset;
2224          add_surface_reloc(cmd_buffer, set->desc_surface_state,
2225                            anv_descriptor_set_address(set));
2226          break;
2227       }
2228 
2229       default: {
2230          assert(binding->set < MAX_SETS);
2231          const struct anv_descriptor_set *set =
2232             pipe_state->descriptors[binding->set];
2233          if (binding->index >= set->descriptor_count) {
2234             /* From the Vulkan spec section entitled "DescriptorSet and
2235              * Binding Assignment":
2236              *
2237              *    "If the array is runtime-sized, then array elements greater
2238              *    than or equal to the size of that binding in the bound
2239              *    descriptor set must not be used."
2240              *
2241              * Unfortunately, the compiler isn't smart enough to figure out
2242              * when a dynamic binding isn't used so it may grab the whole
2243              * array and stick it in the binding table.  In this case, it's
2244              * safe to just skip those bindings that are OOB.
2245              */
2246             assert(binding->index < set->layout->descriptor_count);
2247             continue;
2248          }
2249          const struct anv_descriptor *desc = &set->descriptors[binding->index];
2250 
2251          switch (desc->type) {
2252          case VK_DESCRIPTOR_TYPE_SAMPLER:
2253             /* Nothing for us to do here */
2254             continue;
2255 
2256          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2257          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2258          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2259             if (desc->image_view) {
2260                struct anv_surface_state sstate =
2261                   (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2262                   desc->image_view->planes[binding->plane].general_sampler_surface_state :
2263                   desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2264                surface_state = sstate.state;
2265                assert(surface_state.alloc_size);
2266                if (need_client_mem_relocs)
2267                   add_surface_state_relocs(cmd_buffer, sstate);
2268             } else {
2269                surface_state = cmd_buffer->device->null_surface_state;
2270             }
2271             break;
2272          }
2273 
2274          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2275             if (desc->image_view) {
2276                struct anv_surface_state sstate =
2277                   binding->lowered_storage_surface
2278                   ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2279                   : desc->image_view->planes[binding->plane].storage_surface_state;
2280                surface_state = sstate.state;
2281                assert(surface_state.alloc_size);
2282                if (surface_state.offset == 0) {
2283                   mesa_loge("Bound an image to a descriptor where the "
2284                             "descriptor does not have NonReadable "
2285                             "set and the image does not have a "
2286                             "corresponding SPIR-V format enum.");
2287                   vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2288                                   VK_DEBUG_REPORT_ERROR_BIT_EXT,
2289                                   &desc->image_view->vk.base,
2290                                   __LINE__, 0, "anv",
2291                                   "Bound an image to a descriptor where the "
2292                                   "descriptor does not have NonReadable "
2293                                   "set and the image does not have a "
2294                                   "corresponding SPIR-V format enum.");
2295                }
2296                if (surface_state.offset && need_client_mem_relocs)
2297                   add_surface_state_relocs(cmd_buffer, sstate);
2298             } else {
2299                surface_state = cmd_buffer->device->null_surface_state;
2300             }
2301             break;
2302          }
2303 
2304          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2305          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2306             if (desc->set_buffer_view) {
2307                surface_state = desc->set_buffer_view->surface_state;
2308                assert(surface_state.alloc_size);
2309                if (need_client_mem_relocs) {
2310                   add_surface_reloc(cmd_buffer, surface_state,
2311                                     desc->set_buffer_view->address);
2312                }
2313             } else {
2314                surface_state = cmd_buffer->device->null_surface_state;
2315             }
2316             break;
2317 
2318          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2319             if (desc->buffer_view) {
2320                surface_state = desc->buffer_view->surface_state;
2321                assert(surface_state.alloc_size);
2322                if (need_client_mem_relocs) {
2323                   add_surface_reloc(cmd_buffer, surface_state,
2324                                     desc->buffer_view->address);
2325                }
2326             } else {
2327                surface_state = cmd_buffer->device->null_surface_state;
2328             }
2329             break;
2330 
2331          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2332          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2333             if (desc->buffer) {
2334                /* Compute the offset within the buffer */
2335                uint32_t dynamic_offset =
2336                   push->dynamic_offsets[binding->dynamic_offset_index];
2337                uint64_t offset = desc->offset + dynamic_offset;
2338                /* Clamp to the buffer size */
2339                offset = MIN2(offset, desc->buffer->vk.size);
2340                /* Clamp the range to the buffer size */
2341                uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
2342 
2343                /* Align the range for consistency */
2344                if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2345                   range = align(range, ANV_UBO_ALIGNMENT);
2346 
2347                struct anv_address address =
2348                   anv_address_add(desc->buffer->address, offset);
2349 
2350                surface_state =
2351                   anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2352                enum isl_format format =
2353                   anv_isl_format_for_descriptor_type(cmd_buffer->device,
2354                                                      desc->type);
2355 
2356                isl_surf_usage_flags_t usage =
2357                   desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2358                   ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2359                   ISL_SURF_USAGE_STORAGE_BIT;
2360 
2361                anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2362                                              format, ISL_SWIZZLE_IDENTITY,
2363                                              usage, address, range, 1);
2364                if (need_client_mem_relocs)
2365                   add_surface_reloc(cmd_buffer, surface_state, address);
2366             } else {
2367                surface_state = cmd_buffer->device->null_surface_state;
2368             }
2369             break;
2370          }
2371 
2372          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2373             if (desc->buffer_view) {
2374                surface_state = binding->lowered_storage_surface
2375                   ? desc->buffer_view->lowered_storage_surface_state
2376                   : desc->buffer_view->storage_surface_state;
2377                assert(surface_state.alloc_size);
2378                if (need_client_mem_relocs) {
2379                   add_surface_reloc(cmd_buffer, surface_state,
2380                                     desc->buffer_view->address);
2381                }
2382             } else {
2383                surface_state = cmd_buffer->device->null_surface_state;
2384             }
2385             break;
2386 
2387          default:
2388             assert(!"Invalid descriptor type");
2389             continue;
2390          }
2391          assert(surface_state.map);
2392          bt_map[s] = surface_state.offset + state_offset;
2393          break;
2394       }
2395       }
2396    }
2397 
2398    return VK_SUCCESS;
2399 }
2400 
2401 static VkResult
2402 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2403               struct anv_cmd_pipeline_state *pipe_state,
2404               struct anv_shader_bin *shader,
2405               struct anv_state *state)
2406 {
2407    struct anv_pipeline_bind_map *map = &shader->bind_map;
2408    if (map->sampler_count == 0) {
2409       *state = (struct anv_state) { 0, };
2410       return VK_SUCCESS;
2411    }
2412 
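   /* Each sampler entry in the table is 16 bytes (SAMPLER_STATE is 4 DWords
    * on these gens), hence the sampler_count * 16 sizing here and the
    * 16-byte stride of the copies below.
    */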
2413    uint32_t size = map->sampler_count * 16;
2414    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2415 
2416    if (state->map == NULL)
2417       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2418 
2419    for (uint32_t s = 0; s < map->sampler_count; s++) {
2420       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2421       const struct anv_descriptor *desc =
2422          &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2423 
2424       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2425           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2426          continue;
2427 
2428       struct anv_sampler *sampler = desc->sampler;
2429 
2430       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2431        * happens to be zero.
2432        */
2433       if (sampler == NULL)
2434          continue;
2435 
2436       memcpy(state->map + (s * 16),
2437              sampler->state[binding->plane], sizeof(sampler->state[0]));
2438    }
2439 
2440    return VK_SUCCESS;
2441 }
2442 
2443 static uint32_t
2444 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2445                       struct anv_cmd_pipeline_state *pipe_state,
2446                       const VkShaderStageFlags dirty,
2447                       struct anv_shader_bin **shaders,
2448                       uint32_t num_shaders)
2449 {
2450    VkShaderStageFlags flushed = 0;
2451 
2452    VkResult result = VK_SUCCESS;
2453    for (uint32_t i = 0; i < num_shaders; i++) {
2454       if (!shaders[i])
2455          continue;
2456 
2457       gl_shader_stage stage = shaders[i]->stage;
2458       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2459       if ((vk_stage & dirty) == 0)
2460          continue;
2461 
2462       assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2463       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2464                              &cmd_buffer->state.samplers[stage]);
2465       if (result != VK_SUCCESS)
2466          break;
2467 
2468       assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2469       result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2470                                   &cmd_buffer->state.binding_tables[stage]);
2471       if (result != VK_SUCCESS)
2472          break;
2473 
2474       flushed |= vk_stage;
2475    }
2476 
2477    if (result != VK_SUCCESS) {
2478       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2479 
2480       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2481       if (result != VK_SUCCESS)
2482          return 0;
2483 
2484       /* Re-emit state base addresses so we get the new surface state base
2485        * address before we start emitting binding tables etc.
2486        */
2487       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2488 
2489       /* Re-emit all active binding tables */
2490       flushed = 0;
2491 
2492       for (uint32_t i = 0; i < num_shaders; i++) {
2493          if (!shaders[i])
2494             continue;
2495 
2496          gl_shader_stage stage = shaders[i]->stage;
2497 
2498          result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2499                                 &cmd_buffer->state.samplers[stage]);
2500          if (result != VK_SUCCESS) {
2501             anv_batch_set_error(&cmd_buffer->batch, result);
2502             return 0;
2503          }
2504          result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2505                                      &cmd_buffer->state.binding_tables[stage]);
2506          if (result != VK_SUCCESS) {
2507             anv_batch_set_error(&cmd_buffer->batch, result);
2508             return 0;
2509          }
2510 
2511          flushed |= mesa_to_vk_shader_stage(stage);
2512       }
2513    }
2514 
2515    return flushed;
2516 }
2517 
2518 static void
2519 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2520                                     uint32_t stages)
2521 {
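   /* The tables below hold the per-stage _3DCommandSubOpcode values.  The
    * packets are emitted using the *_VS layouts and the sub-opcode is
    * patched so the same template addresses HS, DS, GS and PS as well.
    */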
2522    static const uint32_t sampler_state_opcodes[] = {
2523       [MESA_SHADER_VERTEX]                      = 43,
2524       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
2525       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
2526       [MESA_SHADER_GEOMETRY]                    = 46,
2527       [MESA_SHADER_FRAGMENT]                    = 47,
2528    };
2529 
2530    static const uint32_t binding_table_opcodes[] = {
2531       [MESA_SHADER_VERTEX]                      = 38,
2532       [MESA_SHADER_TESS_CTRL]                   = 39,
2533       [MESA_SHADER_TESS_EVAL]                   = 40,
2534       [MESA_SHADER_GEOMETRY]                    = 41,
2535       [MESA_SHADER_FRAGMENT]                    = 42,
2536    };
2537 
2538    anv_foreach_stage(s, stages) {
2539       assert(s < ARRAY_SIZE(binding_table_opcodes));
2540 
2541       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2542          anv_batch_emit(&cmd_buffer->batch,
2543                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2544             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2545             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2546          }
2547       }
2548 
2549       /* Always emit binding table pointers if we're asked to, since on SKL
2550        * this is what flushes push constants. */
2551       anv_batch_emit(&cmd_buffer->batch,
2552                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
2553          btp._3DCommandSubOpcode = binding_table_opcodes[s];
2554          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
2555       }
2556    }
2557 }
2558 
2559 static struct anv_address
2560 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
2561                        const struct anv_shader_bin *shader,
2562                        const struct anv_push_range *range)
2563 {
2564    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2565    switch (range->set) {
2566    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2567       /* This is a descriptor set buffer so the set index is
2568        * actually given by binding->binding.  (Yes, that's
2569        * confusing.)
2570        */
2571       struct anv_descriptor_set *set =
2572          gfx_state->base.descriptors[range->index];
2573       return anv_descriptor_set_address(set);
2574    }
2575 
2576    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
2577       if (gfx_state->base.push_constants_state.alloc_size == 0) {
2578          gfx_state->base.push_constants_state =
2579             anv_cmd_buffer_gfx_push_constants(cmd_buffer);
2580       }
2581       return (struct anv_address) {
2582          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
2583          .offset = gfx_state->base.push_constants_state.offset,
2584       };
2585    }
2586 
2587    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2588       return (struct anv_address) {
2589          .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2590          .offset = shader->kernel.offset +
2591                    shader->prog_data->const_data_offset,
2592       };
2593 
2594    default: {
2595       assert(range->set < MAX_SETS);
2596       struct anv_descriptor_set *set =
2597          gfx_state->base.descriptors[range->set];
2598       const struct anv_descriptor *desc =
2599          &set->descriptors[range->index];
2600 
2601       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2602          if (desc->buffer_view)
2603             return desc->buffer_view->address;
2604       } else {
2605          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2606          if (desc->buffer) {
2607             const struct anv_push_constants *push =
2608                &gfx_state->base.push_constants;
2609             uint32_t dynamic_offset =
2610                push->dynamic_offsets[range->dynamic_offset_index];
2611             return anv_address_add(desc->buffer->address,
2612                                    desc->offset + dynamic_offset);
2613          }
2614       }
2615 
2616       /* For NULL UBOs, we just return an address in the workaround BO.  We do
2617        * writes to it for workarounds but always at the bottom.  The higher
2618        * bytes should be all zeros.
2619        */
2620       assert(range->length * 32 <= 2048);
2621       return (struct anv_address) {
2622          .bo = cmd_buffer->device->workaround_bo,
2623          .offset = 1024,
2624       };
2625    }
2626    }
2627 }
2628 
2629 
2630 /** Returns the size in bytes of the bound buffer
2631  *
2632  * The returned size is relative to the start of the buffer, not the start
2633  * of the range.  It may be smaller than
2634  *
2635  *    (range->start + range->length) * 32;
2636  */
2637 static uint32_t
2638 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
2639                           const struct anv_shader_bin *shader,
2640                           const struct anv_push_range *range)
2641 {
2642    assert(shader->stage != MESA_SHADER_COMPUTE);
2643    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2644    switch (range->set) {
2645    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2646       struct anv_descriptor_set *set =
2647          gfx_state->base.descriptors[range->index];
2648       assert(range->start * 32 < set->desc_mem.alloc_size);
2649       assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
2650       return set->desc_mem.alloc_size;
2651    }
2652 
2653    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
2654       return (range->start + range->length) * 32;
2655 
2656    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2657       return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
2658 
2659    default: {
2660       assert(range->set < MAX_SETS);
2661       struct anv_descriptor_set *set =
2662          gfx_state->base.descriptors[range->set];
2663       const struct anv_descriptor *desc =
2664          &set->descriptors[range->index];
2665 
2666       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2667          /* Here we promote a UBO to a binding table entry so that we can avoid
2668           * a layer of indirection.  We use the descriptor set's internally
2669           * allocated surface state to fill the binding table entry.  */
2670          if (!desc->set_buffer_view)
2671             return 0;
2672 
2673          if (range->start * 32 > desc->set_buffer_view->range)
2674             return 0;
2675 
2676          return desc->set_buffer_view->range;
2677       } else {
2678          if (!desc->buffer)
2679             return 0;
2680 
2681          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2682          /* Compute the offset within the buffer */
2683          const struct anv_push_constants *push =
2684             &gfx_state->base.push_constants;
2685          uint32_t dynamic_offset =
2686             push->dynamic_offsets[range->dynamic_offset_index];
2687          uint64_t offset = desc->offset + dynamic_offset;
2688          /* Clamp to the buffer size */
2689          offset = MIN2(offset, desc->buffer->vk.size);
2690          /* Clamp the range to the buffer size */
2691          uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
2692 
2693          /* Align the range for consistency */
2694          bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
2695 
2696          return bound_range;
2697       }
2698    }
2699    }
2700 }
2701 
2702 static void
2703 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
2704                               gl_shader_stage stage,
2705                               struct anv_address *buffers,
2706                               unsigned buffer_count)
2707 {
2708    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2709    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2710 
2711    static const uint32_t push_constant_opcodes[] = {
2712       [MESA_SHADER_VERTEX]                      = 21,
2713       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
2714       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
2715       [MESA_SHADER_GEOMETRY]                    = 22,
2716       [MESA_SHADER_FRAGMENT]                    = 23,
2717    };
2718 
2719    assert(stage < ARRAY_SIZE(push_constant_opcodes));
2720 
2721    UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
2722 
2723    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
2724       c._3DCommandSubOpcode = push_constant_opcodes[stage];
2725 
2726       /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
2727        *
2728        *    "Constant Buffer Object Control State must be always
2729        *     programmed to zero."
2730        *
2731        * This restriction does not exist on any newer platforms.
2732        *
2733        * We only have one MOCS field for the whole packet, not one per
2734        * buffer.  We could go out of our way here to walk over all of
2735        * the buffers and see if any of them are used externally and use
2736        * the external MOCS.  However, the notion that someone would use
2737        * the same bit of memory for both scanout and a UBO is nuts.
2738        *
2739        * Let's not bother and assume it's all internal.
2740        */
2741 #if GFX_VER != 8
2742       c.ConstantBody.MOCS = mocs;
2743 #endif
2744 
2745       if (anv_pipeline_has_stage(pipeline, stage)) {
2746          const struct anv_pipeline_bind_map *bind_map =
2747             &pipeline->shaders[stage]->bind_map;
2748 
2749 #if GFX_VERx10 >= 75
2750          /* The Skylake PRM contains the following restriction:
2751           *
2752           *    "The driver must ensure The following case does not occur
2753           *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
2754           *     buffer 3 read length equal to zero committed followed by a
2755           *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
2756           *     zero committed."
2757           *
2758           * To avoid this, we program the buffers in the highest slots.
2759           * This way, slot 0 is only used if slot 3 is also used.
2760           */
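         /* For example (illustrative values): with buffer_count == 2 the
          * shift below is 2, so push ranges 0 and 1 land in constant buffer
          * slots 2 and 3 while slots 0 and 1 stay unused.
          */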
2761          assert(buffer_count <= 4);
2762          const unsigned shift = 4 - buffer_count;
2763          for (unsigned i = 0; i < buffer_count; i++) {
2764             const struct anv_push_range *range = &bind_map->push_ranges[i];
2765 
2766             /* At this point we only have non-empty ranges */
2767             assert(range->length > 0);
2768 
2769             /* For Ivy Bridge, make sure we only set the first range (actual
2770              * push constants)
2771              */
2772             assert((GFX_VERx10 >= 75) || i == 0);
2773 
2774             c.ConstantBody.ReadLength[i + shift] = range->length;
2775             c.ConstantBody.Buffer[i + shift] =
2776                anv_address_add(buffers[i], range->start * 32);
2777          }
2778 #else
2779          /* For Ivy Bridge, push constants are relative to dynamic state
2780           * base address and we only ever push actual push constants.
2781           */
2782          if (bind_map->push_ranges[0].length > 0) {
2783             assert(buffer_count == 1);
2784             assert(bind_map->push_ranges[0].set ==
2785                    ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
2786             assert(buffers[0].bo ==
2787                    cmd_buffer->device->dynamic_state_pool.block_pool.bo);
2788             c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
2789             c.ConstantBody.Buffer[0].bo = NULL;
2790             c.ConstantBody.Buffer[0].offset = buffers[0].offset;
2791          }
2792          assert(bind_map->push_ranges[1].length == 0);
2793          assert(bind_map->push_ranges[2].length == 0);
2794          assert(bind_map->push_ranges[3].length == 0);
2795 #endif
2796       }
2797    }
2798 }
2799 
2800 static void
2801 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
2802                                 VkShaderStageFlags dirty_stages)
2803 {
2804    VkShaderStageFlags flushed = 0;
2805    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2806    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2807 
2808    /* Compute robust pushed register access mask for each stage. */
2809    if (cmd_buffer->device->vk.enabled_features.robustBufferAccess) {
2810       anv_foreach_stage(stage, dirty_stages) {
2811          if (!anv_pipeline_has_stage(pipeline, stage))
2812             continue;
2813 
2814          const struct anv_shader_bin *shader = pipeline->shaders[stage];
2815          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2816          struct anv_push_constants *push = &gfx_state->base.push_constants;
2817 
2818          push->push_reg_mask[stage] = 0;
2819          /* Start of the current range in the shader, relative to the start of
2820           * push constants in the shader.
2821           */
2822          unsigned range_start_reg = 0;
2823          for (unsigned i = 0; i < 4; i++) {
2824             const struct anv_push_range *range = &bind_map->push_ranges[i];
2825             if (range->length == 0)
2826                continue;
2827 
2828             unsigned bound_size =
2829                get_push_range_bound_size(cmd_buffer, shader, range);
2830             if (bound_size >= range->start * 32) {
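               /* Illustrative example: with range->start == 2,
                * range->length == 4 and a bound size of 160 bytes
                * (5 registers), bound_regs is MIN2(5 - 2, 4) == 3, so three
                * registers starting at range_start_reg are marked as backed
                * by in-bounds data.
                */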
2831                unsigned bound_regs =
2832                   MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
2833                        range->length);
2834                assert(range_start_reg + bound_regs <= 64);
2835                push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
2836                                                               bound_regs);
2837             }
2838 
2839             cmd_buffer->state.push_constants_dirty |=
2840                mesa_to_vk_shader_stage(stage);
2841 
2842             range_start_reg += range->length;
2843          }
2844       }
2845    }
2846 
2847    /* Resets the push constant state so that we allocate a new one if
2848     * needed.
2849     */
2850    gfx_state->base.push_constants_state = ANV_STATE_NULL;
2851 
2852    anv_foreach_stage(stage, dirty_stages) {
2853       unsigned buffer_count = 0;
2854       flushed |= mesa_to_vk_shader_stage(stage);
2855       UNUSED uint32_t max_push_range = 0;
2856 
2857       struct anv_address buffers[4] = {};
2858       if (anv_pipeline_has_stage(pipeline, stage)) {
2859          const struct anv_shader_bin *shader = pipeline->shaders[stage];
2860          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2861 
2862          /* We have to gather buffer addresses as a second step because the
2863           * loop above puts data into the push constant area and the call to
2864           * get_push_range_address is what locks our push constants and copies
2865           * them into the actual GPU buffer.  If we did the two loops at the
2866           * same time, we'd risk only having some of the sizes in the push
2867           * constant buffer when we did the copy.
2868           */
2869          for (unsigned i = 0; i < 4; i++) {
2870             const struct anv_push_range *range = &bind_map->push_ranges[i];
2871             if (range->length == 0)
2872                break;
2873 
2874             buffers[i] = get_push_range_address(cmd_buffer, shader, range);
2875             max_push_range = MAX2(max_push_range, range->length);
2876             buffer_count++;
2877          }
2878 
2879          /* We have at most 4 buffers but they should be tightly packed */
2880          for (unsigned i = buffer_count; i < 4; i++)
2881             assert(bind_map->push_ranges[i].length == 0);
2882       }
2883 
2884       cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
2885    }
2886 
2887    cmd_buffer->state.push_constants_dirty &= ~flushed;
2888 }
2889 
2890 static void
2891 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
2892 {
2893    const struct vk_dynamic_graphics_state *dyn =
2894       &cmd_buffer->vk.dynamic_graphics_state;
2895 
2896    if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) &&
2897        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) &&
2898 #if GFX_VER <= 7
2899        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) &&
2900        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) &&
2901 #endif
2902        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
2903       return;
2904 
2905    /* Take dynamic primitive topology into account with
2906     *    3DSTATE_CLIP::ViewportXYClipTestEnable
2907     */
2908    VkPolygonMode dynamic_raster_mode =
2909       genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
2910                                 dyn->ia.primitive_topology);
2911    bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
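   /* XY clip testing is only enabled for FILL mode here, presumably because
    * wide lines and points whose vertices fall just outside the viewport can
    * still produce fragments inside it.
    */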
2912 
2913    struct GENX(3DSTATE_CLIP) clip = {
2914       GENX(3DSTATE_CLIP_header),
2915 #if GFX_VER <= 7
2916       .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face],
2917       .CullMode     = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode],
2918 #endif
2919       .ViewportXYClipTestEnable = xy_clip_test_enable,
2920    };
2921    uint32_t dwords[GENX(3DSTATE_CLIP_length)];
2922 
2923    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2924    if (anv_pipeline_is_primitive(pipeline)) {
2925       const struct elk_vue_prog_data *last =
2926          anv_pipeline_get_last_vue_prog_data(pipeline);
2927       if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2928          clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
2929                                dyn->vp.viewport_count - 1 : 0;
2930       }
2931    }
2932 
2933    GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
2934    anv_batch_emit_merge(&cmd_buffer->batch, dwords,
2935                         pipeline->gfx7.clip);
2936 }
2937 
2938 static void
2939 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
2940 {
2941    struct anv_instance *instance = cmd_buffer->device->physical->instance;
2942    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2943    const struct vk_dynamic_graphics_state *dyn =
2944       &cmd_buffer->vk.dynamic_graphics_state;
2945    uint32_t count = dyn->vp.viewport_count;
2946    const VkViewport *viewports = dyn->vp.viewports;
2947    struct anv_state sf_clip_state =
2948       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
2949 
2950    bool negative_one_to_one =
2951       cmd_buffer->state.gfx.pipeline->negative_one_to_one;
2952 
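   /* With VK_EXT_depth_clip_control's negativeOneToOne, z is remapped as
    * z * (max - min) / 2 + (min + max) / 2, taking [-1, 1] to [min, max];
    * otherwise z * (max - min) + min takes [0, 1] to [min, max].  The
    * m22/m32 matrix elements below implement exactly this, with 'scale'
    * folding in the extra 1/2.
    */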
2953    float scale = negative_one_to_one ? 0.5f : 1.0f;
2954 
2955    for (uint32_t i = 0; i < count; i++) {
2956       const VkViewport *vp = &viewports[i];
2957 
2958       /* The gfx7 state struct has just the matrix and guardband fields, the
2959        * gfx8 struct adds the min/max viewport fields. */
2960       struct GENX(SF_CLIP_VIEWPORT) sfv = {
2961          .ViewportMatrixElementm00 = vp->width / 2,
2962          .ViewportMatrixElementm11 = vp->height / 2,
2963          .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
2964          .ViewportMatrixElementm30 = vp->x + vp->width / 2,
2965          .ViewportMatrixElementm31 = vp->y + vp->height / 2,
2966          .ViewportMatrixElementm32 = negative_one_to_one ?
2967             (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
2968          .XMinClipGuardband = -1.0f,
2969          .XMaxClipGuardband = 1.0f,
2970          .YMinClipGuardband = -1.0f,
2971          .YMaxClipGuardband = 1.0f,
2972 #if GFX_VER >= 8
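         /* MIN2/MAX2 keep YMin <= YMax even for negative-height (y-flipped)
          * viewports.
          */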
2973          .XMinViewPort = vp->x,
2974          .XMaxViewPort = vp->x + vp->width - 1,
2975          .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
2976          .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
2977 #endif
2978       };
2979 
2980       /* Fix depth test misrenderings by lowering translated depth range */
2981       if (instance->lower_depth_range_rate != 1.0f)
2982          sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
2983 
2984       const uint32_t fb_size_max = 1 << 14;
2985       uint32_t x_min = 0, x_max = fb_size_max;
2986       uint32_t y_min = 0, y_max = fb_size_max;
2987 
2988       /* If we have a valid renderArea, include that */
2989       if (gfx->render_area.extent.width > 0 &&
2990           gfx->render_area.extent.height > 0) {
2991          x_min = MAX2(x_min, gfx->render_area.offset.x);
2992          x_max = MIN2(x_max, gfx->render_area.offset.x +
2993                              gfx->render_area.extent.width);
2994          y_min = MAX2(y_min, gfx->render_area.offset.y);
2995          y_max = MIN2(y_max, gfx->render_area.offset.y +
2996                              gfx->render_area.extent.height);
2997       }
2998 
2999       /* The client is required to have enough scissors for whatever it sets
3000        * as ViewportIndex but it's possible that they've got more viewports
3001        * set from a previous command.  Also, from the Vulkan 1.3.207:
3002        *
3003        *    "The application must ensure (using scissor if necessary) that
3004        *    all rendering is contained within the render area."
3005        *
3006        * If the client doesn't set a scissor, that basically means it
3007        * guarantees everything is in-bounds already.  If we end up using a
3008        * guardband of [-1, 1] in that case, there shouldn't be much loss.
3009        * It's theoretically possible that they could do all their clipping
3010        * with clip planes but that'd be a bit odd.
3011        */
3012       if (i < dyn->vp.scissor_count) {
3013          const VkRect2D *scissor = &dyn->vp.scissors[i];
3014          x_min = MAX2(x_min, scissor->offset.x);
3015          x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
3016          y_min = MAX2(y_min, scissor->offset.y);
3017          y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
3018       }
3019 
3020       /* Only bother calculating the guardband if our known render area is
3021        * less than the maximum size.  Otherwise, it will calculate [-1, 1]
3022        * anyway but possibly with precision loss.
3023        */
3024       if (x_min > 0 || x_max < fb_size_max ||
3025           y_min > 0 || y_max < fb_size_max) {
3026          intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
3027                                         sfv.ViewportMatrixElementm00,
3028                                         sfv.ViewportMatrixElementm11,
3029                                         sfv.ViewportMatrixElementm30,
3030                                         sfv.ViewportMatrixElementm31,
3031                                         &sfv.XMinClipGuardband,
3032                                         &sfv.XMaxClipGuardband,
3033                                         &sfv.YMinClipGuardband,
3034                                         &sfv.YMaxClipGuardband);
3035       }
3036 
3037       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3038    }
3039 
3040    anv_batch_emit(&cmd_buffer->batch,
3041                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3042       clip.SFClipViewportPointer = sf_clip_state.offset;
3043    }
3044 }
3045 
3046 static void
3047 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3048                                bool depth_clamp_enable)
3049 {
3050    const struct vk_dynamic_graphics_state *dyn =
3051       &cmd_buffer->vk.dynamic_graphics_state;
3052    uint32_t count = dyn->vp.viewport_count;
3053    const VkViewport *viewports = dyn->vp.viewports;
3054    struct anv_state cc_state =
3055       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3056 
3057    for (uint32_t i = 0; i < count; i++) {
3058       const VkViewport *vp = &viewports[i];
3059 
3060       /* From the Vulkan spec:
3061        *
3062        *    "It is valid for minDepth to be greater than or equal to
3063        *    maxDepth."
3064        */
3065       float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3066       float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3067 
3068       struct GENX(CC_VIEWPORT) cc_viewport = {
3069          .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3070          .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3071       };
3072 
3073       GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3074    }
3075 
3076    anv_batch_emit(&cmd_buffer->batch,
3077                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3078       cc.CCViewportPointer = cc_state.offset;
3079    }
3080 }
3081 
3082 static void
3083 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3084 {
3085    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3086    const struct vk_dynamic_graphics_state *dyn =
3087       &cmd_buffer->vk.dynamic_graphics_state;
3088    uint32_t count = dyn->vp.scissor_count;
3089    const VkRect2D *scissors = dyn->vp.scissors;
3090    const VkViewport *viewports = dyn->vp.viewports;
3091 
3092    /* Wa_1409725701:
3093     *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3094     *    stored as an array of up to 16 elements. The location of first
3095     *    element of the array, as specified by Pointer to SCISSOR_RECT, should
3096        *    be aligned to a 64-byte boundary."
3097     */
3098    uint32_t alignment = 64;
3099    struct anv_state scissor_state =
3100       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3101 
3102    for (uint32_t i = 0; i < count; i++) {
3103       const VkRect2D *s = &scissors[i];
3104       const VkViewport *vp = &viewports[i];
3105 
3106       /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3107        * ymax < ymin for empty clips.  In case clip x, y, width, height are all
3108        * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3109        * what we want. Just special case empty clips and produce a canonical
3110        * empty clip. */
3111       static const struct GENX(SCISSOR_RECT) empty_scissor = {
3112          .ScissorRectangleYMin = 1,
3113          .ScissorRectangleXMin = 1,
3114          .ScissorRectangleYMax = 0,
3115          .ScissorRectangleXMax = 0
3116       };
3117 
3118       const int max = 0xffff;
3119 
3120       uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3121       uint32_t x_min = MAX2(s->offset.x, vp->x);
3122       int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3123                        MAX2(vp->y, vp->y + vp->height) - 1);
3124       int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3125                        vp->x + vp->width - 1);
3126 
3127       y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
3128       x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
3129 
3130       /* Do this math using int64_t so overflow gets clamped correctly. */
3131       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3132          y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
3133          x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
3134          y_max = CLAMP((uint64_t) y_max, 0,
3135                              gfx->render_area.offset.y +
3136                              gfx->render_area.extent.height - 1);
3137          x_max = CLAMP((uint64_t) x_max, 0,
3138                              gfx->render_area.offset.x +
3139                              gfx->render_area.extent.width - 1);
3140       }
3141 
3142       struct GENX(SCISSOR_RECT) scissor = {
3143          .ScissorRectangleYMin = y_min,
3144          .ScissorRectangleXMin = x_min,
3145          .ScissorRectangleYMax = y_max,
3146          .ScissorRectangleXMax = x_max
3147       };
3148 
3149       if (s->extent.width <= 0 || s->extent.height <= 0) {
3150          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3151                                  &empty_scissor);
3152       } else {
3153          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3154       }
3155    }
3156 
3157    anv_batch_emit(&cmd_buffer->batch,
3158                   GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3159       ssp.ScissorRectPointer = scissor_state.offset;
3160    }
3161 }
3162 
3163 static void
3164 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3165 {
3166    const struct vk_dynamic_graphics_state *dyn =
3167       &cmd_buffer->vk.dynamic_graphics_state;
3168    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3169 
3170 #if GFX_VER == 7
3171 #  define streamout_state_dw pipeline->gfx7.streamout_state
3172 #else
3173 #  define streamout_state_dw pipeline->gfx8.streamout_state
3174 #endif
3175 
3176    uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3177 
3178    struct GENX(3DSTATE_STREAMOUT) so = {
3179       GENX(3DSTATE_STREAMOUT_header),
3180       .RenderingDisable = dyn->rs.rasterizer_discard_enable,
3181    };
3182    GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3183    anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3184 }
3185 
3186 ALWAYS_INLINE static void
3187 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
3188 {
3189    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3190    const struct vk_dynamic_graphics_state *dyn =
3191       &cmd_buffer->vk.dynamic_graphics_state;
3192    uint32_t *p;
3193 
3194    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3195 
3196    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3197 
3198    genX(flush_pipeline_select_3d)(cmd_buffer);
3199 
3200    /* Apply any pending pipeline flushes we may have.  We want to apply them
3201     * now because, if any of those flushes are for things like push constants,
3202     * the GPU will read the state at weird times.
3203     */
3204    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3205 
3206    uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3207    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3208       vb_emit |= pipeline->vb_used;
3209 
3210    if (vb_emit) {
3211       const uint32_t num_buffers = __builtin_popcount(vb_emit);
3212       const uint32_t num_dwords = 1 + num_buffers * 4;
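      /* 3DSTATE_VERTEX_BUFFERS is one header DWord followed by four DWords
       * of VERTEX_BUFFER_STATE per buffer, which is why each state is packed
       * at p[1 + i * 4] below.
       */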
3213 
3214       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3215                           GENX(3DSTATE_VERTEX_BUFFERS));
3216       uint32_t i = 0;
3217       u_foreach_bit(vb, vb_emit) {
3218          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3219          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3220 
3221          struct GENX(VERTEX_BUFFER_STATE) state;
3222          if (buffer) {
3223             uint32_t stride = dyn->vi_binding_strides[vb];
3224             UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
3225 
3226 #if GFX_VER <= 7
3227             bool per_instance = pipeline->vb[vb].instanced;
3228             uint32_t divisor = pipeline->vb[vb].instance_divisor *
3229                                pipeline->instance_multiplier;
3230 #endif
3231 
3232             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3233                .VertexBufferIndex = vb,
3234 
3235                .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3236                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3237 #if GFX_VER <= 7
3238                .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA,
3239                .InstanceDataStepRate = per_instance ? divisor : 1,
3240 #endif
3241                .AddressModifyEnable = true,
3242                .BufferPitch = stride,
3243                .BufferStartingAddress = anv_address_add(buffer->address, offset),
3244                .NullVertexBuffer = offset >= buffer->vk.size,
3245 
3246 #if GFX_VER >= 8
3247                .BufferSize = size,
3248 #else
3249                /* XXX: to handle dynamic offset for older gens we might want
3250                 * to modify Endaddress, but there are issues when doing so:
3251                 *
3252                 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3253                 */
3254                .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1),
3255 #endif
3256             };
3257          } else {
3258             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3259                .VertexBufferIndex = vb,
3260                .NullVertexBuffer = true,
3261                .MOCS = anv_mocs(cmd_buffer->device, NULL,
3262                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3263             };
3264          }
3265 
3266 #if GFX_VER == 8
3267          genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3268                                                         state.BufferStartingAddress,
3269                                                         state.BufferSize);
3270 #endif
3271 
3272          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3273          i++;
3274       }
3275    }
3276 
3277    cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3278 
3279    uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3280                                 pipeline->active_stages;
3281    if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3282        !vk_dynamic_graphics_state_any_dirty(dyn) &&
3283        !cmd_buffer->state.push_constants_dirty)
3284       return;
3285 
3286    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3287        (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3288                          ANV_CMD_DIRTY_PIPELINE))) {
3289       /* Wa_16011411144:
3290        *
3291        * SW must insert a PIPE_CONTROL cmd before and after the
3292        * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3293        * state is not combined with other state changes.
3294        */
3295       if (intel_device_info_is_dg2(cmd_buffer->device->info)) {
3296          anv_add_pending_pipe_bits(cmd_buffer,
3297                                    ANV_PIPE_CS_STALL_BIT,
3298                                    "before SO_BUFFER change WA");
3299          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3300       }
3301 
3302       /* We don't need any per-buffer dirty tracking because you're not
3303        * allowed to bind different XFB buffers while XFB is enabled.
3304        */
3305       for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3306          struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3307          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3308             sob.SOBufferIndex = idx;
3309 
3310             if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3311                sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
3312                                    ISL_SURF_USAGE_STREAM_OUT_BIT);
3313                sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3314                                                         xfb->offset);
3315 #if GFX_VER >= 8
3316                sob.SOBufferEnable = true;
3317                sob.StreamOffsetWriteEnable = false;
3318                /* Size is in DWords - 1 */
3319                sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3320 #else
3321                /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3322                 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3323                 * default for an empty SO_BUFFER packet) to disable them.
3324                 */
3325                sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3326                sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3327                                                        xfb->offset + xfb->size);
3328 #endif
3329             } else {
3330                sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3331             }
3332          }
3333       }
3334    }
3335 
3336    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3337       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3338 
3339       /* If the pipeline changed, we may need to re-allocate push constant
3340        * space in the URB.
3341        */
3342       cmd_buffer_alloc_push_constants(cmd_buffer);
3343    }
3344 
3345 #if GFX_VER <= 7
3346    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3347        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3348       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3349        *
3350        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3351        *    stall needs to be sent just prior to any 3DSTATE_VS,
3352        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3353        *    3DSTATE_BINDING_TABLE_POINTER_VS,
3354        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
3355        *    PIPE_CONTROL needs to be sent before any combination of VS
3356        *    associated 3DSTATE."
3357        */
3358       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3359          pc.DepthStallEnable  = true;
3360          pc.PostSyncOperation = WriteImmediateData;
3361          pc.Address           = cmd_buffer->device->workaround_address;
3362          anv_debug_dump_pc(pc);
3363       }
3364    }
3365 #endif
3366 
3367    /* Render targets live in the same binding table as fragment descriptors */
3368    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3369       descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
3370 
3371    /* We emit the binding tables and sampler tables first, then emit push
3372     * constants and then finally emit binding table and sampler table
3373     * pointers.  It has to happen in this order, since emitting the binding
3374     * tables may change the push constants (in case of storage images). After
3375     * emitting push constants, on SKL+ we have to emit the corresponding
3376     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
3377     */
3378    uint32_t dirty = 0;
3379    if (descriptors_dirty) {
3380       dirty = flush_descriptor_sets(cmd_buffer,
3381                                     &cmd_buffer->state.gfx.base,
3382                                     descriptors_dirty,
3383                                     pipeline->shaders,
3384                                     ARRAY_SIZE(pipeline->shaders));
3385       cmd_buffer->state.descriptors_dirty &= ~dirty;
3386    }
3387 
3388    if (dirty || cmd_buffer->state.push_constants_dirty) {
3389       /* Because we're pushing UBOs, we have to push whenever either
3390        * descriptors or push constants is dirty.
3391        */
3392       dirty |= cmd_buffer->state.push_constants_dirty;
3393       cmd_buffer_flush_push_constants(cmd_buffer,
3394                                       dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3395    }
3396 
3397    if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
3398       cmd_buffer_emit_descriptor_pointers(cmd_buffer,
3399                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3400    }
3401 
3402    cmd_buffer_emit_clip(cmd_buffer);
3403 
3404    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3405                                        ANV_CMD_DIRTY_XFB_ENABLE)) ||
3406        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
3407       cmd_buffer_emit_streamout(cmd_buffer);
3408 
3409    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3410                                        ANV_CMD_DIRTY_RENDER_TARGETS)) ||
3411        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
3412        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
3413       cmd_buffer_emit_viewport(cmd_buffer);
3414       cmd_buffer_emit_depth_viewport(cmd_buffer,
3415                                      pipeline->depth_clamp_enable);
3416       cmd_buffer_emit_scissor(cmd_buffer);
3417    }
3418 
3419    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
3420        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
3421       uint32_t topology;
3422       if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
3423          topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points);
3424       else
3425          topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
3426 
3427       cmd_buffer->state.gfx.primitive_topology = topology;
3428 
3429 #if (GFX_VER >= 8)
3430       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
3431          vft.PrimitiveTopologyType = topology;
3432       }
3433 #endif
3434    }
3435 
3436    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
3437 }
3438 
3439 static void
3440 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
3441                struct anv_address addr,
3442                uint32_t size, uint32_t index)
3443 {
3444    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
3445                                  GENX(3DSTATE_VERTEX_BUFFERS));
3446 
3447    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
3448       &(struct GENX(VERTEX_BUFFER_STATE)) {
3449          .VertexBufferIndex = index,
3450          .AddressModifyEnable = true,
3451          .BufferPitch = 0,
3452          .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
3453                           ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3454          .NullVertexBuffer = size == 0,
3455 #if (GFX_VER >= 8)
3456          .BufferStartingAddress = addr,
3457          .BufferSize = size
3458 #else
3459          .BufferStartingAddress = addr,
3460          .EndAddress = anv_address_add(addr, size),
3461 #endif
3462       });
3463 
3464    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
3465                                                   index, addr, size);
3466 }
3467 
3468 static void
3469 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
3470                              struct anv_address addr)
3471 {
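   /* The internal base-vertex/base-instance vertex buffer holds two DWords
    * (written by emit_base_vertex_instance below), hence the 8-byte size; a
    * NULL address gets size 0 and therefore a null vertex buffer.
    */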
3472    emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
3473 }
3474 
3475 static void
3476 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
3477                           uint32_t base_vertex, uint32_t base_instance)
3478 {
3479    if (base_vertex == 0 && base_instance == 0) {
3480       emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
3481    } else {
3482       struct anv_state id_state =
3483          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
3484 
3485       ((uint32_t *)id_state.map)[0] = base_vertex;
3486       ((uint32_t *)id_state.map)[1] = base_instance;
3487 
3488       struct anv_address addr = {
3489          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3490          .offset = id_state.offset,
3491       };
3492 
3493       emit_base_vertex_instance_bo(cmd_buffer, addr);
3494    }
3495 }
3496 
3497 static void
3498 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
3499 {
3500    struct anv_state state =
3501       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
3502 
3503    ((uint32_t *)state.map)[0] = draw_index;
3504 
3505    struct anv_address addr = {
3506       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3507       .offset = state.offset,
3508    };
3509 
3510    emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
3511 }
3512 
3513 static void
3514 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
3515                                    uint32_t access_type)
3516 {
3517    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3518    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3519 
3520    uint64_t vb_used = pipeline->vb_used;
3521    if (vs_prog_data->uses_firstvertex ||
3522        vs_prog_data->uses_baseinstance)
3523       vb_used |= 1ull << ANV_SVGS_VB_INDEX;
3524    if (vs_prog_data->uses_drawid)
3525       vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
3526 
3527    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
3528                                                        access_type == RANDOM,
3529                                                        vb_used);
3530 }
3531 
3532 ALWAYS_INLINE static void
3533 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
3534                                            const struct elk_vs_prog_data *vs_prog_data,
3535                                            uint32_t base_vertex,
3536                                            uint32_t base_instance,
3537                                            uint32_t draw_id,
3538                                            bool force_flush)
3539 {
3540    bool emitted = false;
3541    if (vs_prog_data->uses_firstvertex ||
3542        vs_prog_data->uses_baseinstance) {
3543       emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
3544       emitted = true;
3545    }
3546    if (vs_prog_data->uses_drawid) {
3547       emit_draw_index(cmd_buffer, draw_id);
3548       emitted = true;
3549    }
3550    /* Emitting draw index or vertex index BOs may result in needing
3551     * additional VF cache flushes.
3552     */
3553    if (emitted || force_flush)
3554       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3555 }
3556 
3557 void genX(CmdDraw)(
3558     VkCommandBuffer                             commandBuffer,
3559     uint32_t                                    vertexCount,
3560     uint32_t                                    instanceCount,
3561     uint32_t                                    firstVertex,
3562     uint32_t                                    firstInstance)
3563 {
3564    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3565    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3566    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3567 
3568    if (anv_batch_has_error(&cmd_buffer->batch))
3569       return;
3570 
3571    const uint32_t count =
3572       vertexCount * instanceCount * pipeline->instance_multiplier;
3573    anv_measure_snapshot(cmd_buffer,
3574                         INTEL_SNAPSHOT_DRAW,
3575                         "draw", count);
3576    trace_intel_begin_draw(&cmd_buffer->trace);
3577 
3578    /* Select pipeline here to allow
3579     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3580     * cmd_buffer_flush_gfx_state().
3581     */
3582    genX(flush_pipeline_select_3d)(cmd_buffer);
3583 
3584    if (cmd_buffer->state.conditional_render_enabled)
3585       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3586 
3587    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3588                                               firstVertex, firstInstance, 0,
3589                                               false /* force_flush */);
3590 
3591    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3592 
3593    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3594       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3595       prim.VertexAccessType         = SEQUENTIAL;
3596       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3597       prim.VertexCountPerInstance   = vertexCount;
3598       prim.StartVertexLocation      = firstVertex;
3599       prim.InstanceCount            = instanceCount *
3600                                       pipeline->instance_multiplier;
3601       prim.StartInstanceLocation    = firstInstance;
3602       prim.BaseVertexLocation       = 0;
3603    }
3604 
3605    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3606 
3607    trace_intel_end_draw(&cmd_buffer->trace, count);
3608 }
3609 
3610 void genX(CmdDrawMultiEXT)(
3611     VkCommandBuffer                             commandBuffer,
3612     uint32_t                                    drawCount,
3613     const VkMultiDrawInfoEXT                   *pVertexInfo,
3614     uint32_t                                    instanceCount,
3615     uint32_t                                    firstInstance,
3616     uint32_t                                    stride)
3617 {
3618    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3619    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3620    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3621 
3622    if (anv_batch_has_error(&cmd_buffer->batch))
3623       return;
3624 
3625    const uint32_t count =
3626       drawCount * instanceCount * pipeline->instance_multiplier;
3627    anv_measure_snapshot(cmd_buffer,
3628                         INTEL_SNAPSHOT_DRAW,
3629                         "draw_multi", count);
3630    trace_intel_begin_draw_multi(&cmd_buffer->trace);
3631 
3632    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3633 
3634    if (cmd_buffer->state.conditional_render_enabled)
3635       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3636 
3637    uint32_t i = 0;
3638    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
3639       cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3640                                                  draw->firstVertex,
3641                                                  firstInstance, i, !i);
3642 
3643       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3644          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3645          prim.VertexAccessType         = SEQUENTIAL;
3646          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3647          prim.VertexCountPerInstance   = draw->vertexCount;
3648          prim.StartVertexLocation      = draw->firstVertex;
3649          prim.InstanceCount            = instanceCount *
3650                                          pipeline->instance_multiplier;
3651          prim.StartInstanceLocation    = firstInstance;
3652          prim.BaseVertexLocation       = 0;
3653       }
3654    }
3655 
3656    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3657 
3658    trace_intel_end_draw_multi(&cmd_buffer->trace, count);
3659 }
3660 
3661 void genX(CmdDrawIndexed)(
3662     VkCommandBuffer                             commandBuffer,
3663     uint32_t                                    indexCount,
3664     uint32_t                                    instanceCount,
3665     uint32_t                                    firstIndex,
3666     int32_t                                     vertexOffset,
3667     uint32_t                                    firstInstance)
3668 {
3669    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3670    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3671    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3672 
3673    if (anv_batch_has_error(&cmd_buffer->batch))
3674       return;
3675 
3676    const uint32_t count =
3677       indexCount * instanceCount * pipeline->instance_multiplier;
3678    anv_measure_snapshot(cmd_buffer,
3679                         INTEL_SNAPSHOT_DRAW,
3680                         "draw indexed",
3681                         count);
3682    trace_intel_begin_draw_indexed(&cmd_buffer->trace);
3683 
3684    /* Select pipeline here to allow
3685     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3686     * cmd_buffer_flush_gfx_state().
3687     */
3688    genX(flush_pipeline_select_3d)(cmd_buffer);
3689 
3690    if (cmd_buffer->state.conditional_render_enabled)
3691       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3692 
3693    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3694                                               vertexOffset, firstInstance,
3695                                               0, false /* force_flush */);
3696 
3697    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3698 
3699    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3700       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3701       prim.VertexAccessType         = RANDOM;
3702       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3703       prim.VertexCountPerInstance   = indexCount;
3704       prim.StartVertexLocation      = firstIndex;
3705       prim.InstanceCount            = instanceCount *
3706                                       pipeline->instance_multiplier;
3707       prim.StartInstanceLocation    = firstInstance;
3708       prim.BaseVertexLocation       = vertexOffset;
3709    }
3710 
3711    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3712 
3713    trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
3714 }
3715 
3716 void genX(CmdDrawMultiIndexedEXT)(
3717     VkCommandBuffer                             commandBuffer,
3718     uint32_t                                    drawCount,
3719     const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
3720     uint32_t                                    instanceCount,
3721     uint32_t                                    firstInstance,
3722     uint32_t                                    stride,
3723     const int32_t                              *pVertexOffset)
3724 {
3725    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3726    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3727    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3728 
3729    if (anv_batch_has_error(&cmd_buffer->batch))
3730       return;
3731 
3732    const uint32_t count =
3733       drawCount * instanceCount * pipeline->instance_multiplier;
3734    anv_measure_snapshot(cmd_buffer,
3735                         INTEL_SNAPSHOT_DRAW,
3736                         "draw indexed_multi",
3737                         count);
3738    trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
3739 
3740    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3741 
3742    if (cmd_buffer->state.conditional_render_enabled)
3743       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3744 
3745    uint32_t i = 0;
3746    if (pVertexOffset) {
3747       if (vs_prog_data->uses_drawid) {
3748          bool emitted = true;
3749          if (vs_prog_data->uses_firstvertex ||
3750              vs_prog_data->uses_baseinstance) {
3751             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3752             emitted = true;
3753          }
3754          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3755             if (vs_prog_data->uses_drawid) {
3756                emit_draw_index(cmd_buffer, i);
3757                emitted = true;
3758             }
3759             /* Emitting draw index or vertex index BOs may result in needing
3760              * additional VF cache flushes.
3761              */
3762             if (emitted)
3763                genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3764 
3765             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3766                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3767                prim.VertexAccessType         = RANDOM;
3768                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3769                prim.VertexCountPerInstance   = draw->indexCount;
3770                prim.StartVertexLocation      = draw->firstIndex;
3771                prim.InstanceCount            = instanceCount *
3772                                                pipeline->instance_multiplier;
3773                prim.StartInstanceLocation    = firstInstance;
3774                prim.BaseVertexLocation       = *pVertexOffset;
3775             }
3776             emitted = false;
3777          }
3778       } else {
3779          if (vs_prog_data->uses_firstvertex ||
3780              vs_prog_data->uses_baseinstance) {
3781             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3782             /* Emitting draw index or vertex index BOs may result in needing
3783              * additional VF cache flushes.
3784              */
3785             genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3786          }
3787          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3788             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3789                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3790                prim.VertexAccessType         = RANDOM;
3791                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3792                prim.VertexCountPerInstance   = draw->indexCount;
3793                prim.StartVertexLocation      = draw->firstIndex;
3794                prim.InstanceCount            = instanceCount *
3795                                                pipeline->instance_multiplier;
3796                prim.StartInstanceLocation    = firstInstance;
3797                prim.BaseVertexLocation       = *pVertexOffset;
3798             }
3799          }
3800       }
3801    } else {
3802       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3803          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3804                                                     draw->vertexOffset,
3805                                                     firstInstance, i, i != 0);
3806 
3807          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3808             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3809             prim.VertexAccessType         = RANDOM;
3810             prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3811             prim.VertexCountPerInstance   = draw->indexCount;
3812             prim.StartVertexLocation      = draw->firstIndex;
3813             prim.InstanceCount            = instanceCount *
3814                                             pipeline->instance_multiplier;
3815             prim.StartInstanceLocation    = firstInstance;
3816             prim.BaseVertexLocation       = draw->vertexOffset;
3817          }
3818       }
3819    }
3820 
3821    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3822 
3823    trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
3824 }
3825 
3826 /* Auto-Draw / Indirect Registers */
3827 #define GFX7_3DPRIM_END_OFFSET          0x2420
3828 #define GFX7_3DPRIM_START_VERTEX        0x2430
3829 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
3830 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
3831 #define GFX7_3DPRIM_START_INSTANCE      0x243C
3832 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
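
/* These are the MMIO registers from which 3DPRIMITIVE pulls its draw
 * parameters when IndirectParameterEnable is set.  The indirect draw paths
 * below fill them with MI commands (via the MI builder) instead of baking
 * the values into the 3DPRIMITIVE packet itself.
 */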
3833 
3834 void genX(CmdDrawIndirectByteCountEXT)(
3835     VkCommandBuffer                             commandBuffer,
3836     uint32_t                                    instanceCount,
3837     uint32_t                                    firstInstance,
3838     VkBuffer                                    counterBuffer,
3839     VkDeviceSize                                counterBufferOffset,
3840     uint32_t                                    counterOffset,
3841     uint32_t                                    vertexStride)
3842 {
3843 #if GFX_VERx10 >= 75
3844    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3845    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
3846    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3847    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3848 
3849    /* firstVertex is always zero for this draw function */
3850    const uint32_t firstVertex = 0;
3851 
3852    if (anv_batch_has_error(&cmd_buffer->batch))
3853       return;
3854 
3855    anv_measure_snapshot(cmd_buffer,
3856                         INTEL_SNAPSHOT_DRAW,
3857                         "draw indirect byte count",
3858                         instanceCount * pipeline->instance_multiplier);
3859    trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
3860 
3861    /* Select pipeline here to allow
3862     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3863     * emit_base_vertex_instance() & emit_draw_index().
3864     */
3865    genX(flush_pipeline_select_3d)(cmd_buffer);
3866 
3867    if (cmd_buffer->state.conditional_render_enabled)
3868       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3869 
3870    if (vs_prog_data->uses_firstvertex ||
3871        vs_prog_data->uses_baseinstance)
3872       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
3873    if (vs_prog_data->uses_drawid)
3874       emit_draw_index(cmd_buffer, 0);
3875 
3876    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3877 
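   /* Derive the vertex count on the GPU from the transform feedback byte
    * counter: vertexCount = (counter - counterOffset) / vertexStride, and
    * feed it to 3DPRIMITIVE through the indirect draw registers above.
    */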
3878    struct mi_builder b;
3879    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3880    struct mi_value count =
3881       mi_mem32(anv_address_add(counter_buffer->address,
3882                                    counterBufferOffset));
3883    if (counterOffset)
3884       count = mi_isub(&b, count, mi_imm(counterOffset));
3885    count = mi_udiv32_imm(&b, count, vertexStride);
3886    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
3887 
3888    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
3889    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
3890             mi_imm(instanceCount * pipeline->instance_multiplier));
3891    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
3892    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3893 
3894    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3895       prim.IndirectParameterEnable  = true;
3896       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3897       prim.VertexAccessType         = SEQUENTIAL;
3898       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3899    }
3900 
3901    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3902 
3903    trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
3904       instanceCount * pipeline->instance_multiplier);
3905 #endif /* GFX_VERx10 >= 75 */
3906 }
3907 
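/* The DWord offsets read below follow the Vulkan indirect command layouts:
 *
 *    VkDrawIndirectCommand        { vertexCount, instanceCount,
 *                                   firstVertex, firstInstance }
 *    VkDrawIndexedIndirectCommand { indexCount, instanceCount, firstIndex,
 *                                   vertexOffset, firstInstance }
 *
 * i.e. offset 0 holds the vertex/index count, 4 the instance count, 8 the
 * first vertex/index, and 12 (plus 16 in the indexed case) the base vertex
 * and first instance.
 */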
3908 static void
3909 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
3910                          struct anv_address addr,
3911                          bool indexed)
3912 {
3913    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3914 
3915    struct mi_builder b;
3916    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3917 
3918    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
3919                 mi_mem32(anv_address_add(addr, 0)));
3920 
3921    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
3922    if (pipeline->instance_multiplier > 1) {
3923 #if GFX_VERx10 >= 75
3924       instance_count = mi_imul_imm(&b, instance_count,
3925                                    pipeline->instance_multiplier);
3926 #else
3927       anv_finishme("Multiview + indirect draw requires MI_MATH; "
3928                    "MI_MATH is not supported on Ivy Bridge");
3929 #endif
3930    }
3931    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
3932 
3933    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
3934                 mi_mem32(anv_address_add(addr, 8)));
3935 
3936    if (indexed) {
3937       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
3938                    mi_mem32(anv_address_add(addr, 12)));
3939       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3940                    mi_mem32(anv_address_add(addr, 16)));
3941    } else {
3942       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3943                    mi_mem32(anv_address_add(addr, 12)));
3944       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3945    }
3946 }
3947 
3948 void genX(CmdDrawIndirect)(
3949     VkCommandBuffer                             commandBuffer,
3950     VkBuffer                                    _buffer,
3951     VkDeviceSize                                offset,
3952     uint32_t                                    drawCount,
3953     uint32_t                                    stride)
3954 {
3955    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3956    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
3957    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3958    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3959 
3960    if (anv_batch_has_error(&cmd_buffer->batch))
3961       return;
3962 
3963    anv_measure_snapshot(cmd_buffer,
3964                         INTEL_SNAPSHOT_DRAW,
3965                         "draw indirect",
3966                         drawCount);
3967    trace_intel_begin_draw_indirect(&cmd_buffer->trace);
3968 
3969    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3970 
3971    if (cmd_buffer->state.conditional_render_enabled)
3972       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3973 
3974    for (uint32_t i = 0; i < drawCount; i++) {
3975       struct anv_address draw = anv_address_add(buffer->address, offset);
3976 
3977       if (vs_prog_data->uses_firstvertex ||
3978           vs_prog_data->uses_baseinstance)
3979          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
3980       if (vs_prog_data->uses_drawid)
3981          emit_draw_index(cmd_buffer, i);
3982 
3983       /* Emitting draw index or vertex index BOs may result in needing
3984        * additional VF cache flushes.
3985        */
3986       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3987 
3988       load_indirect_parameters(cmd_buffer, draw, false);
3989 
3990       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3991          prim.IndirectParameterEnable  = true;
3992          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3993          prim.VertexAccessType         = SEQUENTIAL;
3994          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3995       }
3996 
3997       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3998 
3999       offset += stride;
4000    }
4001 
4002    trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
4003 }
4004 
4005 void genX(CmdDrawIndexedIndirect)(
4006     VkCommandBuffer                             commandBuffer,
4007     VkBuffer                                    _buffer,
4008     VkDeviceSize                                offset,
4009     uint32_t                                    drawCount,
4010     uint32_t                                    stride)
4011 {
4012    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4013    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4014    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4015    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4016 
4017    if (anv_batch_has_error(&cmd_buffer->batch))
4018       return;
4019 
4020    anv_measure_snapshot(cmd_buffer,
4021                         INTEL_SNAPSHOT_DRAW,
4022                         "draw indexed indirect",
4023                         drawCount);
4024    trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
4025 
4026    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4027 
4028    if (cmd_buffer->state.conditional_render_enabled)
4029       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4030 
4031    for (uint32_t i = 0; i < drawCount; i++) {
4032       struct anv_address draw = anv_address_add(buffer->address, offset);
4033 
4034       /* TODO: We need to stomp base vertex to 0 somehow */
4035       if (vs_prog_data->uses_firstvertex ||
4036           vs_prog_data->uses_baseinstance)
4037          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4038       if (vs_prog_data->uses_drawid)
4039          emit_draw_index(cmd_buffer, i);
4040 
4041       /* Emitting draw index or vertex index BOs may result in needing
4042        * additional VF cache flushes.
4043        */
4044       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4045 
4046       load_indirect_parameters(cmd_buffer, draw, true);
4047 
4048       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4049          prim.IndirectParameterEnable  = true;
4050          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4051          prim.VertexAccessType         = RANDOM;
4052          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4053       }
4054 
4055       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4056 
4057       offset += stride;
4058    }
4059 
4060    trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
4061 }
4062 
4063 static struct mi_value
4064 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4065                                  struct mi_builder *b,
4066                                  struct anv_address count_address)
4067 {
4068    struct mi_value ret = mi_imm(0);
4069 
4070    if (cmd_buffer->state.conditional_render_enabled) {
4071 #if GFX_VERx10 >= 75
4072       ret = mi_new_gpr(b);
4073       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4074 #endif
4075    } else {
4076       /* Upload the current draw count from the draw parameters buffer to
4077        * MI_PREDICATE_SRC0.
4078        */
4079       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
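      /* Only the high DWord of MI_PREDICATE_SRC1 is cleared here; the low
       * DWord is re-loaded with the current draw index by
       * emit_draw_count_predicate() before each 3DPRIMITIVE.
       */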
4080       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
4081    }
4082 
4083    return ret;
4084 }
4085 
4086 static void
4087 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4088                           struct mi_builder *b,
4089                           uint32_t draw_index)
4090 {
4091    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4092    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4093 
4094    if (draw_index == 0) {
4095       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4096          mip.LoadOperation    = LOAD_LOADINV;
4097          mip.CombineOperation = COMBINE_SET;
4098          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4099       }
4100    } else {
4101       /* While draw_index < draw_count the predicate's result will be
4102        *  (draw_index == draw_count) ^ TRUE = TRUE
4103        * When draw_index == draw_count the result is
4104        *  (TRUE) ^ TRUE = FALSE
4105        * After this all results will be:
4106        *  (FALSE) ^ FALSE = FALSE
4107        */
4108       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4109          mip.LoadOperation    = LOAD_LOAD;
4110          mip.CombineOperation = COMBINE_XOR;
4111          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4112       }
4113    }
4114 }
4115 
4116 #if GFX_VERx10 >= 75
4117 static void
4118 emit_draw_count_predicate_with_conditional_render(
4119                           struct anv_cmd_buffer *cmd_buffer,
4120                           struct mi_builder *b,
4121                           uint32_t draw_index,
4122                           struct mi_value max)
4123 {
4124    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4125    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
4126 
4127 #if GFX_VER >= 8
4128    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4129 #else
4130    /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
4131     * so we emit MI_PREDICATE to set it.
4132     */
4133 
4134    mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4135    mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4136 
4137    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4138       mip.LoadOperation    = LOAD_LOADINV;
4139       mip.CombineOperation = COMBINE_SET;
4140       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4141    }
4142 #endif
4143 }
4144 #endif
4145 
4146 static void
4147 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4148                                struct mi_builder *b,
4149                                uint32_t draw_index,
4150                                struct mi_value max)
4151 {
4152 #if GFX_VERx10 >= 75
4153    if (cmd_buffer->state.conditional_render_enabled) {
4154       emit_draw_count_predicate_with_conditional_render(
4155             cmd_buffer, b, draw_index, mi_value_ref(b, max));
4156    } else {
4157       emit_draw_count_predicate(cmd_buffer, b, draw_index);
4158    }
4159 #else
4160    emit_draw_count_predicate(cmd_buffer, b, draw_index);
4161 #endif
4162 }
4163 
4164 void genX(CmdDrawIndirectCount)(
4165     VkCommandBuffer                             commandBuffer,
4166     VkBuffer                                    _buffer,
4167     VkDeviceSize                                offset,
4168     VkBuffer                                    _countBuffer,
4169     VkDeviceSize                                countBufferOffset,
4170     uint32_t                                    maxDrawCount,
4171     uint32_t                                    stride)
4172 {
4173    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4174    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4175    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4176    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4177    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4178    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4179 
4180    if (anv_batch_has_error(&cmd_buffer->batch))
4181       return;
4182 
4183    anv_measure_snapshot(cmd_buffer,
4184                         INTEL_SNAPSHOT_DRAW,
4185                         "draw indirect count",
4186                         0);
4187    trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
4188 
4189    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4190 
4191    struct mi_builder b;
4192    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4193    struct anv_address count_address =
4194       anv_address_add(count_buffer->address, countBufferOffset);
4195    struct mi_value max =
4196       prepare_for_draw_count_predicate(cmd_buffer, &b, count_address);
4197 
4198    for (uint32_t i = 0; i < maxDrawCount; i++) {
4199       struct anv_address draw = anv_address_add(buffer->address, offset);
4200 
4201       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4202 
4203       if (vs_prog_data->uses_firstvertex ||
4204           vs_prog_data->uses_baseinstance)
4205          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4206       if (vs_prog_data->uses_drawid)
4207          emit_draw_index(cmd_buffer, i);
4208 
4209       /* Emitting draw index or vertex index BOs may result in needing
4210        * additional VF cache flushes.
4211        */
4212       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4213 
4214       load_indirect_parameters(cmd_buffer, draw, false);
4215 
4216       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4217          prim.IndirectParameterEnable  = true;
4218          prim.PredicateEnable          = true;
4219          prim.VertexAccessType         = SEQUENTIAL;
4220          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4221       }
4222 
4223       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4224 
4225       offset += stride;
4226    }
4227 
4228    mi_value_unref(&b, max);
4229 
4230    trace_intel_end_draw_indirect_count(&cmd_buffer->trace,
4231                                        anv_address_utrace(count_address));
4232 }
4233 
4234 void genX(CmdDrawIndexedIndirectCount)(
4235     VkCommandBuffer                             commandBuffer,
4236     VkBuffer                                    _buffer,
4237     VkDeviceSize                                offset,
4238     VkBuffer                                    _countBuffer,
4239     VkDeviceSize                                countBufferOffset,
4240     uint32_t                                    maxDrawCount,
4241     uint32_t                                    stride)
4242 {
4243    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4244    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4245    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4246    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4247    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4248    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4249 
4250    if (anv_batch_has_error(&cmd_buffer->batch))
4251       return;
4252 
4253    anv_measure_snapshot(cmd_buffer,
4254                         INTEL_SNAPSHOT_DRAW,
4255                         "draw indexed indirect count",
4256                         0);
4257    trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
4258 
4259    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4260 
4261    struct mi_builder b;
4262    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4263    struct anv_address count_address =
4264       anv_address_add(count_buffer->address, countBufferOffset);
4265    struct mi_value max =
4266       prepare_for_draw_count_predicate(cmd_buffer, &b, count_address);
4267 
4268    for (uint32_t i = 0; i < maxDrawCount; i++) {
4269       struct anv_address draw = anv_address_add(buffer->address, offset);
4270 
4271       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4272 
4273       /* TODO: We need to stomp base vertex to 0 somehow */
4274       if (vs_prog_data->uses_firstvertex ||
4275           vs_prog_data->uses_baseinstance)
4276          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4277       if (vs_prog_data->uses_drawid)
4278          emit_draw_index(cmd_buffer, i);
4279 
4280       /* Emitting draw index or vertex index BOs may result in needing
4281        * additional VF cache flushes.
4282        */
4283       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4284 
4285       load_indirect_parameters(cmd_buffer, draw, true);
4286 
4287       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4288          prim.IndirectParameterEnable  = true;
4289          prim.PredicateEnable          = true;
4290          prim.VertexAccessType         = RANDOM;
4291          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4292       }
4293 
4294       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4295 
4296       offset += stride;
4297    }
4298 
4299    mi_value_unref(&b, max);
4300 
4301    trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
4302                                                anv_address_utrace(count_address));
4303 }
4304 
4305 void genX(CmdBeginTransformFeedbackEXT)(
4306     VkCommandBuffer                             commandBuffer,
4307     uint32_t                                    firstCounterBuffer,
4308     uint32_t                                    counterBufferCount,
4309     const VkBuffer*                             pCounterBuffers,
4310     const VkDeviceSize*                         pCounterBufferOffsets)
4311 {
4312    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4313 
4314    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4315    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4316    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4317 
4318    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4319     *
4320     *    "Software must ensure that no HW stream output operations can be in
4321     *    process or otherwise pending at the point that the MI_LOAD/STORE
4322     *    commands are processed. This will likely require a pipeline flush."
4323     */
4324    anv_add_pending_pipe_bits(cmd_buffer,
4325                              ANV_PIPE_CS_STALL_BIT,
4326                              "begin transform feedback");
4327    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4328 
4329    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4330       /* If we have a counter buffer, this is a resume so we need to load the
4331        * value into the streamout offset register.  Otherwise, this is a begin
4332        * and we need to reset it to zero.
4333        */
4334       if (pCounterBuffers &&
4335           idx >= firstCounterBuffer &&
4336           idx - firstCounterBuffer < counterBufferCount &&
4337           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4338          uint32_t cb_idx = idx - firstCounterBuffer;
4339          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4340          uint64_t offset = pCounterBufferOffsets ?
4341                            pCounterBufferOffsets[cb_idx] : 0;
4342 
4343          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4344             lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4345             lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
4346                                                    offset);
4347          }
4348       } else {
4349          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4350             lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4351             lri.DataDWord        = 0;
4352          }
4353       }
4354    }
4355 
4356    cmd_buffer->state.xfb_enabled = true;
4357    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4358 }
4359 
4360 void genX(CmdEndTransformFeedbackEXT)(
4361     VkCommandBuffer                             commandBuffer,
4362     uint32_t                                    firstCounterBuffer,
4363     uint32_t                                    counterBufferCount,
4364     const VkBuffer*                             pCounterBuffers,
4365     const VkDeviceSize*                         pCounterBufferOffsets)
4366 {
4367    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4368 
4369    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4370    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4371    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4372 
4373    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4374     *
4375     *    "Software must ensure that no HW stream output operations can be in
4376     *    process or otherwise pending at the point that the MI_LOAD/STORE
4377     *    commands are processed. This will likely require a pipeline flush."
4378     */
4379    anv_add_pending_pipe_bits(cmd_buffer,
4380                              ANV_PIPE_CS_STALL_BIT,
4381                              "end transform feedback");
4382    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4383 
4384    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
4385       unsigned idx = firstCounterBuffer + cb_idx;
4386 
4387       /* If we have a counter buffer, this is a pause, so we store the current
4388        * value of the streamout offset register into the counter buffer so that
4389        * a later vkCmdBeginTransformFeedbackEXT can resume where we left off.
4390        */
4391       if (pCounterBuffers &&
4392           cb_idx < counterBufferCount &&
4393           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
4394          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4395          uint64_t offset = pCounterBufferOffsets ?
4396                            pCounterBufferOffsets[cb_idx] : 0;
4397 
4398          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
4399             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
4400                                                    offset);
4401             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4402          }
4403       }
4404    }
4405 
4406    cmd_buffer->state.xfb_enabled = false;
4407    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4408 }
4409 
4410 static void
4411 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
4412 {
4413    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
4414    struct anv_compute_pipeline *pipeline = comp_state->pipeline;
4415 
4416    assert(pipeline->cs);
4417 
4418    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
4419 
4420    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
4421 
4422    /* Apply any pending pipeline flushes we may have.  We want to apply them
4423     * now because, if any of those flushes are for things like push constants,
4424     * the GPU will read the state at weird times.
4425     */
4426    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4427 
4428    if (cmd_buffer->state.compute.pipeline_dirty) {
4429       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4430        *
4431        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4432        *    the only bits that are changed are scoreboard related: Scoreboard
4433        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4434        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
4435        *    sufficient."
4436        */
4437       anv_add_pending_pipe_bits(cmd_buffer,
4438                               ANV_PIPE_CS_STALL_BIT,
4439                               "flush compute state");
4440       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4441 
4442       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
4443 
4444       /* The workgroup size of the pipeline affects our push constant layout
4445        * so flag push constants as dirty if we change the pipeline.
4446        */
4447       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4448    }
4449 
4450    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
4451        cmd_buffer->state.compute.pipeline_dirty) {
4452       flush_descriptor_sets(cmd_buffer,
4453                             &cmd_buffer->state.compute.base,
4454                             VK_SHADER_STAGE_COMPUTE_BIT,
4455                             &pipeline->cs, 1);
4456       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4457 
4458       uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
4459       struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
4460          .BindingTablePointer =
4461             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
4462          .SamplerStatePointer =
4463             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
4464       };
4465       GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
4466 
4467       struct anv_state state =
4468          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
4469                                       pipeline->interface_descriptor_data,
4470                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
4471                                       64);
4472 
4473       uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4474       anv_batch_emit(&cmd_buffer->batch,
4475                      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
4476          mid.InterfaceDescriptorTotalLength        = size;
4477          mid.InterfaceDescriptorDataStartAddress   = state.offset;
4478       }
4479    }
4480 
4481    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
4482       comp_state->push_data =
4483          anv_cmd_buffer_cs_push_constants(cmd_buffer);
4484 
4485       if (comp_state->push_data.alloc_size) {
4486          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
4487             curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
4488             curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
4489          }
4490       }
4491 
4492       cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4493    }
4494 
4495    cmd_buffer->state.compute.pipeline_dirty = false;
4496 
4497    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4498 }
4499 
4500 #if GFX_VER == 7
4501 
4502 static VkResult
4503 verify_cmd_parser(const struct anv_device *device,
4504                   int required_version,
4505                   const char *function)
4506 {
4507    if (device->physical->cmd_parser_version < required_version) {
4508       return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
4509                        "cmd parser version %d is required for %s",
4510                        required_version, function);
4511    } else {
4512       return VK_SUCCESS;
4513    }
4514 }
4515 
4516 #endif
4517 
4518 static void
4519 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
4520                                   uint32_t baseGroupX,
4521                                   uint32_t baseGroupY,
4522                                   uint32_t baseGroupZ)
4523 {
4524    if (anv_batch_has_error(&cmd_buffer->batch))
4525       return;
4526 
4527    struct anv_push_constants *push =
4528       &cmd_buffer->state.compute.base.push_constants;
4529    if (push->cs.base_work_group_id[0] != baseGroupX ||
4530        push->cs.base_work_group_id[1] != baseGroupY ||
4531        push->cs.base_work_group_id[2] != baseGroupZ) {
4532       push->cs.base_work_group_id[0] = baseGroupX;
4533       push->cs.base_work_group_id[1] = baseGroupY;
4534       push->cs.base_work_group_id[2] = baseGroupZ;
4535 
4536       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4537    }
4538 }
4539 
4540 static inline void
4541 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
4542                   const struct anv_compute_pipeline *pipeline, bool indirect,
4543                   const struct elk_cs_prog_data *prog_data,
4544                   uint32_t groupCountX, uint32_t groupCountY,
4545                   uint32_t groupCountZ)
4546 {
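   /* On gfx7, indirect dispatches are predicated so that the walker can be
    * skipped when any of the indirectly loaded group counts is zero; the
    * MI_PREDICATE sequence for this is set up in genX(CmdDispatchIndirect).
    */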
4547    bool predicate = (GFX_VER <= 7 && indirect) ||
4548       cmd_buffer->state.conditional_render_enabled;
4549 
4550    const struct intel_device_info *devinfo = pipeline->base.device->info;
4551    const struct intel_cs_dispatch_info dispatch =
4552       elk_cs_get_dispatch_info(devinfo, prog_data, NULL);
4553 
4554    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
4555       ggw.IndirectParameterEnable      = indirect;
4556       ggw.PredicateEnable              = predicate;
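      /* The SIMDSize field encodes SIMD8/16/32 as 0/1/2, hence the divide. */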
4557       ggw.SIMDSize                     = dispatch.simd_size / 16;
4558       ggw.ThreadDepthCounterMaximum    = 0;
4559       ggw.ThreadHeightCounterMaximum   = 0;
4560       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
4561       ggw.ThreadGroupIDXDimension      = groupCountX;
4562       ggw.ThreadGroupIDYDimension      = groupCountY;
4563       ggw.ThreadGroupIDZDimension      = groupCountZ;
4564       ggw.RightExecutionMask           = dispatch.right_mask;
4565       ggw.BottomExecutionMask          = 0xffffffff;
4566    }
4567 
4568    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
4569 }
4570 
4571 static inline void
4572 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
4573                const struct anv_compute_pipeline *pipeline, bool indirect,
4574                const struct elk_cs_prog_data *prog_data,
4575                uint32_t groupCountX, uint32_t groupCountY,
4576                uint32_t groupCountZ)
4577 {
4578    emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
4579                      groupCountY, groupCountZ);
4580 }
4581 
4582 void genX(CmdDispatchBase)(
4583     VkCommandBuffer                             commandBuffer,
4584     uint32_t                                    baseGroupX,
4585     uint32_t                                    baseGroupY,
4586     uint32_t                                    baseGroupZ,
4587     uint32_t                                    groupCountX,
4588     uint32_t                                    groupCountY,
4589     uint32_t                                    groupCountZ)
4590 {
4591    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4592    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4593    const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4594 
4595    anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
4596                                      baseGroupY, baseGroupZ);
4597 
4598    if (anv_batch_has_error(&cmd_buffer->batch))
4599       return;
4600 
4601    anv_measure_snapshot(cmd_buffer,
4602                         INTEL_SNAPSHOT_COMPUTE,
4603                         "compute",
4604                         groupCountX * groupCountY * groupCountZ *
4605                         prog_data->local_size[0] * prog_data->local_size[1] *
4606                         prog_data->local_size[2]);
4607 
4608    trace_intel_begin_compute(&cmd_buffer->trace);
4609 
4610    if (prog_data->uses_num_work_groups) {
4611       struct anv_state state =
4612          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
4613       uint32_t *sizes = state.map;
4614       sizes[0] = groupCountX;
4615       sizes[1] = groupCountY;
4616       sizes[2] = groupCountZ;
4617       cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
4618          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4619          .offset = state.offset,
4620       };
4621 
4622       /* The num_workgroups buffer goes in the binding table */
4623       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4624    }
4625 
4626    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4627 
4628    if (cmd_buffer->state.conditional_render_enabled)
4629       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4630 
4631    emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
4632                   groupCountY, groupCountZ);
4633 
4634    trace_intel_end_compute(&cmd_buffer->trace,
4635                            groupCountX, groupCountY, groupCountZ);
4636 }
4637 
4638 #define GPGPU_DISPATCHDIMX 0x2500
4639 #define GPGPU_DISPATCHDIMY 0x2504
4640 #define GPGPU_DISPATCHDIMZ 0x2508
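
/* GPGPU_WALKER reads its thread group counts from these registers when
 * IndirectParameterEnable is set.  They line up with
 * VkDispatchIndirectCommand { x, y, z } at offsets 0, 4 and 8 in the
 * indirect buffer.
 */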
4641 
4642 void genX(CmdDispatchIndirect)(
4643     VkCommandBuffer                             commandBuffer,
4644     VkBuffer                                    _buffer,
4645     VkDeviceSize                                offset)
4646 {
4647    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4648    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4649    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4650    const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4651    struct anv_address addr = anv_address_add(buffer->address, offset);
4652    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
4653 
4654    anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
4655 
4656 #if GFX_VER == 7
4657    /* Linux 4.4 added command parser version 5 which allows the GPGPU
4658     * indirect dispatch registers to be written.
4659     */
4660    if (verify_cmd_parser(cmd_buffer->device, 5,
4661                          "vkCmdDispatchIndirect") != VK_SUCCESS)
4662       return;
4663 #endif
4664 
4665    anv_measure_snapshot(cmd_buffer,
4666                         INTEL_SNAPSHOT_COMPUTE,
4667                         "compute indirect",
4668                         0);
4669    trace_intel_begin_compute(&cmd_buffer->trace);
4670 
4671    if (prog_data->uses_num_work_groups) {
4672       cmd_buffer->state.compute.num_workgroups = addr;
4673 
4674       /* The num_workgroups buffer goes in the binding table */
4675       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4676    }
4677 
4678    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4679 
4680    struct mi_builder b;
4681    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4682 
4683    struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
4684    struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
4685    struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
4686 
4687    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
4688    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
4689    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
4690 
4691 #if GFX_VER <= 7
4692    /* predicate = (compute_dispatch_indirect_x_size == 0); */
4693    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
4694    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4695    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4696       mip.LoadOperation    = LOAD_LOAD;
4697       mip.CombineOperation = COMBINE_SET;
4698       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4699    }
4700 
4701    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
4702    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
4703    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4704       mip.LoadOperation    = LOAD_LOAD;
4705       mip.CombineOperation = COMBINE_OR;
4706       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4707    }
4708 
4709    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
4710    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
4711    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4712       mip.LoadOperation    = LOAD_LOAD;
4713       mip.CombineOperation = COMBINE_OR;
4714       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4715    }
4716 
4717    /* predicate = !predicate; */
4718    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4719       mip.LoadOperation    = LOAD_LOADINV;
4720       mip.CombineOperation = COMBINE_OR;
4721       mip.CompareOperation = COMPARE_FALSE;
4722    }
4723 
4724 #if GFX_VERx10 == 75
4725    if (cmd_buffer->state.conditional_render_enabled) {
4726       /* predicate &= !(conditional_rendering_predicate == 0); */
4727       mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
4728                    mi_reg32(ANV_PREDICATE_RESULT_REG));
4729       anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4730          mip.LoadOperation    = LOAD_LOADINV;
4731          mip.CombineOperation = COMBINE_AND;
4732          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4733       }
4734    }
4735 #endif
4736 
4737 #else /* GFX_VER > 7 */
4738    if (cmd_buffer->state.conditional_render_enabled)
4739       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4740 #endif
4741 
4742    emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
4743 
4744    trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
4745 }
4746 
4747 static void
4748 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
4749                             uint32_t pipeline)
4750 {
4751    UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
4752 
4753    if (cmd_buffer->state.current_pipeline == pipeline)
4754       return;
4755 
4756 #if GFX_VER >= 8
4757    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
4758     *
4759     *   Software must clear the COLOR_CALC_STATE Valid field in
4760     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
4761     *   with Pipeline Select set to GPGPU.
4762     *
4763     * The internal hardware docs recommend the same workaround for Gfx9
4764     * hardware too.
4765     */
4766    if (pipeline == GPGPU)
4767       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
4768 #endif
4769 
4770    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
4771     * PIPELINE_SELECT [DevBWR+]":
4772     *
4773     *   Project: DEVSNB+
4774     *
4775     *   Software must ensure all the write caches are flushed through a
4776     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
4777     *   command to invalidate read only caches prior to programming
4778     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
4779     *
4780     * Note the cmd_buffer_apply_pipe_flushes will split this into two
4781     * PIPE_CONTROLs.
4782     */
4783    anv_add_pending_pipe_bits(cmd_buffer,
4784                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4785                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4786                              ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
4787                              ANV_PIPE_CS_STALL_BIT |
4788                              ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4789                              ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
4790                              ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
4791                              ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
4792                              ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT,
4793                              "flush and invalidate for PIPELINE_SELECT");
4794    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4795 
4796    anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
4797       ps.PipelineSelection = pipeline;
4798    }
4799 
4800    cmd_buffer->state.current_pipeline = pipeline;
4801 }
4802 
4803 void
4804 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
4805 {
4806    genX(flush_pipeline_select)(cmd_buffer, _3D);
4807 }
4808 
4809 void
4810 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
4811 {
4812    genX(flush_pipeline_select)(cmd_buffer, GPGPU);
4813 }
4814 
4815 void
4816 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
4817 {
4818    if (GFX_VER >= 8)
4819       return;
4820 
4821    /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
4822     *
4823     *    "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
4824     *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
4825     *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
4826     *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
4827     *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
4828     *    Depth Flush Bit set, followed by another pipelined depth stall
4829     *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
4830     *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
4831     *    via a preceding MI_FLUSH)."
4832     */
4833    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4834       pipe.DepthStallEnable = true;
4835       anv_debug_dump_pc(pipe);
4836    }
4837    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4838       pipe.DepthCacheFlushEnable = true;
4839       anv_debug_dump_pc(pipe);
4840    }
4841    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4842       pipe.DepthStallEnable = true;
4843       anv_debug_dump_pc(pipe);
4844    }
4845 }
4846 
4847 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
4848  *
4849  *    "The VF cache needs to be invalidated before binding and then using
4850  *    Vertex Buffers that overlap with any previously bound Vertex Buffer
4851  *    (at a 64B granularity) since the last invalidation.  A VF cache
4852  *    invalidate is performed by setting the "VF Cache Invalidation Enable"
4853  *    bit in PIPE_CONTROL."
4854  *
4855  * This is implemented by carefully tracking all vertex and index buffer
4856  * bindings and flushing if the addresses in the cache would ever span a
4857  * range exceeding 4 GiB.  The tracking is done in three parts:
4858  *
4859  *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
4860  *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
4861  *       tracking code of the new binding.  If this new binding would cause
4862  *       the cache to have a too-large range on the next draw call, a pipeline
4863  *       stall and VF cache invalidate are added to pending_pipe_bits.
4864  *
4865  *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
4866  *       empty whenever we emit a VF invalidate.
4867  *
4868  *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
4869  *       after every 3DPRIMITIVE and copies the bound range into the dirty
4870  *       range for each used buffer.  This has to be a separate step because
4871  *       we don't always re-bind all buffers and so 1. can't know which
4872  *       buffers are actually bound.
4873  */
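/* Illustrative call sequence (a sketch of how the three parts above fit
 * together, not new driver code):
 *
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb, addr, size);
 *    ... emit 3DSTATE_VERTEX_BUFFERS ...
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 *    ... emit 3DPRIMITIVE ...
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
 *                                                        SEQUENTIAL, vb_used);
 */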
4874 void
4875 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4876                                                int vb_index,
4877                                                struct anv_address vb_address,
4878                                                uint32_t vb_size)
4879 {
4880    if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4881       return;
4882 
4883    struct anv_vb_cache_range *bound, *dirty;
4884    if (vb_index == -1) {
4885       bound = &cmd_buffer->state.gfx.ib_bound_range;
4886       dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4887    } else {
4888       assert(vb_index >= 0);
4889       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4890       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4891       bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4892       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4893    }
4894 
4895    if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4896                                                   vb_address,
4897                                                   vb_size)) {
4898       anv_add_pending_pipe_bits(cmd_buffer,
4899                                 ANV_PIPE_CS_STALL_BIT |
4900                                 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4901                                 "vb > 32b range");
4902    }
4903 }
4904 
4905 void
4906 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4907                                                     uint32_t access_type,
4908                                                     uint64_t vb_used)
4909 {
4910    if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4911       return;
4912 
4913    if (access_type == RANDOM) {
4914       /* We have an index buffer */
4915       struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4916       struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4917 
4918       anv_merge_vb_cache_range(dirty, bound);
4919    }
4920 
4921    uint64_t mask = vb_used;
4922    while (mask) {
4923       int i = u_bit_scan64(&mask);
4924       assert(i >= 0);
4925       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4926       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4927 
4928       struct anv_vb_cache_range *bound, *dirty;
4929       bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4930       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4931 
4932       anv_merge_vb_cache_range(dirty, bound);
4933    }
4934 }
4935 
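/* Emit the depth/stencil buffer state (3DSTATE_DEPTH_BUFFER and friends) for
 * the current attachments by filling an isl_depth_stencil_hiz_emit_info and
 * letting isl pack the dwords.
 */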
4936 static void
4937 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4938 {
4939    struct anv_device *device = cmd_buffer->device;
4940    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4941 
4942    /* FIXME: Width and Height are wrong */
4943 
4944    genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
4945 
4946    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4947                                         device->isl_dev.ds.size / 4);
4948    if (dw == NULL)
4949       return;
4950 
4951    struct isl_view isl_view = {};
4952    struct isl_depth_stencil_hiz_emit_info info = {
4953       .view = &isl_view,
4954       .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4955    };
4956 
4957    if (gfx->depth_att.iview != NULL) {
4958       isl_view = gfx->depth_att.iview->planes[0].isl;
4959    } else if (gfx->stencil_att.iview != NULL) {
4960       isl_view = gfx->stencil_att.iview->planes[0].isl;
4961    }
4962 
4963    if (gfx->view_mask) {
4964       assert(isl_view.array_len == 0 ||
4965              isl_view.array_len >= util_last_bit(gfx->view_mask));
4966       isl_view.array_len = util_last_bit(gfx->view_mask);
4967    } else {
4968       assert(isl_view.array_len == 0 ||
4969              isl_view.array_len >= util_last_bit(gfx->layer_count));
4970       isl_view.array_len = gfx->layer_count;
4971    }
4972 
4973    if (gfx->depth_att.iview != NULL) {
4974       const struct anv_image_view *iview = gfx->depth_att.iview;
4975       const struct anv_image *image = iview->image;
4976 
4977       const uint32_t depth_plane =
4978          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4979       const struct anv_surface *depth_surface =
4980          &image->planes[depth_plane].primary_surface;
4981       const struct anv_address depth_address =
4982          anv_image_address(image, &depth_surface->memory_range);
4983 
4984       info.depth_surf = &depth_surface->isl;
4985 
4986       info.depth_address =
4987          anv_batch_emit_reloc(&cmd_buffer->batch,
4988                               dw + device->isl_dev.ds.depth_offset / 4,
4989                               depth_address.bo, depth_address.offset);
4990       info.mocs =
4991          anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
4992 
4993       info.hiz_usage = gfx->depth_att.aux_usage;
4994       if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
4995          assert(isl_aux_usage_has_hiz(info.hiz_usage));
4996 
4997          const struct anv_surface *hiz_surface =
4998             &image->planes[depth_plane].aux_surface;
4999          const struct anv_address hiz_address =
5000             anv_image_address(image, &hiz_surface->memory_range);
5001 
5002          info.hiz_surf = &hiz_surface->isl;
5003 
5004          info.hiz_address =
5005             anv_batch_emit_reloc(&cmd_buffer->batch,
5006                                  dw + device->isl_dev.ds.hiz_offset / 4,
5007                                  hiz_address.bo, hiz_address.offset);
5008 
5009          info.depth_clear_value = ANV_HZ_FC_VAL;
5010       }
5011    }
5012 
5013    if (gfx->stencil_att.iview != NULL) {
5014       const struct anv_image_view *iview = gfx->stencil_att.iview;
5015       const struct anv_image *image = iview->image;
5016 
5017       const uint32_t stencil_plane =
5018          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5019       const struct anv_surface *stencil_surface =
5020          &image->planes[stencil_plane].primary_surface;
5021       const struct anv_address stencil_address =
5022          anv_image_address(image, &stencil_surface->memory_range);
5023 
5024       info.stencil_surf = &stencil_surface->isl;
5025 
5026       info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
5027       info.stencil_address =
5028          anv_batch_emit_reloc(&cmd_buffer->batch,
5029                               dw + device->isl_dev.ds.stencil_offset / 4,
5030                               stencil_address.bo, stencil_address.offset);
5031       info.mocs =
5032          anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
5033    }
5034 
5035    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5036 
5037    cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5038 }
5039 
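/* Return the attachment's initial layout: the one provided through
 * VkRenderingAttachmentInitialLayoutInfoMESA if chained, otherwise the
 * attachment's imageLayout.
 */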
5040 static VkImageLayout
5041 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
5042 {
5043    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
5044       vk_find_struct_const(att->pNext,
5045                            RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
5046    if (layout_info != NULL)
5047       return layout_info->initialLayout;
5048 
5049    return att->imageLayout;
5050 }
5051 
5052 void genX(CmdBeginRendering)(
5053     VkCommandBuffer                             commandBuffer,
5054     const VkRenderingInfo*                      pRenderingInfo)
5055 {
5056    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5057    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5058    VkResult result;
5059 
5060    if (!is_render_queue_cmd_buffer(cmd_buffer)) {
5061       assert(!"Trying to start a render pass on non-render queue!");
5062       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
5063       return;
5064    }
5065 
5066    anv_measure_beginrenderpass(cmd_buffer);
5067    trace_intel_begin_render_pass(&cmd_buffer->trace);
5068 
5069    gfx->rendering_flags = pRenderingInfo->flags;
5070    gfx->render_area = pRenderingInfo->renderArea;
5071    gfx->view_mask = pRenderingInfo->viewMask;
5072    gfx->layer_count = pRenderingInfo->layerCount;
5073    gfx->samples = 0;
5074 
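   /* With multiview, the layers we may touch are bounded by the highest view
    * set in the view mask; otherwise we use the client-provided layerCount.
    */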
5075    const bool is_multiview = gfx->view_mask != 0;
5076    const VkRect2D render_area = gfx->render_area;
5077    const uint32_t layers =
5078       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5079 
5080    /* The framebuffer size is at least large enough to contain the render
5081     * area.  Because a zero renderArea is possible, we MAX with 1.
5082     */
5083    struct isl_extent3d fb_size = {
5084       .w = MAX2(1, render_area.offset.x + render_area.extent.width),
5085       .h = MAX2(1, render_area.offset.y + render_area.extent.height),
5086       .d = layers,
5087    };
5088 
5089    const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
5090    result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
5091    if (result != VK_SUCCESS)
5092       return;
5093 
5094    genX(flush_pipeline_select_3d)(cmd_buffer);
5095 
5096    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5097       if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
5098          continue;
5099 
5100       const VkRenderingAttachmentInfo *att =
5101          &pRenderingInfo->pColorAttachments[i];
5102       ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
5103       const VkImageLayout initial_layout = attachment_initial_layout(att);
5104 
5105       assert(render_area.offset.x + render_area.extent.width <=
5106              iview->vk.extent.width);
5107       assert(render_area.offset.y + render_area.extent.height <=
5108              iview->vk.extent.height);
5109       assert(layers <= iview->vk.layer_count);
5110 
5111       fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
5112       fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
5113 
5114       assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
5115       gfx->samples |= iview->vk.image->samples;
5116 
5117       enum isl_aux_usage aux_usage =
5118          anv_layout_to_aux_usage(cmd_buffer->device->info,
5119                                  iview->image,
5120                                  VK_IMAGE_ASPECT_COLOR_BIT,
5121                                  VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
5122                                  att->imageLayout);
5123 
5124       union isl_color_value fast_clear_color = { .u32 = { 0, } };
5125 
5126       if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5127           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
5128          const union isl_color_value clear_color =
5129             vk_to_isl_color_with_format(att->clearValue.color,
5130                                         iview->planes[0].isl.format);
5131 
5132          /* We only support fast-clears on the first layer */
5133          const bool fast_clear =
5134             (!is_multiview || (gfx->view_mask & 1)) &&
5135             anv_can_fast_clear_color_view(cmd_buffer->device, iview,
5136                                           att->imageLayout, clear_color,
5137                                           layers, render_area);
5138 
5139          if (att->imageLayout != initial_layout) {
5140             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5141                    render_area.extent.width == iview->vk.extent.width &&
5142                    render_area.extent.height == iview->vk.extent.height);
5143             if (is_multiview) {
5144                u_foreach_bit(view, gfx->view_mask) {
5145                   transition_color_buffer(cmd_buffer, iview->image,
5146                                           VK_IMAGE_ASPECT_COLOR_BIT,
5147                                           iview->vk.base_mip_level, 1,
5148                                           iview->vk.base_array_layer + view,
5149                                           1, /* layer_count */
5150                                           initial_layout, att->imageLayout,
5151                                           VK_QUEUE_FAMILY_IGNORED,
5152                                           VK_QUEUE_FAMILY_IGNORED,
5153                                           fast_clear);
5154                }
5155             } else {
5156                transition_color_buffer(cmd_buffer, iview->image,
5157                                        VK_IMAGE_ASPECT_COLOR_BIT,
5158                                        iview->vk.base_mip_level, 1,
5159                                        iview->vk.base_array_layer,
5160                                        gfx->layer_count,
5161                                        initial_layout, att->imageLayout,
5162                                        VK_QUEUE_FAMILY_IGNORED,
5163                                        VK_QUEUE_FAMILY_IGNORED,
5164                                        fast_clear);
5165             }
5166          }
5167 
5168          uint32_t clear_view_mask = pRenderingInfo->viewMask;
5169          uint32_t base_clear_layer = iview->vk.base_array_layer;
5170          uint32_t clear_layer_count = gfx->layer_count;
5171          if (fast_clear) {
5172             /* We only support fast-clears on the first layer */
5173             assert(iview->vk.base_mip_level == 0 &&
5174                    iview->vk.base_array_layer == 0);
5175 
5176             fast_clear_color = clear_color;
5177 
5178             if (iview->image->vk.samples == 1) {
5179                anv_image_ccs_op(cmd_buffer, iview->image,
5180                                 iview->planes[0].isl.format,
5181                                 iview->planes[0].isl.swizzle,
5182                                 VK_IMAGE_ASPECT_COLOR_BIT,
5183                                 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
5184                                 &fast_clear_color,
5185                                 false);
5186             } else {
5187                anv_image_mcs_op(cmd_buffer, iview->image,
5188                                 iview->planes[0].isl.format,
5189                                 iview->planes[0].isl.swizzle,
5190                                 VK_IMAGE_ASPECT_COLOR_BIT,
5191                                 0, 1, ISL_AUX_OP_FAST_CLEAR,
5192                                 &fast_clear_color,
5193                                 false);
5194             }
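            /* The first layer (or view 0) was handled by the fast clear
             * above, so drop it from the slow-clear pass below.
             */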
5195             clear_view_mask &= ~1u;
5196             base_clear_layer++;
5197             clear_layer_count--;
5198 
5199             set_image_clear_color(cmd_buffer, iview->image,
5200                                   VK_IMAGE_ASPECT_COLOR_BIT, clear_color);
5201 
5202             if (isl_color_value_is_zero(clear_color,
5203                                         iview->planes[0].isl.format)) {
5204                /* This image has the auxiliary buffer enabled. We can mark the
5205                 * subresource as not needing a resolve because the clear color
5206                 * will match what's in every RENDER_SURFACE_STATE object when
5207                 * it's being used for sampling.
5208                 */
5209                set_image_fast_clear_state(cmd_buffer, iview->image,
5210                                           VK_IMAGE_ASPECT_COLOR_BIT,
5211                                           ANV_FAST_CLEAR_DEFAULT_VALUE);
5212             } else {
5213                set_image_fast_clear_state(cmd_buffer, iview->image,
5214                                           VK_IMAGE_ASPECT_COLOR_BIT,
5215                                           ANV_FAST_CLEAR_ANY);
5216             }
5217          }
5218 
5219          if (is_multiview) {
5220             u_foreach_bit(view, clear_view_mask) {
5221                anv_image_clear_color(cmd_buffer, iview->image,
5222                                      VK_IMAGE_ASPECT_COLOR_BIT,
5223                                      aux_usage,
5224                                      iview->planes[0].isl.format,
5225                                      iview->planes[0].isl.swizzle,
5226                                      iview->vk.base_mip_level,
5227                                      iview->vk.base_array_layer + view, 1,
5228                                      render_area, clear_color);
5229             }
5230          } else {
5231             anv_image_clear_color(cmd_buffer, iview->image,
5232                                   VK_IMAGE_ASPECT_COLOR_BIT,
5233                                   aux_usage,
5234                                   iview->planes[0].isl.format,
5235                                   iview->planes[0].isl.swizzle,
5236                                   iview->vk.base_mip_level,
5237                                   base_clear_layer, clear_layer_count,
5238                                   render_area, clear_color);
5239          }
5240       } else {
5241          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5242          assert(att->imageLayout == initial_layout);
5243       }
5244 
5245       gfx->color_att[i].vk_format = iview->vk.format;
5246       gfx->color_att[i].iview = iview;
5247       gfx->color_att[i].layout = att->imageLayout;
5248       gfx->color_att[i].aux_usage = aux_usage;
5249 
5250       struct isl_view isl_view = iview->planes[0].isl;
5251       if (pRenderingInfo->viewMask) {
5252          assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
5253          isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
5254       } else {
5255          assert(isl_view.array_len >= pRenderingInfo->layerCount);
5256          isl_view.array_len = pRenderingInfo->layerCount;
5257       }
5258 
5259       anv_image_fill_surface_state(cmd_buffer->device,
5260                                    iview->image,
5261                                    VK_IMAGE_ASPECT_COLOR_BIT,
5262                                    &isl_view,
5263                                    ISL_SURF_USAGE_RENDER_TARGET_BIT,
5264                                    aux_usage, &fast_clear_color,
5265                                    0, /* anv_image_view_state_flags */
5266                                    &gfx->color_att[i].surface_state,
5267                                    NULL);
5268 
5269       add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
5270 
5271       if ((att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
5272            (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
5273           iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
5274           iview->planes[0].isl.base_level == 0 &&
5275           iview->planes[0].isl.base_array_layer == 0) {
5276          genX(copy_fast_clear_dwords)(cmd_buffer,
5277                                       gfx->color_att[i].surface_state.state,
5278                                       iview->image,
5279                                       VK_IMAGE_ASPECT_COLOR_BIT,
5280                                       false /* copy to ss */);
5281       }
5282 
5283       if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
5284          gfx->color_att[i].resolve_mode = att->resolveMode;
5285          gfx->color_att[i].resolve_iview =
5286             anv_image_view_from_handle(att->resolveImageView);
5287          gfx->color_att[i].resolve_layout = att->resolveImageLayout;
5288       }
5289    }
5290 
5291    anv_cmd_graphic_state_update_has_uint_rt(gfx);
5292 
5293    const struct anv_image_view *ds_iview = NULL;
5294    const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
5295    const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
5296    if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
5297        (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
5298       const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
5299       VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5300       VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5301       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5302       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5303       enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
5304       enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
5305       float depth_clear_value = 0;
5306       uint32_t stencil_clear_value = 0;
5307 
5308       if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
5309          d_iview = anv_image_view_from_handle(d_att->imageView);
5310          initial_depth_layout = attachment_initial_layout(d_att);
5311          depth_layout = d_att->imageLayout;
5312          depth_aux_usage =
5313             anv_layout_to_aux_usage(cmd_buffer->device->info,
5314                                     d_iview->image,
5315                                     VK_IMAGE_ASPECT_DEPTH_BIT,
5316                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5317                                     depth_layout);
5318          depth_clear_value = d_att->clearValue.depthStencil.depth;
5319       }
5320 
5321       if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
5322          s_iview = anv_image_view_from_handle(s_att->imageView);
5323          initial_stencil_layout = attachment_initial_layout(s_att);
5324          stencil_layout = s_att->imageLayout;
5325          stencil_aux_usage =
5326             anv_layout_to_aux_usage(cmd_buffer->device->info,
5327                                     s_iview->image,
5328                                     VK_IMAGE_ASPECT_STENCIL_BIT,
5329                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5330                                     stencil_layout);
5331          stencil_clear_value = s_att->clearValue.depthStencil.stencil;
5332       }
5333 
5334       assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
5335       ds_iview = d_iview != NULL ? d_iview : s_iview;
5336       assert(ds_iview != NULL);
5337 
5338       assert(render_area.offset.x + render_area.extent.width <=
5339              ds_iview->vk.extent.width);
5340       assert(render_area.offset.y + render_area.extent.height <=
5341              ds_iview->vk.extent.height);
5342       assert(layers <= ds_iview->vk.layer_count);
5343 
5344       fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
5345       fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
5346 
5347       assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
5348       gfx->samples |= ds_iview->vk.image->samples;
5349 
5350       VkImageAspectFlags clear_aspects = 0;
5351       if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5352           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5353          clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5354       if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5355           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5356          clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5357 
5358       if (clear_aspects != 0) {
5359          const bool hiz_clear =
5360             anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
5361                                       depth_layout, clear_aspects,
5362                                       depth_clear_value,
5363                                       render_area);
5364 
5365          if (depth_layout != initial_depth_layout) {
5366             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5367                    render_area.extent.width == d_iview->vk.extent.width &&
5368                    render_area.extent.height == d_iview->vk.extent.height);
5369 
5370             if (is_multiview) {
5371                u_foreach_bit(view, gfx->view_mask) {
5372                   transition_depth_buffer(cmd_buffer, d_iview->image,
5373                                           d_iview->vk.base_array_layer + view,
5374                                           1 /* layer_count */,
5375                                           initial_depth_layout, depth_layout,
5376                                           hiz_clear);
5377                }
5378             } else {
5379                transition_depth_buffer(cmd_buffer, d_iview->image,
5380                                        d_iview->vk.base_array_layer,
5381                                        gfx->layer_count,
5382                                        initial_depth_layout, depth_layout,
5383                                        hiz_clear);
5384             }
5385          }
5386 
5387          if (stencil_layout != initial_stencil_layout) {
5388             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5389                    render_area.extent.width == s_iview->vk.extent.width &&
5390                    render_area.extent.height == s_iview->vk.extent.height);
5391 
5392             if (is_multiview) {
5393                u_foreach_bit(view, gfx->view_mask) {
5394                   transition_stencil_buffer(cmd_buffer, s_iview->image,
5395                                             s_iview->vk.base_mip_level, 1,
5396                                             s_iview->vk.base_array_layer + view,
5397                                             1 /* layer_count */,
5398                                             initial_stencil_layout,
5399                                             stencil_layout,
5400                                             hiz_clear);
5401                }
5402             } else {
5403                transition_stencil_buffer(cmd_buffer, s_iview->image,
5404                                          s_iview->vk.base_mip_level, 1,
5405                                          s_iview->vk.base_array_layer,
5406                                          gfx->layer_count,
5407                                          initial_stencil_layout,
5408                                          stencil_layout,
5409                                          hiz_clear);
5410             }
5411          }
5412 
5413          if (is_multiview) {
5414             uint32_t clear_view_mask = pRenderingInfo->viewMask;
5415             while (clear_view_mask) {
5416                int view = u_bit_scan(&clear_view_mask);
5417 
5418                uint32_t level = ds_iview->vk.base_mip_level;
5419                uint32_t layer = ds_iview->vk.base_array_layer + view;
5420 
5421                if (hiz_clear) {
5422                   anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5423                                       clear_aspects,
5424                                       level, layer, 1,
5425                                       render_area,
5426                                       stencil_clear_value);
5427                } else {
5428                   anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5429                                                 clear_aspects,
5430                                                 depth_aux_usage,
5431                                                 level, layer, 1,
5432                                                 render_area,
5433                                                 depth_clear_value,
5434                                                 stencil_clear_value);
5435                }
5436             }
5437          } else {
5438             uint32_t level = ds_iview->vk.base_mip_level;
5439             uint32_t base_layer = ds_iview->vk.base_array_layer;
5440             uint32_t layer_count = gfx->layer_count;
5441 
5442             if (hiz_clear) {
5443                anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5444                                    clear_aspects,
5445                                    level, base_layer, layer_count,
5446                                    render_area,
5447                                    stencil_clear_value);
5448             } else {
5449                anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5450                                              clear_aspects,
5451                                              depth_aux_usage,
5452                                              level, base_layer, layer_count,
5453                                              render_area,
5454                                              depth_clear_value,
5455                                              stencil_clear_value);
5456             }
5457          }
5458       } else {
5459          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5460          assert(depth_layout == initial_depth_layout);
5461          assert(stencil_layout == initial_stencil_layout);
5462       }
5463 
5464       if (d_iview != NULL) {
5465          gfx->depth_att.vk_format = d_iview->vk.format;
5466          gfx->depth_att.iview = d_iview;
5467          gfx->depth_att.layout = depth_layout;
5468          gfx->depth_att.aux_usage = depth_aux_usage;
5469          if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5470             assert(d_att->resolveImageView != VK_NULL_HANDLE);
5471             gfx->depth_att.resolve_mode = d_att->resolveMode;
5472             gfx->depth_att.resolve_iview =
5473                anv_image_view_from_handle(d_att->resolveImageView);
5474             gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
5475          }
5476       }
5477 
5478       if (s_iview != NULL) {
5479          gfx->stencil_att.vk_format = s_iview->vk.format;
5480          gfx->stencil_att.iview = s_iview;
5481          gfx->stencil_att.layout = stencil_layout;
5482          gfx->stencil_att.aux_usage = stencil_aux_usage;
5483          if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5484             assert(s_att->resolveImageView != VK_NULL_HANDLE);
5485             gfx->stencil_att.resolve_mode = s_att->resolveMode;
5486             gfx->stencil_att.resolve_iview =
5487                anv_image_view_from_handle(s_att->resolveImageView);
5488             gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
5489          }
5490       }
5491    }
5492 
5493    /* Finally, now that we know the right size, set up the null surface */
5494    assert(util_bitcount(gfx->samples) <= 1);
5495    isl_null_fill_state(&cmd_buffer->device->isl_dev,
5496                        gfx->null_surface_state.map,
5497                        .size = fb_size);
5498 
5499    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5500       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
5501          continue;
5502 
5503       isl_null_fill_state(&cmd_buffer->device->isl_dev,
5504                           gfx->color_att[i].surface_state.state.map,
5505                           .size = fb_size);
5506    }
5507 
5508    /****** We can now start emitting code to begin the render pass ******/
5509 
5510    gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5511 
5512    /* Our implementation of VK_KHR_multiview uses instancing to draw the
5513     * different views.  If the client asks for instancing, we need to use the
5514     * Instance Data Step Rate to ensure that we repeat the client's
5515     * per-instance data once for each view.  Since this bit is in
5516     * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
5517     * of each subpass.
5518     */
5519    if (GFX_VER == 7)
5520       gfx->vb_dirty |= ~0;
5521 
5522    /* It is possible to start a render pass with an old pipeline.  Because
5523     * the render pass and subpass index are both baked into the pipeline,
5524     * this is highly unlikely.  Doing so requires a render pass with a
5525     * single subpass, used twice back-to-back, with the same pipeline
5526     * bound at the start of the second render pass as at the end of
5527     * the first.  In order to avoid unpredictable issues with this
5528     * edge case, we just dirty the pipeline at the start of every
5529     * subpass.
5530     */
5531    gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
5532 
5533    cmd_buffer_emit_depth_stencil(cmd_buffer);
5534 }
5535 
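/* Record the attachment as written so the image's aux-usage tracking stays
 * up to date; with multiview, each view's layer is marked individually.
 */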
5536 static void
5537 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
5538                                    struct anv_attachment *att,
5539                                    VkImageAspectFlagBits aspect)
5540 {
5541    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5542    const struct anv_image_view *iview = att->iview;
5543 
5544    if (iview == NULL)
5545       return;
5546 
5547    if (gfx->view_mask == 0) {
5548       genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5549                                           aspect, att->aux_usage,
5550                                           iview->planes[0].isl.base_level,
5551                                           iview->planes[0].isl.base_array_layer,
5552                                           gfx->layer_count);
5553    } else {
5554       uint32_t res_view_mask = gfx->view_mask;
5555       while (res_view_mask) {
5556          int i = u_bit_scan(&res_view_mask);
5557 
5558          const uint32_t level = iview->planes[0].isl.base_level;
5559          const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
5560 
5561          genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5562                                              aspect, att->aux_usage,
5563                                              level, layer, 1);
5564       }
5565    }
5566 }
5567 
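/* Map a VkResolveModeFlagBits to the equivalent blorp filter, or
 * BLORP_FILTER_NONE if the mode isn't supported.
 */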
5568 static enum blorp_filter
5569 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
5570 {
5571    switch (vk_mode) {
5572    case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
5573       return BLORP_FILTER_SAMPLE_0;
5574    case VK_RESOLVE_MODE_AVERAGE_BIT:
5575       return BLORP_FILTER_AVERAGE;
5576    case VK_RESOLVE_MODE_MIN_BIT:
5577       return BLORP_FILTER_MIN_SAMPLE;
5578    case VK_RESOLVE_MODE_MAX_BIT:
5579       return BLORP_FILTER_MAX_SAMPLE;
5580    default:
5581       return BLORP_FILTER_NONE;
5582    }
5583 }
5584 
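/* Resolve a multisampled attachment into its resolve target using the blorp
 * filter chosen from the resolve mode; with multiview, each view is resolved
 * separately.
 */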
5585 static void
5586 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
5587                                    const struct anv_attachment *att,
5588                                    VkImageLayout layout,
5589                                    VkImageAspectFlagBits aspect)
5590 {
5591    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5592    const struct anv_image_view *src_iview = att->iview;
5593    const struct anv_image_view *dst_iview = att->resolve_iview;
5594 
5595    enum isl_aux_usage src_aux_usage =
5596       anv_layout_to_aux_usage(cmd_buffer->device->info,
5597                               src_iview->image, aspect,
5598                               VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
5599                               layout);
5600 
5601    enum isl_aux_usage dst_aux_usage =
5602       anv_layout_to_aux_usage(cmd_buffer->device->info,
5603                               dst_iview->image, aspect,
5604                               VK_IMAGE_USAGE_TRANSFER_DST_BIT,
5605                               att->resolve_layout);
5606 
5607    enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
5608 
5609    const VkRect2D render_area = gfx->render_area;
5610    if (gfx->view_mask == 0) {
5611       anv_image_msaa_resolve(cmd_buffer,
5612                              src_iview->image, src_aux_usage,
5613                              src_iview->planes[0].isl.base_level,
5614                              src_iview->planes[0].isl.base_array_layer,
5615                              dst_iview->image, dst_aux_usage,
5616                              dst_iview->planes[0].isl.base_level,
5617                              dst_iview->planes[0].isl.base_array_layer,
5618                              aspect,
5619                              render_area.offset.x, render_area.offset.y,
5620                              render_area.offset.x, render_area.offset.y,
5621                              render_area.extent.width,
5622                              render_area.extent.height,
5623                              gfx->layer_count, filter);
5624    } else {
5625       uint32_t res_view_mask = gfx->view_mask;
5626       while (res_view_mask) {
5627          int i = u_bit_scan(&res_view_mask);
5628 
5629          anv_image_msaa_resolve(cmd_buffer,
5630                                 src_iview->image, src_aux_usage,
5631                                 src_iview->planes[0].isl.base_level,
5632                                 src_iview->planes[0].isl.base_array_layer + i,
5633                                 dst_iview->image, dst_aux_usage,
5634                                 dst_iview->planes[0].isl.base_level,
5635                                 dst_iview->planes[0].isl.base_array_layer + i,
5636                                 aspect,
5637                                 render_area.offset.x, render_area.offset.y,
5638                                 render_area.offset.x, render_area.offset.y,
5639                                 render_area.extent.width,
5640                                 render_area.extent.height,
5641                                 1, filter);
5642       }
5643    }
5644 }
5645 
5646 void genX(CmdEndRendering)(
5647     VkCommandBuffer                             commandBuffer)
5648 {
5649    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5650    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5651 
5652    if (anv_batch_has_error(&cmd_buffer->batch))
5653       return;
5654 
5655    const bool is_multiview = gfx->view_mask != 0;
5656    const uint32_t layers =
5657       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5658 
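   /* First, mark all written attachments for aux tracking and figure out
    * whether any color attachment needs an MSAA resolve.
    */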
5659    bool has_color_resolve = false;
5660    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5661       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
5662                                          VK_IMAGE_ASPECT_COLOR_BIT);
5663 
5664       /* Stash this off for later */
5665       if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
5666           !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5667          has_color_resolve = true;
5668    }
5669 
5670    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
5671                                       VK_IMAGE_ASPECT_DEPTH_BIT);
5672 
5673    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
5674                                       VK_IMAGE_ASPECT_STENCIL_BIT);
5675 
5676    if (has_color_resolve) {
5677       /* We are about to do some MSAA resolves.  We need to flush so that the
5678        * result of writes to the MSAA color attachments show up in the sampler
5679        * when we blit to the single-sampled resolve target.
5680        */
5681       anv_add_pending_pipe_bits(cmd_buffer,
5682                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5683                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
5684                                 "MSAA resolve");
5685    }
5686 
5687    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
5688        gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
5689       /* We are about to do some MSAA resolves.  We need to flush so that the
5690        * result of writes to the MSAA depth attachments show up in the sampler
5691        * when we blit to the single-sampled resolve target.
5692        */
5693       anv_add_pending_pipe_bits(cmd_buffer,
5694                               ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5695                               ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5696                               "MSAA resolve");
5697    }
5698 
5699    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5700       const struct anv_attachment *att = &gfx->color_att[i];
5701       if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
5702           (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5703          continue;
5704 
5705       cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
5706                                          VK_IMAGE_ASPECT_COLOR_BIT);
5707    }
5708 
5709    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5710        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5711       const struct anv_image_view *src_iview = gfx->depth_att.iview;
5712 
5713       /* MSAA resolves sample from the source attachment.  Transition the
5714        * depth attachment first to get rid of any HiZ that we may not be
5715        * able to handle.
5716        */
5717       transition_depth_buffer(cmd_buffer, src_iview->image,
5718                               src_iview->planes[0].isl.base_array_layer,
5719                               layers,
5720                               gfx->depth_att.layout,
5721                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5722                               false /* will_full_fast_clear */);
5723 
5724       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
5725                                          VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5726                                          VK_IMAGE_ASPECT_DEPTH_BIT);
5727 
5728       /* Transition the source back to the original layout.  This seems a bit
5729        * inefficient but, since HiZ resolves aren't destructive, going from
5730        * less HiZ to more is generally a no-op.
5731        */
5732       transition_depth_buffer(cmd_buffer, src_iview->image,
5733                               src_iview->planes[0].isl.base_array_layer,
5734                               layers,
5735                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5736                               gfx->depth_att.layout,
5737                               false /* will_full_fast_clear */);
5738    }
5739 
5740    if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5741        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5742       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
5743                                          gfx->stencil_att.layout,
5744                                          VK_IMAGE_ASPECT_STENCIL_BIT);
5745    }
5746 
5747 #if GFX_VER == 7
5748    /* On gfx7, we have to store a texturable version of the stencil buffer in
5749     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
5750     * forth at strategic points. Stencil writes are only allowed in the
5751     * following layouts:
5752     *
5753     *  - VK_IMAGE_LAYOUT_GENERAL
5754     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
5755     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
5756     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
5757     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
5758     *  - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
5759     *  - VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT
5760     *
5761     * For general, we have no nice opportunity to transition so we do the copy
5762     * to the shadow unconditionally at the end of the subpass. For transfer
5763     * destinations, we can update it as part of the transfer op. For the other
5764     * layouts, we delay the copy until a transition into some other layout.
5765     */
5766    if (gfx->stencil_att.iview != NULL) {
5767       const struct anv_image_view *iview = gfx->stencil_att.iview;
5768       const struct anv_image *image = iview->image;
5769       const uint32_t plane =
5770          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5771 
5772       if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
5773           (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL ||
5774            gfx->stencil_att.layout == VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT)) {
5775          anv_image_copy_to_shadow(cmd_buffer, image,
5776                                   VK_IMAGE_ASPECT_STENCIL_BIT,
5777                                   iview->planes[plane].isl.base_level, 1,
5778                                   iview->planes[plane].isl.base_array_layer,
5779                                   layers);
5780       }
5781    }
5782 #endif
5783 
5784    anv_cmd_buffer_reset_rendering(cmd_buffer);
5785 }
5786 
5787 void
5788 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5789 {
5790 #if GFX_VERx10 >= 75
5791    struct mi_builder b;
5792    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5793 
5794    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5795                 mi_reg32(ANV_PREDICATE_RESULT_REG));
5796    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5797 
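   /* MI_PREDICATE: SRC0 holds the precomputed result, SRC1 holds 0.  LOADINV
    * of the SRCS_EQUAL comparison loads the inverse of (SRC0 == SRC1), so the
    * predicate ends up reflecting result != 0.
    */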
5798    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5799       mip.LoadOperation    = LOAD_LOADINV;
5800       mip.CombineOperation = COMBINE_SET;
5801       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5802    }
5803 #endif
5804 }
5805 
5806 #if GFX_VERx10 >= 75
5807 void genX(CmdBeginConditionalRenderingEXT)(
5808    VkCommandBuffer                             commandBuffer,
5809    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
5810 {
5811    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5812    ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5813    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5814    struct anv_address value_address =
5815       anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5816 
5817    const bool isInverted = pConditionalRenderingBegin->flags &
5818                            VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5819 
5820    cmd_state->conditional_render_enabled = true;
5821 
5822    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5823 
5824    struct mi_builder b;
5825    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5826 
5827    /* Section 19.4 of the Vulkan 1.1.85 spec says:
5828     *
5829     *    If the value of the predicate in buffer memory changes
5830     *    while conditional rendering is active, the rendering commands
5831     *    may be discarded in an implementation-dependent way.
5832     *    Some implementations may latch the value of the predicate
5833     *    upon beginning conditional rendering while others
5834     *    may read it before every rendering command.
5835     *
5836     * So it's perfectly fine to read a value from the buffer once.
5837     */
5838    struct mi_value value = mi_mem32(value_address);
5839 
5840    /* Precompute the predicate result; this is necessary to support
5841     * secondary command buffers, since it is unknown whether conditional
5842     * rendering is inverted when they are recorded.
5843     */
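   /* With unsigned comparisons against zero, (0 < value) is value != 0 and
    * (0 >= value) is value == 0, which yields the inverted behaviour.
    */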
5844    mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5845                 isInverted ? mi_uge(&b, mi_imm(0), value) :
5846                              mi_ult(&b, mi_imm(0), value));
5847 }
5848 
5849 void genX(CmdEndConditionalRenderingEXT)(
5850     VkCommandBuffer                             commandBuffer)
5851 {
5852    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5853    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5854 
5855    cmd_state->conditional_render_enabled = false;
5856 }
5857 #endif
5858 
5859 /* Set of stage bits that are pipelined, i.e. they get queued by the
5860  * command streamer for later execution.
5861  */
5862 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5863    ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5864      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5865      VK_PIPELINE_STAGE_2_HOST_BIT | \
5866      VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
5867 
5868 void genX(CmdSetEvent2)(
5869     VkCommandBuffer                             commandBuffer,
5870     VkEvent                                     _event,
5871     const VkDependencyInfo*                     pDependencyInfo)
5872 {
5873    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5874    ANV_FROM_HANDLE(anv_event, event, _event);
5875 
5876    VkPipelineStageFlags2 src_stages = 0;
5877 
5878    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5879       src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5880    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5881       src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5882    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5883       src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5884 
5885    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5886    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5887 
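   /* Write VK_EVENT_SET into the event's status dword with a post-sync
    * PIPE_CONTROL, stalling first only if pipelined stages are involved.
    */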
5888    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5889       if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5890          pc.StallAtPixelScoreboard = true;
5891          pc.CommandStreamerStallEnable = true;
5892       }
5893 
5894       pc.DestinationAddressType  = DAT_PPGTT;
5895       pc.PostSyncOperation       = WriteImmediateData;
5896       pc.Address = (struct anv_address) {
5897          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5898          event->state.offset
5899       };
5900       pc.ImmediateData           = VK_EVENT_SET;
5901       anv_debug_dump_pc(pc);
5902    }
5903 }
5904 
5905 void genX(CmdResetEvent2)(
5906     VkCommandBuffer                             commandBuffer,
5907     VkEvent                                     _event,
5908     VkPipelineStageFlags2                       stageMask)
5909 {
5910    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5911    ANV_FROM_HANDLE(anv_event, event, _event);
5912 
5913    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5914    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5915 
5916    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5917       if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5918          pc.StallAtPixelScoreboard = true;
5919          pc.CommandStreamerStallEnable = true;
5920       }
5921 
5922       pc.DestinationAddressType  = DAT_PPGTT;
5923       pc.PostSyncOperation       = WriteImmediateData;
5924       pc.Address = (struct anv_address) {
5925          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5926          event->state.offset
5927       };
5928       pc.ImmediateData           = VK_EVENT_RESET;
5929       anv_debug_dump_pc(pc);
5930    }
5931 }
5932 
5933 void genX(CmdWaitEvents2)(
5934     VkCommandBuffer                             commandBuffer,
5935     uint32_t                                    eventCount,
5936     const VkEvent*                              pEvents,
5937     const VkDependencyInfo*                     pDependencyInfos)
5938 {
5939    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5940 
5941 #if GFX_VER >= 8
5942    for (uint32_t i = 0; i < eventCount; i++) {
5943       ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
5944 
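      /* Poll the event's status dword until it reads back VK_EVENT_SET. */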
5945       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5946          sem.WaitMode            = PollingMode;
5947          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
5948          sem.SemaphoreDataDword  = VK_EVENT_SET;
5949          sem.SemaphoreAddress = (struct anv_address) {
5950             cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5951             event->state.offset
5952          };
5953       }
5954    }
5955 #else
5956    anv_finishme("Implement events on gfx7");
5957 #endif
5958 
5959    cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
5960 }
5961 
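/* Translate a VkIndexType into the hardware index format encoding
 * (INDEX_BYTE / INDEX_WORD / INDEX_DWORD) used by 3DSTATE_INDEX_BUFFER.
 */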
5962 static uint32_t vk_to_intel_index_type(VkIndexType type)
5963 {
5964    switch (type) {
5965    case VK_INDEX_TYPE_UINT8_EXT:
5966       return INDEX_BYTE;
5967    case VK_INDEX_TYPE_UINT16:
5968       return INDEX_WORD;
5969    case VK_INDEX_TYPE_UINT32:
5970       return INDEX_DWORD;
5971    default:
5972       unreachable("invalid index type");
5973    }
5974 }
5975 
5976 void genX(CmdBindIndexBuffer)(
5977     VkCommandBuffer                             commandBuffer,
5978     VkBuffer                                    _buffer,
5979     VkDeviceSize                                offset,
5980     VkIndexType                                 indexType)
5981 {
5982    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5983    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5984 
5985    cmd_buffer->state.gfx.restart_index = vk_index_to_restart(indexType);
5986    cmd_buffer->state.gfx.index_buffer = buffer;
5987    cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
5988    cmd_buffer->state.gfx.index_offset = offset;
5989 
5990    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
5991 }
5992 
5993 VkResult genX(CmdSetPerformanceOverrideINTEL)(
5994     VkCommandBuffer                             commandBuffer,
5995     const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
5996 {
5997    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5998 
5999    switch (pOverrideInfo->type) {
6000    case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
6001       anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
6002          instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
6003          instpm.MediaInstructionDisable = pOverrideInfo->enable;
6004          instpm._3DRenderingInstructionDisableMask = true;
6005          instpm.MediaInstructionDisableMask = true;
6006       }
6007       break;
6008    }
6009 
6010    case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
6011       if (pOverrideInfo->enable) {
6012          /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
6013          anv_add_pending_pipe_bits(cmd_buffer,
6014                                    ANV_PIPE_FLUSH_BITS |
6015                                    ANV_PIPE_INVALIDATE_BITS,
6016                                    "perf counter isolation");
6017          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6018       }
6019       break;
6020 
6021    default:
6022       unreachable("Invalid override");
6023    }
6024 
6025    return VK_SUCCESS;
6026 }
6027 
6028 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
6029     VkCommandBuffer                             commandBuffer,
6030     const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
6031 {
6032    /* TODO: Waiting on the register to write, might depend on generation. */
6033 
6034    return VK_SUCCESS;
6035 }
6036 
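/* MMIO offset of the render command streamer's TIMESTAMP register. */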
6037 #define TIMESTAMP 0x2358
6038 
6039 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
6040                               struct anv_device *device,
6041                               struct anv_address addr,
6042                               enum anv_timestamp_capture_type type) {
6043    switch (type) {
6044    case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
6045       struct mi_builder b;
6046       mi_builder_init(&b, device->info, batch);
6047       mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
6048       break;
6049    }
6050 
6051    case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
6052       anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
6053          pc.PostSyncOperation   = WriteTimestamp;
6054          pc.Address             = addr;
6055          anv_debug_dump_pc(pc);
6056       }
6057       break;
6058 
6059    case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
6060       anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
6061          pc.CommandStreamerStallEnable = true;
6062          pc.PostSyncOperation    = WriteTimestamp;
6063          pc.Address              = addr;
6064          anv_debug_dump_pc(pc);
6065       }
6066       break;
6067 
6068    default:
6069       unreachable("invalid");
6070    }
6071 }
6072 
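/* Copy size_B bytes from src_addr to dst_addr on the command streamer; a thin
 * wrapper around the MI builder's mi_memcpy() used to snapshot data for
 * debug/trace captures.
 */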
6073 void genX(cmd_capture_data)(struct anv_batch *batch,
6074                             struct anv_device *device,
6075                             struct anv_address dst_addr,
6076                             struct anv_address src_addr,
6077                             uint32_t size_B)
6078 {
6079    struct mi_builder b;
6080    mi_builder_init(&b, device->info, batch);
6081    mi_memcpy(&b, dst_addr, src_addr, size_B);
6082 }
6083