/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"
#include "vk_format.h"
#include "vk_render_pass.h"
#include "vk_util.h"
#include "util/fast_idiv_by_const.h"

#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_guardband.h"
#include "compiler/elk/elk_prim.h"

#include "nir/nir_xfb_info.h"

#include "ds/intel_tracepoints.h"

/* We reserve:
 *    - GPR 14 for secondary command buffer returns
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"

static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t pipeline);

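/* Translate the flush/invalidate bits set in a packed PIPE_CONTROL back into
 * the driver's anv_pipe_bits flags; used by anv_debug_dump_pc() below for
 * debug output.
 */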
static enum anv_pipe_bits
convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
   enum anv_pipe_bits bits = 0;
   bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
   bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
   bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
   bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
   bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
   bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
   return bits;
}

#define anv_debug_dump_pc(pc) \
   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
      fputs("pc: emit PC=( ", stderr); \
      anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
      fprintf(stderr, ") reason: %s\n", __func__); \
   }

static bool
is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_queue_family *queue_family = cmd_buffer->queue_family;
   return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
}

void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);

   /* If we are emitting a new state base address we probably need to re-emit
    * binding tables.
    */
   cmd_buffer->state.descriptors_dirty |= ~0;

   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM. However, it seems to be
    * necessary prior to changing the surface state base address. Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DCFlushEnable = true;
      pc.RenderTargetCacheFlushEnable = true;
      pc.CommandStreamerStallEnable = true;
      anv_debug_dump_pc(pc);
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;

#if (GFX_VER >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields. However, since we will be growing the BOs live, we
       * just set them all to the maximum.
       */
      sba.GeneralStateBufferSize = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      if (anv_use_relocations(device->physical)) {
         sba.DynamicStateBufferSize = 0xfffff;
         sba.InstructionBufferSize = 0xfffff;
      } else {
         /* With softpin, we use fixed addresses so we actually know how big
          * our base addresses are.
          */
         sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
         sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
      }
      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
#else
      /* On gfx7, we have upper bounds instead. According to the docs,
       * setting an upper bound of zero means that no bounds checking is
       * performed so, in theory, we should be able to leave them zero.
       * However, border color is broken and the GPU bounds-checks anyway.
       * To avoid this and other potential problems, we may as well set it
       * for everything.
       */
      sba.GeneralStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.DynamicStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
      sba.InstructionAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.InstructionAccessUpperBoundModifyEnable = true;
#endif
   }

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables. Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient. The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache. However, we have
    * yet to be able to actually confirm this.
    *
    * Wa_14013910100:
    *
    *    "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command
    *     twice or program pipe control with Instruction cache invalidate
    *     post STATE_BASE_ADDRESS command"
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
      anv_debug_dump_pc(pc);
   }
}

static void
add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_state state, struct anv_address addr)
{
   VkResult result;

   if (anv_use_relocations(cmd_buffer->device->physical)) {
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
                                  &cmd_buffer->vk.pool->alloc,
                                  state.offset + isl_dev->ss.addr_offset,
                                  addr.bo, addr.offset, NULL);
   } else {
      result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
                                     &cmd_buffer->vk.pool->alloc,
                                     addr.bo);
   }

   if (unlikely(result != VK_SUCCESS))
      anv_batch_set_error(&cmd_buffer->batch, result);
}

static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
                         struct anv_surface_state state)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   assert(!anv_address_is_null(state.address));
   add_surface_reloc(cmd_buffer, state.state, state.address);

   if (!anv_address_is_null(state.aux_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->vk.pool->alloc,
                            state.state.offset + isl_dev->ss.aux_addr_offset,
                            state.aux_address.bo,
                            state.aux_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }

   if (!anv_address_is_null(state.clear_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->vk.pool->alloc,
                            state.state.offset +
                            isl_dev->ss.clear_color_state_offset,
                            state.clear_address.bo,
                            state.clear_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }
}

static bool
isl_color_value_requires_conversion(union isl_color_value color,
                                    const struct isl_surf *surf,
                                    const struct isl_view *view)
{
   if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
      return false;

   uint32_t surf_pack[4] = { 0, 0, 0, 0 };
   isl_color_value_pack(&color, surf->format, surf_pack);

   uint32_t view_pack[4] = { 0, 0, 0, 0 };
   union isl_color_value swiz_color =
      isl_color_value_swizzle_inv(color, view->swizzle);
   isl_color_value_pack(&swiz_color, view->format, view_pack);

   return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
}

static bool
anv_can_fast_clear_color_view(struct anv_device *device,
                              struct anv_image_view *iview,
                              VkImageLayout layout,
                              union isl_color_value clear_color,
                              uint32_t num_layers,
                              VkRect2D render_area)
{
   if (iview->planes[0].isl.base_array_layer >=
       anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
                            iview->planes[0].isl.base_level))
      return false;

   /* Start by getting the fast clear type. We use the first subpass
    * layout here because we don't want to fast-clear if the first subpass
    * to use the attachment can't handle fast-clears.
    */
   enum anv_fast_clear_type fast_clear_type =
      anv_layout_to_fast_clear_type(device->info, iview->image,
                                    VK_IMAGE_ASPECT_COLOR_BIT,
                                    layout);
   switch (fast_clear_type) {
   case ANV_FAST_CLEAR_NONE:
      return false;
   case ANV_FAST_CLEAR_DEFAULT_VALUE:
      if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
         return false;
      break;
   case ANV_FAST_CLEAR_ANY:
      break;
   }

   /* Potentially, we could do partial fast-clears but doing so has crazy
    * alignment restrictions. It's easier to just restrict to full size
    * fast clears for now.
    */
   if (render_area.offset.x != 0 ||
       render_area.offset.y != 0 ||
       render_area.extent.width != iview->vk.extent.width ||
       render_area.extent.height != iview->vk.extent.height)
      return false;

   /* On Broadwell and earlier, we can only handle 0/1 clear colors */
   if (!isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
      return false;

   /* If the clear color is one that would require non-trivial format
    * conversion on resolve, we don't bother with the fast clear. This
    * shouldn't be common as most clear colors are 0/1 and the most common
    * format re-interpretation is for sRGB.
    */
   if (isl_color_value_requires_conversion(clear_color,
                                           &iview->image->planes[0].primary_surface.isl,
                                           &iview->planes[0].isl)) {
      anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
                    "Cannot fast-clear to colors which would require "
                    "format conversion on resolve");
      return false;
   }

   /* We only allow fast clears to the first slice of an image (level 0,
    * layer 0) and only for the entire slice. This guarantees us that, at
    * any given time, there is only one clear color on any given image.
    * At the time of our testing (Jan 17, 2018), there were no known
    * applications which would benefit from fast-clearing more than just
    * the first slice.
    */
   if (iview->planes[0].isl.base_level > 0 ||
       iview->planes[0].isl.base_array_layer > 0) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering with multi-lod or multi-layer framebuffer "
                    "with LOAD_OP_LOAD and baseMipLevel > 0 or "
                    "baseArrayLayer > 0. Not fast clearing.");
      return false;
   }

   if (num_layers > 1) {
      anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
                    "Rendering to a multi-layer framebuffer with "
                    "LOAD_OP_CLEAR. Only fast-clearing the first slice");
   }

   return true;
}

static bool
anv_can_hiz_clear_ds_view(struct anv_device *device,
                          const struct anv_image_view *iview,
                          VkImageLayout layout,
                          VkImageAspectFlags clear_aspects,
                          float depth_clear_value,
                          VkRect2D render_area)
{
   /* We don't do any HiZ or depth fast-clears on gfx7 yet */
   if (GFX_VER == 7)
      return false;

   /* If we're just clearing stencil, we can always HiZ clear */
   if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return true;

   /* We must have depth in order to have HiZ */
   if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return false;

   const enum isl_aux_usage clear_aux_usage =
      anv_layout_to_aux_usage(device->info, iview->image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                              layout);
   if (!isl_aux_usage_has_fast_clears(clear_aux_usage))
      return false;

   assert(GFX_VER == 8);
   assert(iview->vk.format != VK_FORMAT_D16_UNORM_S8_UINT);
   if (iview->vk.format == VK_FORMAT_D16_UNORM) {
      /* From the BDW PRM, Vol 7, "Depth Buffer Clear":
       *
       *    The following restrictions apply only if the depth buffer surface
       *    type is D16_UNORM and software does not use the “full surf clear”:
       *
       *    If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
       *    aligned to an 8x4 pixel block relative to the upper left corner of
       *    the depth buffer, and contain an integer number of these pixel
       *    blocks, and all 8x4 pixels must be lit.
       *
       * Simply disable partial clears for D16 on BDW.
       */
      if (render_area.offset.x > 0 ||
          render_area.offset.y > 0 ||
          render_area.extent.width !=
          u_minify(iview->vk.extent.width, iview->vk.base_mip_level) ||
          render_area.extent.height !=
          u_minify(iview->vk.extent.height, iview->vk.base_mip_level)) {
         return false;
      }
   }

   if (depth_clear_value != ANV_HZ_FC_VAL)
      return false;

   /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
    * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
    * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
    */
   if (GFX_VER == 8 && anv_can_sample_with_hiz(device->info, iview->image))
      return false;

   /* If we got here, then we can fast clear */
   return true;
}

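/* Read x through a volatile pointer so the compiler re-reads it from memory
 * on every use instead of caching the value in a register.
 */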
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
 * the initial layout is undefined, the HiZ buffer and depth buffer will
 * represent the same data at the end of this operation.
 */
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        bool will_full_fast_clear)
{
   const uint32_t depth_plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
   if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

   /* If will_full_fast_clear is set, the caller promises to fast-clear the
    * largest portion of the specified range as it can. For depth images,
    * that means the entire image because we don't support multi-LOD HiZ.
    */
   assert(image->planes[0].primary_surface.isl.levels == 1);
   if (will_full_fast_clear)
      return;

   const enum isl_aux_state initial_state =
      anv_layout_to_aux_state(cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              initial_layout);
   const enum isl_aux_state final_state =
      anv_layout_to_aux_state(cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              final_layout);

   const bool initial_depth_valid =
      isl_aux_state_has_valid_primary(initial_state);
   const bool initial_hiz_valid =
      isl_aux_state_has_valid_aux(initial_state);
   const bool final_needs_depth =
      isl_aux_state_has_valid_primary(final_state);
   const bool final_needs_hiz =
      isl_aux_state_has_valid_aux(final_state);

   /* Getting into the pass-through state for Depth is tricky and involves
    * both a resolve and an ambiguate. We don't handle that state right now
    * as anv_layout_to_aux_state never returns it.
    */
   assert(final_state != ISL_AUX_STATE_PASS_THROUGH);

   if (final_needs_depth && !initial_depth_valid) {
      assert(initial_hiz_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
   } else if (final_needs_hiz && !initial_hiz_valid) {
      assert(initial_depth_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
   }
}

#if GFX_VER == 7
static inline bool
vk_image_layout_stencil_write_optimal(VkImageLayout layout)
{
   return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL;
}
#endif

/* Transitions a stencil buffer from one layout to another. On gfx7, this may
 * involve copying the stencil data into its shadow surface so that it remains
 * texturable after the transition.
 */
static void
transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_image *image,
                          uint32_t base_level, uint32_t level_count,
                          uint32_t base_layer, uint32_t layer_count,
                          VkImageLayout initial_layout,
                          VkImageLayout final_layout,
                          bool will_full_fast_clear)
{
#if GFX_VER == 7
   const uint32_t plane =
      anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);

   /* On gfx7, we have to store a texturable version of the stencil buffer in
    * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
    * forth at strategic points. Stencil writes are only allowed in the
    * following layouts:
    *
    *  - VK_IMAGE_LAYOUT_GENERAL
    *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
    *
    * For general, we have no nice opportunity to transition so we do the copy
    * to the shadow unconditionally at the end of the subpass. For transfer
    * destinations, we can update it as part of the transfer op. For the other
    * layouts, we delay the copy until a transition into some other layout.
    */
   if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
       vk_image_layout_stencil_write_optimal(initial_layout) &&
       !vk_image_layout_stencil_write_optimal(final_layout)) {
      anv_image_copy_to_shadow(cmd_buffer, image,
                               VK_IMAGE_ASPECT_STENCIL_BIT,
                               base_level, level_count,
                               base_layer, layer_count);
   }
#endif
}

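/* Command streamer MMIO offsets of the MI_PREDICATE source and result
 * registers used by the resolve predication helpers below.
 */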
#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

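/* Record the image's fast-clear type (none/default/any) in its
 * driver-private metadata with an MI_STORE_DATA_IMM from the command
 * streamer.
 */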
static void
set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
                           const struct anv_image *image,
                           VkImageAspectFlagBits aspect,
                           enum anv_fast_clear_type fast_clear)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                       image, aspect);
      sdi.ImmediateData = fast_clear;
   }
}

/* This is only really practical on haswell and above because it requires
 * MI math in order to get it correct.
 */
#if GFX_VERx10 >= 75
static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                  const struct anv_image *image,
                                  VkImageAspectFlagBits aspect,
                                  uint32_t level, uint32_t array_layer,
                                  enum isl_aux_op resolve_op,
                                  enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   const struct mi_value fast_clear_type =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                  image, aspect));

   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
   if (level == 0 && array_layer == 0) {
      /* In this case, we are doing a partial resolve to get rid of fast-clear
       * colors. We don't care about the compression state but we do care
       * about how much fast clear is allowed by the final layout.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);

      /* We need to compute (fast_clear_supported < image->fast_clear) */
      struct mi_value pred =
         mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
      mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));

      /* If the predicate is true, we want to write 0 to the fast clear type
       * and, if it's false, leave it alone. We can do this by writing
       *
       *    clear_type = clear_type & ~predicate;
       */
      struct mi_value new_fast_clear_type =
         mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
      mi_store(&b, fast_clear_type, new_fast_clear_type);
   } else {
      /* In this case, we're trying to do a partial resolve on a slice that
       * doesn't have clear color. There's nothing to do.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      return;
   }

   /* Set src1 to 0 and use a != condition */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GFX_VERx10 >= 75 */

static void
anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                 const struct anv_image *image,
                                 VkImageAspectFlagBits aspect,
                                 uint32_t level, uint32_t array_layer,
                                 enum isl_aux_op resolve_op,
                                 enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value fast_clear_type_mem =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                  image, aspect));

   /* This only works for partial resolves and only when the clear color is
    * all or nothing. On the upside, this emits less command streamer code
    * and works on Ivybridge and Bay Trail.
    */
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
   assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);

   /* We don't support fast clears on anything other than the first slice. */
   if (level > 0 || array_layer > 0)
      return;

   /* On gfx8, we don't have a concept of default clear colors because we
    * can't sample from CCS surfaces. It's enough to just load the fast clear
    * state into the predicate register.
    */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
   mi_store(&b, fast_clear_type_mem, mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}

static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t level, uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   anv_cmd_simple_resolve_predicate(cmd_buffer, image,
                                    aspect, level, array_layer,
                                    resolve_op, fast_clear_supported);

   /* CCS_D only supports full resolves and BLORP will assert on us if we try
    * to do a partial resolve on a CCS_D surface.
    */
   if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
       image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
                    level, array_layer, 1, resolve_op, NULL, true);
}

static void
anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);

#if GFX_VERx10 >= 75
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, 0, array_layer,
                                     resolve_op, fast_clear_supported);

   anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
                    array_layer, 1, resolve_op, NULL, true);
#else
   unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
#endif
}

void
genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
                                    const struct anv_image *image,
                                    VkImageAspectFlagBits aspect,
                                    enum isl_aux_usage aux_usage,
                                    uint32_t level,
                                    uint32_t base_layer,
                                    uint32_t layer_count)
{
   /* The aspect must be exactly one of the image aspects. */
   assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
}

static void
set_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      const VkImageAspectFlags aspect,
                      const union isl_color_value clear_color)
{
   uint32_t plane = anv_image_aspect_to_plane(image, aspect);
   enum isl_format format = image->planes[plane].primary_surface.isl.format;

   struct anv_address addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
   assert(!anv_address_is_null(addr));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = addr;
      if (GFX_VERx10 >= 75) {
         /* On HSW+, the RENDER_SURFACE_STATE dword containing the clear
          * values also contains other fields. The dword constructed here
          * will later be copied onto a surface state as-is. So, initialize
          * those fields to match the values that we typically expect in a
          * surface.
          *
          * XXX: Handle other values for ShaderChannelSelect and
          * ResourceMinLOD.
          */
         sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
                             ISL_CHANNEL_SELECT_GREEN << 22 |
                             ISL_CHANNEL_SELECT_BLUE  << 19 |
                             ISL_CHANNEL_SELECT_ALPHA << 16;
      }
      if (isl_format_has_int_channel(format)) {
         for (unsigned i = 0; i < 4; i++) {
            assert(clear_color.u32[i] == 0 ||
                   clear_color.u32[i] == 1);
         }
         sdi.ImmediateData |= (clear_color.u32[0] != 0) << 31;
         sdi.ImmediateData |= (clear_color.u32[1] != 0) << 30;
         sdi.ImmediateData |= (clear_color.u32[2] != 0) << 29;
         sdi.ImmediateData |= (clear_color.u32[3] != 0) << 28;
      } else {
         for (unsigned i = 0; i < 4; i++) {
            assert(clear_color.f32[i] == 0.0f ||
                   clear_color.f32[i] == 1.0f);
         }
         sdi.ImmediateData |= (clear_color.f32[0] != 0.0f) << 31;
         sdi.ImmediateData |= (clear_color.f32[1] != 0.0f) << 30;
         sdi.ImmediateData |= (clear_color.f32[2] != 0.0f) << 29;
         sdi.ImmediateData |= (clear_color.f32[3] != 0.0f) << 28;
      }
   }
}

/* Copy the fast-clear value dword(s) between a surface state object and an
 * image's fast clear state buffer.
 */
static void
genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_state surface_state,
                             const struct anv_image *image,
                             VkImageAspectFlagBits aspect,
                             bool copy_from_surface_state)
{
   assert(cmd_buffer && image);
   assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   struct anv_address ss_clear_addr = {
      .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
      .offset = surface_state.offset +
                cmd_buffer->device->isl_dev.ss.clear_value_offset,
   };
   const struct anv_address entry_addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
   unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;

#if GFX_VER == 7
   /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
    * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
    * in-flight when they are issued even if the memory touched is not
    * currently active for rendering. The weird bit is that it is not the
    * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
    * rendering hangs such that the next stalling command after the
    * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
    *
    * It is unclear exactly why this hang occurs. Both MI commands come with
    * warnings about the 3D pipeline but that doesn't seem to fully explain
    * it. My (Faith's) best theory is that it has something to do with the
    * fact that we're using a GPU state register as our temporary and that
    * something with reading/writing it is causing problems.
    *
    * In order to work around this issue, we emit a PIPE_CONTROL with the
    * command streamer stall bit set.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "after copy_fast_clear_dwords. Avoid potential hang");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   if (copy_from_surface_state) {
      mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
   } else {
      mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);

      /* Updating a surface state object may require that the state cache be
       * invalidated. From the SKL PRM, Shared Functions -> State -> State
       * Caching:
       *
       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
       *    modified [...], the L1 state cache must be invalidated to ensure
       *    the new surface or sampler state is fetched from system memory.
       *
       * In testing, SKL doesn't actually seem to need this, but HSW does.
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
                                "after copy_fast_clear_dwords surface state update");
   }
}

/**
 * @brief Transitions a color buffer from one layout to another.
 *
 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
 * more information.
 *
 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
 *                    this represents the maximum layers to transition at each
 *                    specified miplevel.
 */
static void
transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        VkImageAspectFlagBits aspect,
                        const uint32_t base_level, uint32_t level_count,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        uint32_t src_queue_family,
                        uint32_t dst_queue_family,
                        bool will_full_fast_clear)
{
   struct anv_device *device = cmd_buffer->device;
   const struct intel_device_info *devinfo = device->info;
   /* Validate the inputs. */
   assert(cmd_buffer);
   assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
   /* These values aren't supported for simplicity's sake. */
   assert(level_count != VK_REMAINING_MIP_LEVELS &&
          layer_count != VK_REMAINING_ARRAY_LAYERS);
   /* Ensure the subresource range is valid. */
   UNUSED uint64_t last_level_num = base_level + level_count;
   const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
   UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
   assert((uint64_t)base_layer + layer_count <= image_layers);
   assert(last_level_num <= image->vk.mip_levels);
   /* If there is a layout transition, the final layout cannot be undefined or
    * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
    */
   assert(initial_layout == final_layout ||
          (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
           final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
   const struct isl_drm_modifier_info *isl_mod_info =
      image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
      ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
      : NULL;

   const bool src_queue_external =
      src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
      src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;

   const bool dst_queue_external =
      dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
      dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;

   /* Simultaneous acquire and release on external queues is illegal. */
   assert(!src_queue_external || !dst_queue_external);

   /* Ownership transition on an external queue requires special action if the
    * image has a DRM format modifier because we store image data in
    * a driver-private bo which is inaccessible to the external queue.
    */
   const bool private_binding_acquire =
      src_queue_external &&
      anv_image_is_externally_shared(image) &&
      anv_image_has_private_binding(image);

   const bool private_binding_release =
      dst_queue_external &&
      anv_image_is_externally_shared(image) &&
      anv_image_has_private_binding(image);

   if (initial_layout == final_layout &&
       !private_binding_acquire && !private_binding_release) {
      /* No work is needed. */
      return;
   }

   const uint32_t plane = anv_image_aspect_to_plane(image, aspect);

   if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
       final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
      /* This surface is a linear compressed image with a tiled shadow surface
       * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so
       * we need to ensure the shadow copy is up-to-date.
       */
      assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
      assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
      assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
      assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
      assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
      assert(plane == 0);
      anv_image_copy_to_shadow(cmd_buffer, image,
                               VK_IMAGE_ASPECT_COLOR_BIT,
                               base_level, level_count,
                               base_layer, layer_count);
   }

   if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
      return;

   assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);

   /* The following layouts are equivalent for non-linear images. */
   const bool initial_layout_undefined =
      initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
      initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;

   bool must_init_fast_clear_state = false;
   bool must_init_aux_surface = false;

   if (initial_layout_undefined) {
      /* The subresource may have been aliased and populated with arbitrary
       * data.
       */
      must_init_fast_clear_state = true;
      must_init_aux_surface = true;
   } else if (private_binding_acquire) {
      /* The fast clear state lives in a driver-private bo, and therefore the
       * external/foreign queue is unaware of it.
       *
       * If this is the first time we are accessing the image, then the fast
       * clear state is uninitialized.
       *
       * If this is NOT the first time we are accessing the image, then the
       * fast clear state may still be valid and correct due to the resolve
       * during our most recent ownership release. However, we do not track
       * the aux state with MI stores, and therefore must assume the
       * worst-case: that this is the first time we are accessing the image.
       */
      assert(image->planes[plane].fast_clear_memory_range.binding ==
             ANV_IMAGE_MEMORY_BINDING_PRIVATE);
      must_init_fast_clear_state = true;

      /* The aux surface, like the fast clear state, lives in
       * a driver-private bo. We must initialize the aux surface for the
       * same reasons we must initialize the fast clear state.
       */
      assert(image->planes[plane].aux_surface.memory_range.binding ==
             ANV_IMAGE_MEMORY_BINDING_PRIVATE);
      must_init_aux_surface = true;
   }

   if (must_init_fast_clear_state) {
      if (base_level == 0 && base_layer == 0) {
         const union isl_color_value zero_color = {};
         set_image_clear_color(cmd_buffer, image, aspect, zero_color);
         set_image_fast_clear_state(cmd_buffer, image, aspect,
                                    ANV_FAST_CLEAR_NONE);
      }
   }

   if (must_init_aux_surface) {
      assert(must_init_fast_clear_state);

      /* Initialize the aux buffers to enable correct rendering. In order to
       * ensure that things such as storage images work correctly, aux buffers
       * need to be initialized to valid data.
       *
       * Having an aux buffer with invalid data is a problem for two reasons:
       *
       *  1) Having an invalid value in the buffer can confuse the hardware.
       *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
       *     invalid and leads to the hardware doing strange things. It
       *     doesn't hang as far as we can tell but rendering corruption can
       *     occur.
       *
       *  2) If this transition is into the GENERAL layout and we then use the
       *     image as a storage image, then we must have the aux buffer in the
       *     pass-through state so that, if we then go to texture from the
       *     image, we get the results of our storage image writes and not the
       *     fast clear color or other random data.
       *
       * For CCS both of the problems above are real demonstrable issues. In
       * that case, the only thing we can do is to perform an ambiguate to
       * transition the aux surface into the pass-through state.
       *
       * For MCS, (2) is never an issue because we don't support multisampled
       * storage images. In theory, issue (1) is a problem with MCS but we've
       * never seen it in the wild. For 4x and 16x, all bit patterns could, in
       * theory, be interpreted as something but we don't know that all bit
       * patterns are actually valid. For 2x and 8x, you could easily end up
       * with the MCS referring to an invalid plane because not all bits of
       * the MCS value are actually used. Even though we've never seen issues
       * in the wild, it's best to play it safe and initialize the MCS. We
       * can use a fast-clear for MCS because we only ever touch from render
       * and texture (no image load store).
       */
      if (image->vk.samples == 1) {
         for (uint32_t l = 0; l < level_count; l++) {
            const uint32_t level = base_level + l;

            uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
            if (base_layer >= aux_layers)
               break; /* We will only get fewer layers as level increases */
            uint32_t level_layer_count =
               MIN2(layer_count, aux_layers - base_layer);

            /* If will_full_fast_clear is set, the caller promises to
             * fast-clear the largest portion of the specified range as it
             * can. For color images, that means only the first LOD and array
             * slice.
             */
            if (level == 0 && base_layer == 0 && will_full_fast_clear) {
               base_layer++;
               level_layer_count--;
               if (level_layer_count == 0)
                  continue;
            }

            anv_image_ccs_op(cmd_buffer, image,
                             image->planes[plane].primary_surface.isl.format,
                             ISL_SWIZZLE_IDENTITY,
                             aspect, level, base_layer, level_layer_count,
                             ISL_AUX_OP_AMBIGUATE, NULL, false);
         }
      } else {
         if (image->vk.samples == 4 || image->vk.samples == 16) {
            anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
                          "Doing a potentially unnecessary fast-clear to "
                          "define an MCS buffer.");
         }

         /* If will_full_fast_clear is set, the caller promises to fast-clear
          * the largest portion of the specified range as it can.
          */
         if (will_full_fast_clear)
            return;

         assert(base_level == 0 && level_count == 1);
         anv_image_mcs_op(cmd_buffer, image,
                          image->planes[plane].primary_surface.isl.format,
                          ISL_SWIZZLE_IDENTITY,
                          aspect, base_layer, layer_count,
                          ISL_AUX_OP_FAST_CLEAR, NULL, false);
      }
      return;
   }

   enum isl_aux_usage initial_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
   enum isl_aux_usage final_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
   enum anv_fast_clear_type initial_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
   enum anv_fast_clear_type final_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);

   /* We must override the anv_layout_to_* functions because they are unaware
    * of acquire/release direction.
    */
   if (private_binding_acquire) {
      assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
      initial_aux_usage = ISL_AUX_USAGE_NONE;
      initial_fast_clear = ANV_FAST_CLEAR_NONE;
   } else if (private_binding_release) {
      assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
      final_aux_usage = ISL_AUX_USAGE_NONE;
      final_fast_clear = ANV_FAST_CLEAR_NONE;
   }

   /* The current code assumes that there is no mixing of CCS_E and CCS_D.
    * We can handle transitions between CCS_D/E to and from NONE. What we
    * don't yet handle is switching between CCS_E and CCS_D within a given
    * image. Doing so in a performant way requires more detailed aux state
    * tracking such as what is done in i965. For now, just assume that we
    * only have one type of compression.
    */
   assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
          final_aux_usage == ISL_AUX_USAGE_NONE ||
          initial_aux_usage == final_aux_usage);

   /* If initial aux usage is NONE, there is nothing to resolve */
   if (initial_aux_usage == ISL_AUX_USAGE_NONE)
      return;

   enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;

   /* If the initial layout supports more fast clear than the final layout
    * then we need at least a partial resolve.
    */
   if (final_fast_clear < initial_fast_clear)
      resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;

   if (resolve_op == ISL_AUX_OP_NONE)
      return;

   /* Perform a resolve to synchronize data between the main and aux buffer.
    * Before we begin, we must satisfy the cache flushing requirement specified
    * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
    *
    *    Any transition from any value in {Clear, Render, Resolve} to a
    *    different value in {Clear, Render, Resolve} requires end of pipe
    *    synchronization.
    *
    * We perform a flush of the write cache before and after the clear and
    * resolve operations to meet this requirement.
    *
    * Unlike other drawing, fast clear operations are not properly
    * synchronized. The first PIPE_CONTROL here likely ensures that the
    * contents of the previous render or clear hit the render target before we
    * resolve and the second likely ensures that the resolve is complete before
    * we do any more rendering or clearing.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after transition RT");

   for (uint32_t l = 0; l < level_count; l++) {
      uint32_t level = base_level + l;

      uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
      if (base_layer >= aux_layers)
         break; /* We will only get fewer layers as level increases */
      uint32_t level_layer_count =
         MIN2(layer_count, aux_layers - base_layer);

      for (uint32_t a = 0; a < level_layer_count; a++) {
         uint32_t array_layer = base_layer + a;

         /* If will_full_fast_clear is set, the caller promises to fast-clear
          * the largest portion of the specified range as it can. For color
          * images, that means only the first LOD and array slice.
          */
         if (level == 0 && array_layer == 0 && will_full_fast_clear)
            continue;

         if (image->vk.samples == 1) {
            anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, level, array_layer, resolve_op,
                                           final_fast_clear);
         } else {
            /* We only support fast-clear on the first layer so partial
             * resolves should not be used on other layers as they will use
             * the clear color stored in memory that is only valid for layer0.
             */
            if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
                array_layer != 0)
               continue;

            anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, array_layer, resolve_op,
                                           final_fast_clear);
         }
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after transition RT");
}

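/* Allocate surface states for this render pass instance from the command
 * buffer's surface state stream: one NULL surface state plus one per color
 * attachment.
 */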
static MUST_CHECK VkResult
anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
                                uint32_t color_att_count)
{
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;

   /* Reserve one for the NULL state. */
   unsigned num_states = 1 + color_att_count;
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
   const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
   gfx->att_states =
      anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
                             num_states * ss_stride, isl_dev->ss.align);
   if (gfx->att_states.map == NULL) {
      return anv_batch_set_error(&cmd_buffer->batch,
                                 VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   struct anv_state next_state = gfx->att_states;
   next_state.alloc_size = isl_dev->ss.size;

   gfx->null_surface_state = next_state;
   next_state.offset += ss_stride;
   next_state.map += ss_stride;

   gfx->color_att_count = color_att_count;
   for (uint32_t i = 0; i < color_att_count; i++) {
      gfx->color_att[i] = (struct anv_attachment) {
         .surface_state.state = next_state,
      };
      next_state.offset += ss_stride;
      next_state.map += ss_stride;
   }
   gfx->depth_att = (struct anv_attachment) { };
   gfx->stencil_att = (struct anv_attachment) { };

   return VK_SUCCESS;
}

static void
anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;

   gfx->render_area = (VkRect2D) { };
   gfx->layer_count = 0;
   gfx->samples = 0;

   gfx->color_att_count = 0;
   gfx->depth_att = (struct anv_attachment) { };
   gfx->stencil_att = (struct anv_attachment) { };
   gfx->null_surface_state = ANV_STATE_NULL;
}

1295 VkResult
genX(BeginCommandBuffer)1296 genX(BeginCommandBuffer)(
1297 VkCommandBuffer commandBuffer,
1298 const VkCommandBufferBeginInfo* pBeginInfo)
1299 {
1300 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1301 VkResult result;
1302
1303 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1304 * command buffer's state. Otherwise, we must *reset* its state. In both
1305 * cases we reset it.
1306 *
1307 * From the Vulkan 1.0 spec:
1308 *
1309 * If a command buffer is in the executable state and the command buffer
1310 * was allocated from a command pool with the
1311 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1312 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
1313 * as if vkResetCommandBuffer had been called with
1314 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1315 * the command buffer in the recording state.
1316 */
1317 anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
1318 anv_cmd_buffer_reset_rendering(cmd_buffer);
1319
1320 cmd_buffer->usage_flags = pBeginInfo->flags;
1321
1322 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1323 * primary level command buffers.
1324 *
1325 * From the Vulkan 1.0 spec:
1326 *
1327 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1328 * secondary command buffer is considered to be entirely inside a render
1329 * pass. If this is a primary command buffer, then this bit is ignored.
1330 */
1331 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1332 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1333
1334 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
1335
1336 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1337
1338 /* We sometimes store vertex data in the dynamic state buffer for blorp
1339 * operations and our dynamic state stream may re-use data from previous
1340 * command buffers. In order to prevent stale cache data, we flush the VF
1341 * cache. We could do this on every blorp call but that's not really
1342 * needed as all of the data will get written by the CPU prior to the GPU
1343 * executing anything. The chances are fairly high that they will use
1344 * blorp at least once per primary command buffer so it shouldn't be
1345 * wasted.
1346 *
1347 * There is also a workaround on gfx8 which requires us to invalidate the
1348 * VF cache occasionally. It's easier if we can assume we start with a
1349 * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1350 */
1351 anv_add_pending_pipe_bits(cmd_buffer,
1352 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1353 "new cmd buffer");
1354
1355 /* We send an "Indirect State Pointers Disable" packet at
1356 * EndCommandBuffer, so all push constant packets are ignored during a
1357 * context restore. Documentation says after that command, we need to
1358 * emit push constants again before any rendering operation. So we
1359 * flag them dirty here to make sure they get emitted.
1360 */
1361 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1362
1363 if (cmd_buffer->usage_flags &
1364 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1365 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1366
1367 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
1368 const VkRenderingInfo *resume_info =
1369 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
1370 pBeginInfo,
1371 gcbiar_data);
1372 if (resume_info != NULL) {
1373 genX(CmdBeginRendering)(commandBuffer, resume_info);
1374 } else {
1375 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
1376 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1377 pBeginInfo);
1378 assert(inheritance_info);
1379
1380 gfx->rendering_flags = inheritance_info->flags;
1381 gfx->render_area = (VkRect2D) { };
1382 gfx->layer_count = 0;
1383 gfx->samples = inheritance_info->rasterizationSamples;
1384 gfx->view_mask = inheritance_info->viewMask;
1385
1386 uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1387 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
1388 if (result != VK_SUCCESS)
1389 return result;
1390
1391 for (uint32_t i = 0; i < color_att_count; i++) {
1392 gfx->color_att[i].vk_format =
1393 inheritance_info->pColorAttachmentFormats[i];
1394 }
1395 gfx->depth_att.vk_format =
1396 inheritance_info->depthAttachmentFormat;
1397 gfx->stencil_att.vk_format =
1398 inheritance_info->stencilAttachmentFormat;
1399
1400 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1401
1402 anv_cmd_graphic_state_update_has_uint_rt(gfx);
1403 }
1404 }
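/* For reference, a hedged sketch of the application-side structure the code
 * above consumes when a secondary command buffer is begun inside a dynamic
 * rendering instance (all concrete values below are made-up examples):
 *
 *    VkCommandBufferInheritanceRenderingInfo rendering_info = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDERING_INFO,
 *       .viewMask = 0,
 *       .colorAttachmentCount = 1,
 *       .pColorAttachmentFormats = (VkFormat[]) { VK_FORMAT_B8G8R8A8_UNORM },
 *       .depthAttachmentFormat = VK_FORMAT_D32_SFLOAT,
 *       .stencilAttachmentFormat = VK_FORMAT_UNDEFINED,
 *       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
 *    };
 *
 * Those formats, the view mask and the sample count are exactly what lands in
 * gfx->color_att[], gfx->depth_att, gfx->stencil_att, gfx->view_mask and
 * gfx->samples above.
 */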
1405
1406 #if GFX_VER >= 8
1407 /* Emit the sample pattern at the beginning of the batch because the
1408 * default locations emitted at the device initialization might have been
1409 * changed by a previous command buffer.
1410 *
1411 * Do not change that when we're continuing a previous renderpass.
1412 */
1413 if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
1414 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1415 genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
1416 #endif
1417
1418 #if GFX_VERx10 >= 75
1419 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1420 const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1421 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1422
1423 /* If secondary buffer supports conditional rendering
1424 * we should emit commands as if conditional rendering is enabled.
1425 */
1426 cmd_buffer->state.conditional_render_enabled =
1427 conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1428 }
1429 #endif
1430
1431 return VK_SUCCESS;
1432 }
1433
1434 /* From the PRM, Volume 2a:
1435 *
1436 * "Indirect State Pointers Disable
1437 *
1438 * At the completion of the post-sync operation associated with this pipe
1439 * control packet, the indirect state pointers in the hardware are
1440 * considered invalid; the indirect pointers are not saved in the context.
1441 * If any new indirect state commands are executed in the command stream
1442 * while the pipe control is pending, the new indirect state commands are
1443 * preserved.
1444 *
1445 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1446 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1447 * commands are only considered as Indirect State Pointers. Once ISP is
1448 * issued in a context, SW must initialize by programming push constant
1449 * commands for all the shaders (at least to zero length) before attempting
1450 * any rendering operation for the same context."
1451 *
1452 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1453 * even though they point to a BO that has been already unreferenced at
1454 * the end of the previous batch buffer. This has been fine so far since
1455 * we are protected by the scratch page (every address not covered by
1456 * a BO should be pointing to the scratch page). But on CNL, it is
1457 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1458 * instruction.
1459 *
1460 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1461 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1462 * context restore, so the mentioned hang doesn't happen. However,
1463 * software must program push constant commands for all stages prior to
1464 * rendering anything. So we flag them dirty in BeginCommandBuffer.
1465 *
1466 * Finally, we also stall at the pixel scoreboard to make sure the
1467 * constants have been loaded into the EUs before we disable the push
1468 * constants, so that a previous 3DPRIMITIVE doesn't hang.
1469 */
1470 static void
1471 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1472 {
1473 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1474 pc.StallAtPixelScoreboard = true;
1475 pc.CommandStreamerStallEnable = true;
1476 anv_debug_dump_pc(pc);
1477 }
1478 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1479 pc.IndirectStatePointersDisable = true;
1480 pc.CommandStreamerStallEnable = true;
1481 anv_debug_dump_pc(pc);
1482 }
1483 }
1484
1485 VkResult
1486 genX(EndCommandBuffer)(
1487 VkCommandBuffer commandBuffer)
1488 {
1489 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1490
1491 if (anv_batch_has_error(&cmd_buffer->batch))
1492 return cmd_buffer->batch.status;
1493
1494 anv_measure_endcommandbuffer(cmd_buffer);
1495
1496 /* We want every command buffer to start with the PMA fix in a known state,
1497 * so we disable it at the end of the command buffer.
1498 */
1499 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1500
1501 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1502
1503 emit_isp_disable(cmd_buffer);
1504
1505 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
1506
1507 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1508
1509 return VK_SUCCESS;
1510 }
1511
1512 void
1513 genX(CmdExecuteCommands)(
1514 VkCommandBuffer commandBuffer,
1515 uint32_t commandBufferCount,
1516 const VkCommandBuffer* pCmdBuffers)
1517 {
1518 ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1519
1520 assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1521
1522 if (anv_batch_has_error(&primary->batch))
1523 return;
1524
1525 /* The secondary command buffers will assume that the PMA fix is disabled
1526 * when they begin executing. Make sure this is true.
1527 */
1528 genX(cmd_buffer_enable_pma_fix)(primary, false);
1529
1530 /* The secondary command buffer doesn't know which textures etc. have been
1531 * flushed prior to their execution. Apply those flushes now.
1532 */
1533 genX(cmd_buffer_apply_pipe_flushes)(primary);
1534
1535 for (uint32_t i = 0; i < commandBufferCount; i++) {
1536 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1537
1538 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1539 assert(!anv_batch_has_error(&secondary->batch));
1540
1541 #if GFX_VERx10 >= 75
1542 if (secondary->state.conditional_render_enabled) {
1543 if (!primary->state.conditional_render_enabled) {
1544 /* Secondary buffer is constructed as if it will be executed
1545 * with conditional rendering, we should satisfy this dependency
1546 * regardless of conditional rendering being enabled in primary.
1547 */
1548 struct mi_builder b;
1549 mi_builder_init(&b, primary->device->info, &primary->batch);
1550 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1551 mi_imm(UINT64_MAX));
1552 }
1553 }
1554 #endif
1555
1556 if (secondary->usage_flags &
1557 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1558 /* If we're continuing a render pass from the primary, we need to
1559 * copy the surface states for the current subpass into the storage
1560 * we allocated for them in BeginCommandBuffer.
1561 */
1562 struct anv_bo *ss_bo =
1563 primary->device->surface_state_pool.block_pool.bo;
1564 struct anv_state src_state = primary->state.gfx.att_states;
1565 struct anv_state dst_state = secondary->state.gfx.att_states;
1566 assert(src_state.alloc_size == dst_state.alloc_size);
1567
1568 genX(cmd_buffer_so_memcpy)(primary,
1569 (struct anv_address) {
1570 .bo = ss_bo,
1571 .offset = dst_state.offset,
1572 },
1573 (struct anv_address) {
1574 .bo = ss_bo,
1575 .offset = src_state.offset,
1576 },
1577 src_state.alloc_size);
1578 }
1579
1580 anv_cmd_buffer_add_secondary(primary, secondary);
1581
1582 assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1583 secondary->perf_query_pool == primary->perf_query_pool);
1584 if (secondary->perf_query_pool)
1585 primary->perf_query_pool = secondary->perf_query_pool;
1586 }
1587
1588 /* The secondary isn't counted in our VF cache tracking so we need to
1589 * invalidate the whole thing.
1590 */
1591 if (GFX_VER == 8) {
1592 anv_add_pending_pipe_bits(primary,
1593 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1594 "Secondary cmd buffer not tracked in VF cache");
1595 }
1596
1597 /* The secondary may have selected a different pipeline (3D or compute) and
1598 * may have changed the current L3$ configuration. Reset our tracking
1599 * variables to invalid values to ensure that we re-emit these in the case
1600 * where we do any draws or compute dispatches from the primary after the
1601 * secondary has returned.
1602 */
1603 primary->state.current_pipeline = UINT32_MAX;
1604 primary->state.current_l3_config = NULL;
1605 primary->state.current_hash_scale = 0;
1606 primary->state.gfx.push_constant_stages = 0;
1607 vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
1608
1609 /* Each of the secondary command buffers will use its own state base
1610 * address. We need to re-emit state base address for the primary after
1611 * all of the secondaries are done.
1612 *
1613 * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1614 * address calls?
1615 */
1616 genX(cmd_buffer_emit_state_base_address)(primary);
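/* Note that from the application's point of view this function backs a single
 * call such as
 *
 *    vkCmdExecuteCommands(primary_cmd, 2, secondaries);
 *
 * and the resets of current_pipeline, current_l3_config and the dynamic state,
 * plus the STATE_BASE_ADDRESS re-emit above, are the price of letting each
 * secondary carry its own copies of that state.
 */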
1617 }
1618
1619 /**
1620 * Program the hardware to use the specified L3 configuration.
1621 */
1622 void
1623 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1624 const struct intel_l3_config *cfg)
1625 {
1626 assert(cfg);
1627 if (cfg == cmd_buffer->state.current_l3_config)
1628 return;
1629
1630 if (INTEL_DEBUG(DEBUG_L3)) {
1631 mesa_logd("L3 config transition: ");
1632 intel_dump_l3_config(cfg, stderr);
1633 }
1634
1635 /* According to the hardware docs, the L3 partitioning can only be changed
1636 * while the pipeline is completely drained and the caches are flushed,
1637 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1638 */
1639 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1640 pc.DCFlushEnable = true;
1641 pc.PostSyncOperation = NoWrite;
1642 pc.CommandStreamerStallEnable = true;
1643 anv_debug_dump_pc(pc);
1644 }
1645
1646 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1647 * invalidation of the relevant caches. Note that because RO invalidation
1648 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1649 * command is processed by the CS) we cannot combine it with the previous
1650 * stalling flush as the hardware documentation suggests, because that
1651 * would cause the CS to stall on previous rendering *after* RO
1652 * invalidation and wouldn't prevent the RO caches from being polluted by
1653 * concurrent rendering before the stall completes. This intentionally
1654 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1655 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1656 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1657 * already guarantee that there is no concurrent GPGPU kernel execution
1658 * (see SKL HSD 2132585).
1659 */
1660 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1661 pc.TextureCacheInvalidationEnable = true;
1662 pc.ConstantCacheInvalidationEnable = true;
1663 pc.InstructionCacheInvalidateEnable = true;
1664 pc.StateCacheInvalidationEnable = true;
1665 pc.PostSyncOperation = NoWrite;
1666 anv_debug_dump_pc(pc);
1667 }
1668
1669 /* Now send a third stalling flush to make sure that invalidation is
1670 * complete when the L3 configuration registers are modified.
1671 */
1672 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1673 pc.DCFlushEnable = true;
1674 pc.PostSyncOperation = NoWrite;
1675 pc.CommandStreamerStallEnable = true;
1676 anv_debug_dump_pc(pc);
1677 }
1678
1679 genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1680 cmd_buffer->state.current_l3_config = cfg;
1681 }
1682
1683 ALWAYS_INLINE enum anv_pipe_bits
1684 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1685 struct anv_device *device,
1686 uint32_t current_pipeline,
1687 enum anv_pipe_bits bits)
1688 {
1689 /*
1690 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1691 *
1692 * Write synchronization is a special case of end-of-pipe
1693 * synchronization that requires that the render cache and/or depth
1694 * related caches are flushed to memory, where the data will become
1695 * globally visible. This type of synchronization is required prior to
1696 * SW (CPU) actually reading the result data from memory, or initiating
1697 * an operation that will use as a read surface (such as a texture
1698 * surface) a previous render target and/or depth/stencil buffer
1699 *
1700 *
1701 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1702 *
1703 * Exercising the write cache flush bits (Render Target Cache Flush
1704 * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1705 * ensures the write caches are flushed and doesn't guarantee the data
1706 * is globally visible.
1707 *
1708 * SW can track the completion of the end-of-pipe-synchronization by
1709 * using "Notify Enable" and "PostSync Operation - Write Immediate
1710 * Data" in the PIPE_CONTROL command.
1711 *
1712 * In other words, flushes are pipelined while invalidations are handled
1713 * immediately. Therefore, if we're flushing anything then we need to
1714 * schedule an end-of-pipe sync before any invalidations can happen.
1715 */
1716 if (bits & ANV_PIPE_FLUSH_BITS)
1717 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1718
1719 /* If we're going to do an invalidate and we have a pending end-of-pipe
1720 * sync that has yet to be resolved, we do the end-of-pipe sync now.
1721 */
1722 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1723 (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1724 bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1725 bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1726 }
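/* Worked example of the bit handling above (values are illustrative): say a
 * barrier left us with
 *
 *    bits = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
 *           ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 *
 * The flush bit forces ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT, and because an
 * invalidation is also pending, that is promoted to an actual
 * ANV_PIPE_END_OF_PIPE_SYNC_BIT. The first PIPE_CONTROL emitted below then
 * does the RT flush plus the end-of-pipe sync, and only the second
 * PIPE_CONTROL does the texture cache invalidation, guaranteeing the flushed
 * data is globally visible before the read-only cache is refilled.
 */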
1727
1728 /* Project: SKL / Argument: LRI Post Sync Operation [23]
1729 *
1730 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1731 * programmed prior to programming a PIPECONTROL command with "LRI
1732 * Post Sync Operation" in GPGPU mode of operation (i.e when
1733 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
1734 *
1735 * The same text exists a few rows below for Post Sync Op.
1736 */
1737 if (bits & ANV_PIPE_POST_SYNC_BIT)
1738 bits &= ~ANV_PIPE_POST_SYNC_BIT;
1739
1740 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1741 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1742 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1743 /* Flushing HDC pipeline requires DC Flush on earlier HW. */
1744 pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
1745 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
1746 pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
1747 pipe.RenderTargetCacheFlushEnable =
1748 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
1749
1750 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
1751 #if GFX_VER == 8
1752 /* From Broadwell PRM, volume 2a:
1753 * PIPE_CONTROL: Command Streamer Stall Enable:
1754 *
1755 * "This bit must be always set when PIPE_CONTROL command is
1756 * programmed by GPGPU and MEDIA workloads, except for the cases
1757 * when only Read Only Cache Invalidation bits are set (State
1758 * Cache Invalidation Enable, Instruction cache Invalidation
1759 * Enable, Texture Cache Invalidation Enable, Constant Cache
1760 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
1761 * need not implemented when FF_DOP_CG is disabled."
1762 *
1763 * Since we do all the invalidation in the following PIPE_CONTROL,
1764 * if we got here, we need a stall.
1765 */
1766 pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU;
1767 #endif
1768
1769 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
1770
1771 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1772 *
1773 * "The most common action to perform upon reaching a
1774 * synchronization point is to write a value out to memory. An
1775 * immediate value (included with the synchronization command) may
1776 * be written."
1777 *
1778 *
1779 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1780 *
1781 * "In case the data flushed out by the render engine is to be
1782 * read back in to the render engine in coherent manner, then the
1783 * render engine has to wait for the fence completion before
1784 * accessing the flushed data. This can be achieved by following
1785 * means on various products: PIPE_CONTROL command with CS Stall
1786 * and the required write caches flushed with Post-Sync-Operation
1787 * as Write Immediate Data.
1788 *
1789 * Example:
1790 * - Workload-1 (3D/GPGPU/MEDIA)
1791 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1792 * Immediate Data, Required Write Cache Flush bits set)
1793 * - Workload-2 (Can use the data produce or output by
1794 * Workload-1)
1795 */
1796 if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1797 pipe.CommandStreamerStallEnable = true;
1798 pipe.PostSyncOperation = WriteImmediateData;
1799 pipe.Address = device->workaround_address;
1800 }
1801
1802 /*
1803 * According to the Broadwell documentation, any PIPE_CONTROL with the
1804 * "Command Streamer Stall" bit set must also have another bit set,
1805 * with one of the following options:
1806 *
1807 * - Render Target Cache Flush
1808 * - Depth Cache Flush
1809 * - Stall at Pixel Scoreboard
1810 * - Post-Sync Operation
1811 * - Depth Stall
1812 * - DC Flush Enable
1813 *
1814 * I chose "Stall at Pixel Scoreboard" since that's what we use in
1815 * mesa and it seems to work fine. The choice is fairly arbitrary.
1816 */
1817 if (pipe.CommandStreamerStallEnable &&
1818 !pipe.RenderTargetCacheFlushEnable &&
1819 !pipe.DepthCacheFlushEnable &&
1820 !pipe.StallAtPixelScoreboard &&
1821 !pipe.PostSyncOperation &&
1822 !pipe.DepthStallEnable &&
1823 !pipe.DCFlushEnable)
1824 pipe.StallAtPixelScoreboard = true;
1825 anv_debug_dump_pc(pipe);
1826 }
1827
1828 /* If a render target flush was emitted, then we can toggle off the bit
1829 * saying that render target writes are ongoing.
1830 */
1831 if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
1832 bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
1833
1834 if (GFX_VERx10 == 75) {
1835 /* Haswell needs additional workarounds:
1836 *
1837 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1838 *
1839 * Option 1:
1840 * PIPE_CONTROL command with the CS Stall and the required write
1841 * caches flushed with Post-SyncOperation as Write Immediate Data
1842 * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
1843 * space) commands.
1844 *
1845 * Example:
1846 * - Workload-1
1847 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1848 * Immediate Data, Required Write Cache Flush bits set)
1849 * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
1850 * - Workload-2 (Can use the data produce or output by
1851 * Workload-1)
1852 *
1853 * Unfortunately, both the PRMs and the internal docs are a bit
1854 * out-of-date in this regard. What the windows driver does (and
1855 * this appears to actually work) is to emit a register read from the
1856 * memory address written by the pipe control above.
1857 *
1858 * What register we load into doesn't matter. We choose an indirect
1859 * rendering register because we know it always exists and it's one
1860 * of the first registers the command parser allows us to write. If
1861 * you don't have command parser support in your kernel (pre-4.2),
1862 * this will get turned into MI_NOOP and you won't get the
1863 * workaround. Unfortunately, there's just not much we can do in
1864 * that case. This register is perfectly safe to write since we
1865 * always re-load all of the indirect draw registers right before
1866 * 3DPRIMITIVE when needed anyway.
1867 */
1868 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1869 lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
1870 lrm.MemoryAddress = device->workaround_address;
1871 }
1872 }
1873
1874 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1875 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1876 }
1877
1878 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1879 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1880 pipe.StateCacheInvalidationEnable =
1881 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
1882 pipe.ConstantCacheInvalidationEnable =
1883 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
1884 pipe.VFCacheInvalidationEnable =
1885 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
1886 pipe.TextureCacheInvalidationEnable =
1887 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
1888 pipe.InstructionCacheInvalidateEnable =
1889 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
1890
1891 anv_debug_dump_pc(pipe);
1892 }
1893
1894 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1895 }
1896
1897 return bits;
1898 }
1899
1900 ALWAYS_INLINE void
1901 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1902 {
1903 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1904
1905 if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1906 bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1907 else if (bits == 0)
1908 return;
1909
1910 bool trace_flush =
1911 (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
1912 if (trace_flush)
1913 trace_intel_begin_stall(&cmd_buffer->trace);
1914
1915 if (GFX_VER == 8 &&
1916 (bits & ANV_PIPE_CS_STALL_BIT) &&
1917 (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1918 /* If we are doing a VF cache invalidate AND a CS stall (it must be
1919 * both) then we can reset our vertex cache tracking.
1920 */
1921 memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1922 sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1923 memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1924 sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1925 }
1926
1927 cmd_buffer->state.pending_pipe_bits =
1928 genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1929 cmd_buffer->device,
1930 cmd_buffer->state.current_pipeline,
1931 bits);
1932
1933 if (trace_flush) {
1934 trace_intel_end_stall(&cmd_buffer->trace, bits,
1935 anv_pipe_flush_bit_to_ds_stall_flag,
1936 NULL, NULL, NULL, NULL);
1937 }
1938 }
1939
1940 static void
1941 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
1942 const VkDependencyInfo *dep_info,
1943 const char *reason)
1944 {
1945 /* XXX: Right now, we're really dumb and just flush whatever categories
1946 * the app asks for. One of these days we may make this a bit better
1947 * but right now that's all the hardware allows for in most areas.
1948 */
1949 VkAccessFlags2 src_flags = 0;
1950 VkAccessFlags2 dst_flags = 0;
1951
1952 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
1953 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
1954 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
1955 }
1956
1957 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
1958 src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
1959 dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
1960 }
1961
1962 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
1963 const VkImageMemoryBarrier2 *img_barrier =
1964 &dep_info->pImageMemoryBarriers[i];
1965
1966 src_flags |= img_barrier->srcAccessMask;
1967 dst_flags |= img_barrier->dstAccessMask;
1968
1969 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
1970 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
1971
1972 uint32_t base_layer, layer_count;
1973 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
1974 base_layer = 0;
1975 layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
1976 } else {
1977 base_layer = range->baseArrayLayer;
1978 layer_count = vk_image_subresource_layer_count(&image->vk, range);
1979 }
1980 const uint32_t level_count =
1981 vk_image_subresource_level_count(&image->vk, range);
1982
1983 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1984 transition_depth_buffer(cmd_buffer, image,
1985 base_layer, layer_count,
1986 img_barrier->oldLayout,
1987 img_barrier->newLayout,
1988 false /* will_full_fast_clear */);
1989 }
1990
1991 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1992 transition_stencil_buffer(cmd_buffer, image,
1993 range->baseMipLevel, level_count,
1994 base_layer, layer_count,
1995 img_barrier->oldLayout,
1996 img_barrier->newLayout,
1997 false /* will_full_fast_clear */);
1998
1999 /* If we are in a renderpass, the gfx7 stencil shadow may need to be
2000 * updated even if the layout doesn't change
2001 */
2002 if (cmd_buffer->state.gfx.samples &&
2003 (img_barrier->dstAccessMask & (VK_ACCESS_2_SHADER_READ_BIT |
2004 VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2005 VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))) {
2006 const uint32_t plane =
2007 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
2008 if (anv_surface_is_valid(&image->planes[plane].shadow_surface))
2009 anv_image_copy_to_shadow(cmd_buffer, image,
2010 VK_IMAGE_ASPECT_STENCIL_BIT,
2011 range->baseMipLevel, level_count,
2012 base_layer, layer_count);
2013 }
2014 }
2015
2016 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
2017 VkImageAspectFlags color_aspects =
2018 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
2019 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
2020 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
2021 range->baseMipLevel, level_count,
2022 base_layer, layer_count,
2023 img_barrier->oldLayout,
2024 img_barrier->newLayout,
2025 img_barrier->srcQueueFamilyIndex,
2026 img_barrier->dstQueueFamilyIndex,
2027 false /* will_full_fast_clear */);
2028 }
2029 }
2030 }
2031
2032 enum anv_pipe_bits bits =
2033 anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
2034 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
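/* Illustrative example (the exact mapping lives in
 * anv_pipe_flush_bits_for_access_flags() /
 * anv_pipe_invalidate_bits_for_access_flags() and is only paraphrased here):
 * a typical render-to-texture barrier such as
 *
 *    srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT
 *    dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT
 *
 * would be expected to add a render target cache flush for the source side and
 * a texture cache invalidation for the destination side, which the next
 * apply_pipe_flushes() then turns into the PIPE_CONTROL sequence described
 * above.
 */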
2035
2036 anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2037 }
2038
2039 void genX(CmdPipelineBarrier2)(
2040 VkCommandBuffer commandBuffer,
2041 const VkDependencyInfo* pDependencyInfo)
2042 {
2043 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2044
2045 cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2046 }
2047
2048 static void
2049 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2050 {
2051 VkShaderStageFlags stages =
2052 cmd_buffer->state.gfx.pipeline->active_stages;
2053
2054 /* In order to avoid thrash, we assume that vertex and fragment stages
2055 * always exist. In the rare case where one is missing *and* the other
2056 * uses push constants, this may be suboptimal. However, avoiding stalls
2057 * seems more important.
2058 */
2059 stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2060 if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2061 stages |= VK_SHADER_STAGE_VERTEX_BIT;
2062
2063 if (stages == cmd_buffer->state.gfx.push_constant_stages)
2064 return;
2065
2066 const unsigned push_constant_kb =
2067 cmd_buffer->device->info->max_constant_urb_size_kb;
2068
2069 const unsigned num_stages =
2070 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2071 unsigned size_per_stage = push_constant_kb / num_stages;
2072
2073 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2074 * units of 2KB. Incidentally, these are the same platforms that have
2075 * 32KB worth of push constant space.
2076 */
2077 if (push_constant_kb == 32)
2078 size_per_stage &= ~1u;
2079
2080 uint32_t kb_used = 0;
2081 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2082 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2083 anv_batch_emit(&cmd_buffer->batch,
2084 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
2085 alloc._3DCommandSubOpcode = 18 + i;
2086 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2087 alloc.ConstantBufferSize = push_size;
2088 }
2089 kb_used += push_size;
2090 }
2091
2092 anv_batch_emit(&cmd_buffer->batch,
2093 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2094 alloc.ConstantBufferOffset = kb_used;
2095 alloc.ConstantBufferSize = push_constant_kb - kb_used;
2096 }
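/* Worked example of the allocation above (numbers only, assuming a 32KB
 * platform): with stages = VS | GS | FS we get num_stages = 3 and
 * size_per_stage = 32 / 3 = 10KB (already a multiple of 2KB, so the masking
 * leaves it alone). The loop then programs
 *
 *    VS: offset  0KB, size 10KB
 *    HS: offset  0KB, size  0KB
 *    DS: offset  0KB, size  0KB
 *    GS: offset 10KB, size 10KB
 *
 * and the PS allocation picks up the remainder: offset 20KB, size 12KB.
 */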
2097
2098 cmd_buffer->state.gfx.push_constant_stages = stages;
2099
2100 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2101 *
2102 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2103 * the next 3DPRIMITIVE command after programming the
2104 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2105 *
2106 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2107 * pipeline setup, we need to dirty push constants.
2108 */
2109 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2110 }
2111
2112 static VkResult
2113 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2114 struct anv_cmd_pipeline_state *pipe_state,
2115 struct anv_shader_bin *shader,
2116 struct anv_state *bt_state)
2117 {
2118 uint32_t state_offset;
2119
2120 struct anv_pipeline_bind_map *map = &shader->bind_map;
2121 if (map->surface_count == 0) {
2122 *bt_state = (struct anv_state) { 0, };
2123 return VK_SUCCESS;
2124 }
2125
2126 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2127 map->surface_count,
2128 &state_offset);
2129 uint32_t *bt_map = bt_state->map;
2130
2131 if (bt_state->map == NULL)
2132 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2133
2134 /* We only need to emit relocs if we're not using softpin. If we are using
2135 * softpin then we always keep all user-allocated memory objects resident.
2136 */
2137 const bool need_client_mem_relocs =
2138 anv_use_relocations(cmd_buffer->device->physical);
2139 struct anv_push_constants *push = &pipe_state->push_constants;
2140
2141 for (uint32_t s = 0; s < map->surface_count; s++) {
2142 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2143
2144 struct anv_state surface_state;
2145
2146 switch (binding->set) {
2147 case ANV_DESCRIPTOR_SET_NULL:
2148 bt_map[s] = 0;
2149 break;
2150
2151 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2152 /* Color attachment binding */
2153 assert(shader->stage == MESA_SHADER_FRAGMENT);
2154 if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2155 const struct anv_attachment *att =
2156 &cmd_buffer->state.gfx.color_att[binding->index];
2157 surface_state = att->surface_state.state;
2158 } else {
2159 surface_state = cmd_buffer->state.gfx.null_surface_state;
2160 }
2161 assert(surface_state.map);
2162 bt_map[s] = surface_state.offset + state_offset;
2163 break;
2164
2165 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2166 struct anv_state surface_state =
2167 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2168
2169 struct anv_address constant_data = {
2170 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2171 .offset = shader->kernel.offset +
2172 shader->prog_data->const_data_offset,
2173 };
2174 unsigned constant_data_size = shader->prog_data->const_data_size;
2175
2176 const enum isl_format format =
2177 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2178 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2179 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2180 format, ISL_SWIZZLE_IDENTITY,
2181 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2182 constant_data, constant_data_size, 1);
2183
2184 assert(surface_state.map);
2185 bt_map[s] = surface_state.offset + state_offset;
2186 add_surface_reloc(cmd_buffer, surface_state, constant_data);
2187 break;
2188 }
2189
2190 case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2191 /* This is always the first binding for compute shaders */
2192 assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2193
2194 struct anv_state surface_state =
2195 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2196
2197 const enum isl_format format =
2198 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2199 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2200 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2201 format, ISL_SWIZZLE_IDENTITY,
2202 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2203 cmd_buffer->state.compute.num_workgroups,
2204 12, 1);
2205
2206 assert(surface_state.map);
2207 bt_map[s] = surface_state.offset + state_offset;
2208 if (need_client_mem_relocs) {
2209 add_surface_reloc(cmd_buffer, surface_state,
2210 cmd_buffer->state.compute.num_workgroups);
2211 }
2212 break;
2213 }
2214
2215 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2216 /* This is a descriptor set buffer so the set index is actually
2217 * given by binding->binding. (Yes, that's confusing.)
2218 */
2219 struct anv_descriptor_set *set =
2220 pipe_state->descriptors[binding->index];
2221 assert(set->desc_mem.alloc_size);
2222 assert(set->desc_surface_state.alloc_size);
2223 bt_map[s] = set->desc_surface_state.offset + state_offset;
2224 add_surface_reloc(cmd_buffer, set->desc_surface_state,
2225 anv_descriptor_set_address(set));
2226 break;
2227 }
2228
2229 default: {
2230 assert(binding->set < MAX_SETS);
2231 const struct anv_descriptor_set *set =
2232 pipe_state->descriptors[binding->set];
2233 if (binding->index >= set->descriptor_count) {
2234 /* From the Vulkan spec section entitled "DescriptorSet and
2235 * Binding Assignment":
2236 *
2237 * "If the array is runtime-sized, then array elements greater
2238 * than or equal to the size of that binding in the bound
2239 * descriptor set must not be used."
2240 *
2241 * Unfortunately, the compiler isn't smart enough to figure out
2242 * when a dynamic binding isn't used so it may grab the whole
2243 * array and stick it in the binding table. In this case, it's
2244 * safe to just skip those bindings that are OOB.
2245 */
2246 assert(binding->index < set->layout->descriptor_count);
2247 continue;
2248 }
2249 const struct anv_descriptor *desc = &set->descriptors[binding->index];
2250
2251 switch (desc->type) {
2252 case VK_DESCRIPTOR_TYPE_SAMPLER:
2253 /* Nothing for us to do here */
2254 continue;
2255
2256 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2257 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2258 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2259 if (desc->image_view) {
2260 struct anv_surface_state sstate =
2261 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2262 desc->image_view->planes[binding->plane].general_sampler_surface_state :
2263 desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2264 surface_state = sstate.state;
2265 assert(surface_state.alloc_size);
2266 if (need_client_mem_relocs)
2267 add_surface_state_relocs(cmd_buffer, sstate);
2268 } else {
2269 surface_state = cmd_buffer->device->null_surface_state;
2270 }
2271 break;
2272 }
2273
2274 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2275 if (desc->image_view) {
2276 struct anv_surface_state sstate =
2277 binding->lowered_storage_surface
2278 ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2279 : desc->image_view->planes[binding->plane].storage_surface_state;
2280 surface_state = sstate.state;
2281 assert(surface_state.alloc_size);
2282 if (surface_state.offset == 0) {
2283 mesa_loge("Bound a image to a descriptor where the "
2284 "descriptor does not have NonReadable "
2285 "set and the image does not have a "
2286 "corresponding SPIR-V format enum.");
2287 vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2288 VK_DEBUG_REPORT_ERROR_BIT_EXT,
2289 &desc->image_view->vk.base,
2290 __LINE__, 0, "anv",
2291 "Bound a image to a descriptor where the "
2292 "descriptor does not have NonReadable "
2293 "set and the image does not have a "
2294 "corresponding SPIR-V format enum.");
2295 }
2296 if (surface_state.offset && need_client_mem_relocs)
2297 add_surface_state_relocs(cmd_buffer, sstate);
2298 } else {
2299 surface_state = cmd_buffer->device->null_surface_state;
2300 }
2301 break;
2302 }
2303
2304 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2305 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2306 if (desc->set_buffer_view) {
2307 surface_state = desc->set_buffer_view->surface_state;
2308 assert(surface_state.alloc_size);
2309 if (need_client_mem_relocs) {
2310 add_surface_reloc(cmd_buffer, surface_state,
2311 desc->set_buffer_view->address);
2312 }
2313 } else {
2314 surface_state = cmd_buffer->device->null_surface_state;
2315 }
2316 break;
2317
2318 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2319 if (desc->buffer_view) {
2320 surface_state = desc->buffer_view->surface_state;
2321 assert(surface_state.alloc_size);
2322 if (need_client_mem_relocs) {
2323 add_surface_reloc(cmd_buffer, surface_state,
2324 desc->buffer_view->address);
2325 }
2326 } else {
2327 surface_state = cmd_buffer->device->null_surface_state;
2328 }
2329 break;
2330
2331 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2332 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2333 if (desc->buffer) {
2334 /* Compute the offset within the buffer */
2335 uint32_t dynamic_offset =
2336 push->dynamic_offsets[binding->dynamic_offset_index];
2337 uint64_t offset = desc->offset + dynamic_offset;
2338 /* Clamp to the buffer size */
2339 offset = MIN2(offset, desc->buffer->vk.size);
2340 /* Clamp the range to the buffer size */
2341 uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
2342
2343 /* Align the range for consistency */
2344 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2345 range = align(range, ANV_UBO_ALIGNMENT);
2346
2347 struct anv_address address =
2348 anv_address_add(desc->buffer->address, offset);
2349
2350 surface_state =
2351 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2352 enum isl_format format =
2353 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2354 desc->type);
2355
2356 isl_surf_usage_flags_t usage =
2357 desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2358 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2359 ISL_SURF_USAGE_STORAGE_BIT;
2360
2361 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2362 format, ISL_SWIZZLE_IDENTITY,
2363 usage, address, range, 1);
2364 if (need_client_mem_relocs)
2365 add_surface_reloc(cmd_buffer, surface_state, address);
2366 } else {
2367 surface_state = cmd_buffer->device->null_surface_state;
2368 }
2369 break;
2370 }
2371
2372 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2373 if (desc->buffer_view) {
2374 surface_state = binding->lowered_storage_surface
2375 ? desc->buffer_view->lowered_storage_surface_state
2376 : desc->buffer_view->storage_surface_state;
2377 assert(surface_state.alloc_size);
2378 if (need_client_mem_relocs) {
2379 add_surface_reloc(cmd_buffer, surface_state,
2380 desc->buffer_view->address);
2381 }
2382 } else {
2383 surface_state = cmd_buffer->device->null_surface_state;
2384 }
2385 break;
2386
2387 default:
2388 assert(!"Invalid descriptor type");
2389 continue;
2390 }
2391 assert(surface_state.map);
2392 bt_map[s] = surface_state.offset + state_offset;
2393 break;
2394 }
2395 }
2396 }
2397
2398 return VK_SUCCESS;
2399 }
2400
2401 static VkResult
2402 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2403 struct anv_cmd_pipeline_state *pipe_state,
2404 struct anv_shader_bin *shader,
2405 struct anv_state *state)
2406 {
2407 struct anv_pipeline_bind_map *map = &shader->bind_map;
2408 if (map->sampler_count == 0) {
2409 *state = (struct anv_state) { 0, };
2410 return VK_SUCCESS;
2411 }
2412
2413 uint32_t size = map->sampler_count * 16;
2414 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
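/* For scale (illustrative numbers): each SAMPLER_STATE entry is 16 bytes, so
 * a bind map with sampler_count = 3 allocates 48 bytes of dynamic state here
 * (32-byte aligned), and the loop below copies each sampler's pre-baked state
 * into state->map + s * 16, skipping slots whose descriptor has no sampler.
 */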
2415
2416 if (state->map == NULL)
2417 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2418
2419 for (uint32_t s = 0; s < map->sampler_count; s++) {
2420 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2421 const struct anv_descriptor *desc =
2422 &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2423
2424 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2425 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2426 continue;
2427
2428 struct anv_sampler *sampler = desc->sampler;
2429
2430 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2431 * happens to be zero.
2432 */
2433 if (sampler == NULL)
2434 continue;
2435
2436 memcpy(state->map + (s * 16),
2437 sampler->state[binding->plane], sizeof(sampler->state[0]));
2438 }
2439
2440 return VK_SUCCESS;
2441 }
2442
2443 static uint32_t
2444 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2445 struct anv_cmd_pipeline_state *pipe_state,
2446 const VkShaderStageFlags dirty,
2447 struct anv_shader_bin **shaders,
2448 uint32_t num_shaders)
2449 {
2450 VkShaderStageFlags flushed = 0;
2451
2452 VkResult result = VK_SUCCESS;
2453 for (uint32_t i = 0; i < num_shaders; i++) {
2454 if (!shaders[i])
2455 continue;
2456
2457 gl_shader_stage stage = shaders[i]->stage;
2458 VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2459 if ((vk_stage & dirty) == 0)
2460 continue;
2461
2462 assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2463 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2464 &cmd_buffer->state.samplers[stage]);
2465 if (result != VK_SUCCESS)
2466 break;
2467
2468 assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2469 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2470 &cmd_buffer->state.binding_tables[stage]);
2471 if (result != VK_SUCCESS)
2472 break;
2473
2474 flushed |= vk_stage;
2475 }
2476
2477 if (result != VK_SUCCESS) {
2478 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2479
2480 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2481 if (result != VK_SUCCESS)
2482 return 0;
2483
2484 /* Re-emit state base addresses so we get the new surface state base
2485 * address before we start emitting binding tables etc.
2486 */
2487 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2488
2489 /* Re-emit all active binding tables */
2490 flushed = 0;
2491
2492 for (uint32_t i = 0; i < num_shaders; i++) {
2493 if (!shaders[i])
2494 continue;
2495
2496 gl_shader_stage stage = shaders[i]->stage;
2497
2498 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2499 &cmd_buffer->state.samplers[stage]);
2500 if (result != VK_SUCCESS) {
2501 anv_batch_set_error(&cmd_buffer->batch, result);
2502 return 0;
2503 }
2504 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2505 &cmd_buffer->state.binding_tables[stage]);
2506 if (result != VK_SUCCESS) {
2507 anv_batch_set_error(&cmd_buffer->batch, result);
2508 return 0;
2509 }
2510
2511 flushed |= mesa_to_vk_shader_stage(stage);
2512 }
2513 }
2514
2515 return flushed;
2516 }
2517
2518 static void
2519 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2520 uint32_t stages)
2521 {
2522 static const uint32_t sampler_state_opcodes[] = {
2523 [MESA_SHADER_VERTEX] = 43,
2524 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
2525 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
2526 [MESA_SHADER_GEOMETRY] = 46,
2527 [MESA_SHADER_FRAGMENT] = 47,
2528 };
2529
2530 static const uint32_t binding_table_opcodes[] = {
2531 [MESA_SHADER_VERTEX] = 38,
2532 [MESA_SHADER_TESS_CTRL] = 39,
2533 [MESA_SHADER_TESS_EVAL] = 40,
2534 [MESA_SHADER_GEOMETRY] = 41,
2535 [MESA_SHADER_FRAGMENT] = 42,
2536 };
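/* Example of how these tables are consumed: for the fragment stage the loop
 * below patches _3DCommandSubOpcode to 47 and 42 respectively, which (if the
 * usual gfx7+ sub-opcode assignments hold) turns the generic *_VS packet
 * templates into 3DSTATE_SAMPLER_STATE_POINTERS_PS and
 * 3DSTATE_BINDING_TABLE_POINTERS_PS.
 */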
2537
2538 anv_foreach_stage(s, stages) {
2539 assert(s < ARRAY_SIZE(binding_table_opcodes));
2540
2541 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2542 anv_batch_emit(&cmd_buffer->batch,
2543 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2544 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2545 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2546 }
2547 }
2548
2549 /* Always emit binding table pointers if we're asked to, since on SKL
2550 * this is what flushes push constants. */
2551 anv_batch_emit(&cmd_buffer->batch,
2552 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
2553 btp._3DCommandSubOpcode = binding_table_opcodes[s];
2554 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
2555 }
2556 }
2557 }
2558
2559 static struct anv_address
2560 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
2561 const struct anv_shader_bin *shader,
2562 const struct anv_push_range *range)
2563 {
2564 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2565 switch (range->set) {
2566 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2567 /* This is a descriptor set buffer so the set index is
2568 * actually given by binding->binding. (Yes, that's
2569 * confusing.)
2570 */
2571 struct anv_descriptor_set *set =
2572 gfx_state->base.descriptors[range->index];
2573 return anv_descriptor_set_address(set);
2574 }
2575
2576 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
2577 if (gfx_state->base.push_constants_state.alloc_size == 0) {
2578 gfx_state->base.push_constants_state =
2579 anv_cmd_buffer_gfx_push_constants(cmd_buffer);
2580 }
2581 return (struct anv_address) {
2582 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
2583 .offset = gfx_state->base.push_constants_state.offset,
2584 };
2585 }
2586
2587 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2588 return (struct anv_address) {
2589 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2590 .offset = shader->kernel.offset +
2591 shader->prog_data->const_data_offset,
2592 };
2593
2594 default: {
2595 assert(range->set < MAX_SETS);
2596 struct anv_descriptor_set *set =
2597 gfx_state->base.descriptors[range->set];
2598 const struct anv_descriptor *desc =
2599 &set->descriptors[range->index];
2600
2601 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2602 if (desc->buffer_view)
2603 return desc->buffer_view->address;
2604 } else {
2605 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2606 if (desc->buffer) {
2607 const struct anv_push_constants *push =
2608 &gfx_state->base.push_constants;
2609 uint32_t dynamic_offset =
2610 push->dynamic_offsets[range->dynamic_offset_index];
2611 return anv_address_add(desc->buffer->address,
2612 desc->offset + dynamic_offset);
2613 }
2614 }
2615
2616 /* For NULL UBOs, we just return an address in the workaround BO. We do
2617 * writes to it for workarounds but always at the bottom. The higher
2618 * bytes should be all zeros.
2619 */
2620 assert(range->length * 32 <= 2048);
2621 return (struct anv_address) {
2622 .bo = cmd_buffer->device->workaround_bo,
2623 .offset = 1024,
2624 };
2625 }
2626 }
2627 }
2628
2629
2630 /** Returns the size in bytes of the bound buffer
2631 *
2632 * The range is relative to the start of the buffer, not the start of the
2633 * range. The returned size may be smaller than
2634 *
2635 * (range->start + range->length) * 32;
2636 */
2637 static uint32_t
2638 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
2639 const struct anv_shader_bin *shader,
2640 const struct anv_push_range *range)
2641 {
2642 assert(shader->stage != MESA_SHADER_COMPUTE);
2643 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2644 switch (range->set) {
2645 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2646 struct anv_descriptor_set *set =
2647 gfx_state->base.descriptors[range->index];
2648 assert(range->start * 32 < set->desc_mem.alloc_size);
2649 assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
2650 return set->desc_mem.alloc_size;
2651 }
2652
2653 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
2654 return (range->start + range->length) * 32;
2655
2656 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2657 return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
2658
2659 default: {
2660 assert(range->set < MAX_SETS);
2661 struct anv_descriptor_set *set =
2662 gfx_state->base.descriptors[range->set];
2663 const struct anv_descriptor *desc =
2664 &set->descriptors[range->index];
2665
2666 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2667 /* Here we promote a UBO to a binding table entry so that we can avoid a layer of indirection.
2668 * We use the descriptor set's internally allocated surface state to fill the binding table entry.
2669 */
2670 if (!desc->set_buffer_view)
2671 return 0;
2672
2673 if (range->start * 32 > desc->set_buffer_view->range)
2674 return 0;
2675
2676 return desc->set_buffer_view->range;
2677 } else {
2678 if (!desc->buffer)
2679 return 0;
2680
2681 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2682 /* Compute the offset within the buffer */
2683 const struct anv_push_constants *push =
2684 &gfx_state->base.push_constants;
2685 uint32_t dynamic_offset =
2686 push->dynamic_offsets[range->dynamic_offset_index];
2687 uint64_t offset = desc->offset + dynamic_offset;
2688 /* Clamp to the buffer size */
2689 offset = MIN2(offset, desc->buffer->vk.size);
2690 /* Clamp the range to the buffer size */
2691 uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
2692
2693 /* Align the range for consistency */
2694 bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
2695
2696 return bound_range;
2697 }
2698 }
2699 }
2700 }
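/* Worked example for the dynamic-UBO branch of get_push_range_bound_size()
 * above (illustrative numbers, assuming ANV_UBO_ALIGNMENT is 64): with
 * desc->offset = 256, a dynamic offset of 768, desc->range = 2048 and a
 * 2500-byte buffer, we get
 *
 *    offset      = 256 + 768               = 1024
 *    bound_range = MIN2(2048, 2500 - 1024) = 1476
 *    bound_range = align(1476, 64)         = 1536
 *
 * so the function reports a 1536-byte bound size for this range.
 */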
2701
2702 static void
2703 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
2704 gl_shader_stage stage,
2705 struct anv_address *buffers,
2706 unsigned buffer_count)
2707 {
2708 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2709 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2710
2711 static const uint32_t push_constant_opcodes[] = {
2712 [MESA_SHADER_VERTEX] = 21,
2713 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
2714 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
2715 [MESA_SHADER_GEOMETRY] = 22,
2716 [MESA_SHADER_FRAGMENT] = 23,
2717 };
2718
2719 assert(stage < ARRAY_SIZE(push_constant_opcodes));
2720
2721 UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
2722
2723 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
2724 c._3DCommandSubOpcode = push_constant_opcodes[stage];
2725
2726 /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
2727 *
2728 * "Constant Buffer Object Control State must be always
2729 * programmed to zero."
2730 *
2731 * This restriction does not exist on any newer platforms.
2732 *
2733 * We only have one MOCS field for the whole packet, not one per
2734 * buffer. We could go out of our way here to walk over all of
2735 * the buffers and see if any of them are used externally and use
2736 * the external MOCS. However, the notion that someone would use
2737 * the same bit of memory for both scanout and a UBO is nuts.
2738 *
2739 * Let's not bother and assume it's all internal.
2740 */
2741 #if GFX_VER != 8
2742 c.ConstantBody.MOCS = mocs;
2743 #endif
2744
2745 if (anv_pipeline_has_stage(pipeline, stage)) {
2746 const struct anv_pipeline_bind_map *bind_map =
2747 &pipeline->shaders[stage]->bind_map;
2748
2749 #if GFX_VERx10 >= 75
2750 /* The Skylake PRM contains the following restriction:
2751 *
2752 * "The driver must ensure The following case does not occur
2753 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
2754 * buffer 3 read length equal to zero committed followed by a
2755 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
2756 * zero committed."
2757 *
2758 * To avoid this, we program the buffers in the highest slots.
2759 * This way, slot 0 is only used if slot 3 is also used.
2760 */
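/* Illustrative packing: with buffer_count == 2 the shift below is 2, so the
 * two ranges land in ReadLength[2]/Buffer[2] and ReadLength[3]/Buffer[3],
 * leaving slots 0 and 1 untouched. Slot 0 is therefore only ever non-zero
 * when slot 3 is too, which is exactly what the quoted restriction requires.
 */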
2761 assert(buffer_count <= 4);
2762 const unsigned shift = 4 - buffer_count;
2763 for (unsigned i = 0; i < buffer_count; i++) {
2764 const struct anv_push_range *range = &bind_map->push_ranges[i];
2765
2766 /* At this point we only have non-empty ranges */
2767 assert(range->length > 0);
2768
2769 /* For Ivy Bridge, make sure we only set the first range (actual
2770 * push constants)
2771 */
2772 assert((GFX_VERx10 >= 75) || i == 0);
2773
2774 c.ConstantBody.ReadLength[i + shift] = range->length;
2775 c.ConstantBody.Buffer[i + shift] =
2776 anv_address_add(buffers[i], range->start * 32);
2777 }
2778 #else
2779 /* For Ivy Bridge, push constants are relative to dynamic state
2780 * base address and we only ever push actual push constants.
2781 */
2782 if (bind_map->push_ranges[0].length > 0) {
2783 assert(buffer_count == 1);
2784 assert(bind_map->push_ranges[0].set ==
2785 ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
2786 assert(buffers[0].bo ==
2787 cmd_buffer->device->dynamic_state_pool.block_pool.bo);
2788 c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
2789 c.ConstantBody.Buffer[0].bo = NULL;
2790 c.ConstantBody.Buffer[0].offset = buffers[0].offset;
2791 }
2792 assert(bind_map->push_ranges[1].length == 0);
2793 assert(bind_map->push_ranges[2].length == 0);
2794 assert(bind_map->push_ranges[3].length == 0);
2795 #endif
2796 }
2797 }
2798 }
2799
2800 static void
2801 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
2802 VkShaderStageFlags dirty_stages)
2803 {
2804 VkShaderStageFlags flushed = 0;
2805 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2806 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2807
2808 /* Compute robust pushed register access mask for each stage. */
2809 if (cmd_buffer->device->vk.enabled_features.robustBufferAccess) {
2810 anv_foreach_stage(stage, dirty_stages) {
2811 if (!anv_pipeline_has_stage(pipeline, stage))
2812 continue;
2813
2814 const struct anv_shader_bin *shader = pipeline->shaders[stage];
2815 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2816 struct anv_push_constants *push = &gfx_state->base.push_constants;
2817
2818 push->push_reg_mask[stage] = 0;
2819 /* Start of the current range in the shader, relative to the start of
2820 * push constants in the shader.
2821 */
2822 unsigned range_start_reg = 0;
2823 for (unsigned i = 0; i < 4; i++) {
2824 const struct anv_push_range *range = &bind_map->push_ranges[i];
2825 if (range->length == 0)
2826 continue;
2827
2828 unsigned bound_size =
2829 get_push_range_bound_size(cmd_buffer, shader, range);
2830 if (bound_size >= range->start * 32) {
2831 unsigned bound_regs =
2832 MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
2833 range->length);
2834 assert(range_start_reg + bound_regs <= 64);
2835 push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
2836 bound_regs);
2837 }
2838
2839 cmd_buffer->state.push_constants_dirty |=
2840 mesa_to_vk_shader_stage(stage);
2841
2842 range_start_reg += range->length;
2843 }
2844 }
2845 }
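/* Worked example of the mask computation above (illustrative numbers): for a
 * range with start = 2 and length = 8 whose bound buffer turns out to be 192
 * bytes, DIV_ROUND_UP(192, 32) = 6 registers of valid data minus the
 * 2-register start gives bound_regs = 4 (clamped by length). If this is the
 * first range of the shader, push_reg_mask gets bits [0, 3] set, and
 * range_start_reg then advances by the full 8 registers, so the next range's
 * bits start at register 8 regardless of how much of this one was actually
 * bound.
 */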
2846
2847 /* Resets the push constant state so that we allocate a new one if
2848 * needed.
2849 */
2850 gfx_state->base.push_constants_state = ANV_STATE_NULL;
2851
2852 anv_foreach_stage(stage, dirty_stages) {
2853 unsigned buffer_count = 0;
2854 flushed |= mesa_to_vk_shader_stage(stage);
2855 UNUSED uint32_t max_push_range = 0;
2856
2857 struct anv_address buffers[4] = {};
2858 if (anv_pipeline_has_stage(pipeline, stage)) {
2859 const struct anv_shader_bin *shader = pipeline->shaders[stage];
2860 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2861
2862 /* We have to gather buffer addresses as a second step because the
2863 * loop above puts data into the push constant area and the call to
2864 * get_push_range_address is what locks our push constants and copies
2865 * them into the actual GPU buffer. If we did the two loops at the
2866 * same time, we'd risk only having some of the sizes in the push
2867 * constant buffer when we did the copy.
2868 */
2869 for (unsigned i = 0; i < 4; i++) {
2870 const struct anv_push_range *range = &bind_map->push_ranges[i];
2871 if (range->length == 0)
2872 break;
2873
2874 buffers[i] = get_push_range_address(cmd_buffer, shader, range);
2875 max_push_range = MAX2(max_push_range, range->length);
2876 buffer_count++;
2877 }
2878
2879 /* We have at most 4 buffers but they should be tightly packed */
2880 for (unsigned i = buffer_count; i < 4; i++)
2881 assert(bind_map->push_ranges[i].length == 0);
2882 }
2883
2884 cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
2885 }
2886
2887 cmd_buffer->state.push_constants_dirty &= ~flushed;
2888 }
2889
2890 static void
2891 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
2892 {
2893 const struct vk_dynamic_graphics_state *dyn =
2894 &cmd_buffer->vk.dynamic_graphics_state;
2895
2896 if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) &&
2897 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) &&
2898 #if GFX_VER <= 7
2899 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) &&
2900 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) &&
2901 #endif
2902 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
2903 return;
2904
2905 /* Take dynamic primitive topology into account with
2906 * 3DSTATE_CLIP::ViewportXYClipTestEnable
2907 */
2908 VkPolygonMode dynamic_raster_mode =
2909 genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
2910 dyn->ia.primitive_topology);
2911 bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
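   /* Rationale (an explanatory note, not driver-mandated wording): the XY
    * clip test is only enabled for filled polygons because wide lines and
    * points produced by other polygon modes can legitimately extend past the
    * viewport rectangle and must not be discarded by the XY clip test.
    */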
2912
2913 struct GENX(3DSTATE_CLIP) clip = {
2914 GENX(3DSTATE_CLIP_header),
2915 #if GFX_VER <= 7
2916 .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face],
2917 .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode],
2918 #endif
2919 .ViewportXYClipTestEnable = xy_clip_test_enable,
2920 };
2921 uint32_t dwords[GENX(3DSTATE_CLIP_length)];
2922
2923 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2924 if (anv_pipeline_is_primitive(pipeline)) {
2925 const struct elk_vue_prog_data *last =
2926 anv_pipeline_get_last_vue_prog_data(pipeline);
2927 if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2928 clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
2929 dyn->vp.viewport_count - 1 : 0;
2930 }
2931 }
2932
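   /* anv_batch_emit_merge ORs the freshly packed dwords below with the
    * partially packed 3DSTATE_CLIP dwords baked into the pipeline, so the
    * static pipeline fields and the dynamic fields above land in one packet.
    */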
2933 GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
2934 anv_batch_emit_merge(&cmd_buffer->batch, dwords,
2935 pipeline->gfx7.clip);
2936 }
2937
2938 static void
2939 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
2940 {
2941 struct anv_instance *instance = cmd_buffer->device->physical->instance;
2942 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2943 const struct vk_dynamic_graphics_state *dyn =
2944 &cmd_buffer->vk.dynamic_graphics_state;
2945 uint32_t count = dyn->vp.viewport_count;
2946 const VkViewport *viewports = dyn->vp.viewports;
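   /* Each SF_CLIP_VIEWPORT entry is 16 DWords (64 bytes), hence the
    * count * 64 allocation with 64-byte alignment below.
    */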
2947 struct anv_state sf_clip_state =
2948 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
2949
2950 bool negative_one_to_one =
2951 cmd_buffer->state.gfx.pipeline->negative_one_to_one;
2952
2953 float scale = negative_one_to_one ? 0.5f : 1.0f;
2954
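   /* For reference, the SF_CLIP_VIEWPORT matrix packed below implements the
    * usual viewport transform:
    *
    *    x_screen = m00 * x_ndc + m30
    *    y_screen = m11 * y_ndc + m31
    *    z_screen = m22 * z_ndc + m32
    *
    * e.g. a 1920x1080 viewport at (0, 0) with depth [0, 1] and a
    * zero-to-one clip volume gives m00 = 960, m30 = 960, m11 = 540,
    * m31 = 540, m22 = 1 and m32 = 0.
    */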
2955 for (uint32_t i = 0; i < count; i++) {
2956 const VkViewport *vp = &viewports[i];
2957
2958 /* The gfx7 state struct has just the matrix and guardband fields, the
2959 * gfx8 struct adds the min/max viewport fields. */
2960 struct GENX(SF_CLIP_VIEWPORT) sfv = {
2961 .ViewportMatrixElementm00 = vp->width / 2,
2962 .ViewportMatrixElementm11 = vp->height / 2,
2963 .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
2964 .ViewportMatrixElementm30 = vp->x + vp->width / 2,
2965 .ViewportMatrixElementm31 = vp->y + vp->height / 2,
2966 .ViewportMatrixElementm32 = negative_one_to_one ?
2967 (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
2968 .XMinClipGuardband = -1.0f,
2969 .XMaxClipGuardband = 1.0f,
2970 .YMinClipGuardband = -1.0f,
2971 .YMaxClipGuardband = 1.0f,
2972 #if GFX_VER >= 8
2973 .XMinViewPort = vp->x,
2974 .XMaxViewPort = vp->x + vp->width - 1,
2975 .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
2976 .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
2977 #endif
2978 };
2979
2980 /* Fix depth test misrenderings by lowering translated depth range */
2981 if (instance->lower_depth_range_rate != 1.0f)
2982 sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
2983
2984 const uint32_t fb_size_max = 1 << 14;
2985 uint32_t x_min = 0, x_max = fb_size_max;
2986 uint32_t y_min = 0, y_max = fb_size_max;
2987
2988 /* If we have a valid renderArea, include that */
2989 if (gfx->render_area.extent.width > 0 &&
2990 gfx->render_area.extent.height > 0) {
2991 x_min = MAX2(x_min, gfx->render_area.offset.x);
2992 x_max = MIN2(x_max, gfx->render_area.offset.x +
2993 gfx->render_area.extent.width);
2994 y_min = MAX2(y_min, gfx->render_area.offset.y);
2995 y_max = MIN2(y_max, gfx->render_area.offset.y +
2996 gfx->render_area.extent.height);
2997 }
2998
2999 /* The client is required to have enough scissors for whatever it sets
3000 * as ViewportIndex but it's possible that they've got more viewports
3001 * set from a previous command. Also, from the Vulkan 1.3.207 spec:
3002 *
3003 * "The application must ensure (using scissor if necessary) that
3004 * all rendering is contained within the render area."
3005 *
3006 * If the client doesn't set a scissor, that basically means it
3007 * guarantees everything is in-bounds already. If we end up using a
3008 * guardband of [-1, 1] in that case, there shouldn't be much loss.
3009 * It's theoretically possible that they could do all their clipping
3010 * with clip planes but that'd be a bit odd.
3011 */
3012 if (i < dyn->vp.scissor_count) {
3013 const VkRect2D *scissor = &dyn->vp.scissors[i];
3014 x_min = MAX2(x_min, scissor->offset.x);
3015 x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
3016 y_min = MAX2(y_min, scissor->offset.y);
3017 y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
3018 }
3019
3020 /* Only bother calculating the guardband if our known render area is
3021 * less than the maximum size. Otherwise, it will calculate [-1, 1]
3022 * anyway but possibly with precision loss.
3023 */
3024 if (x_min > 0 || x_max < fb_size_max ||
3025 y_min > 0 || y_max < fb_size_max) {
3026 intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
3027 sfv.ViewportMatrixElementm00,
3028 sfv.ViewportMatrixElementm11,
3029 sfv.ViewportMatrixElementm30,
3030 sfv.ViewportMatrixElementm31,
3031 &sfv.XMinClipGuardband,
3032 &sfv.XMaxClipGuardband,
3033 &sfv.YMinClipGuardband,
3034 &sfv.YMaxClipGuardband);
3035 }
3036
3037 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3038 }
3039
3040 anv_batch_emit(&cmd_buffer->batch,
3041 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3042 clip.SFClipViewportPointer = sf_clip_state.offset;
3043 }
3044 }
3045
3046 static void
3047 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3048 bool depth_clamp_enable)
3049 {
3050 const struct vk_dynamic_graphics_state *dyn =
3051 &cmd_buffer->vk.dynamic_graphics_state;
3052 uint32_t count = dyn->vp.viewport_count;
3053 const VkViewport *viewports = dyn->vp.viewports;
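   /* Each CC_VIEWPORT entry is 2 DWords (8 bytes), hence count * 8 below. */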
3054 struct anv_state cc_state =
3055 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3056
3057 for (uint32_t i = 0; i < count; i++) {
3058 const VkViewport *vp = &viewports[i];
3059
3060 /* From the Vulkan spec:
3061 *
3062 * "It is valid for minDepth to be greater than or equal to
3063 * maxDepth."
3064 */
3065 float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3066 float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3067
3068 struct GENX(CC_VIEWPORT) cc_viewport = {
3069 .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3070 .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3071 };
3072
3073 GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3074 }
3075
3076 anv_batch_emit(&cmd_buffer->batch,
3077 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3078 cc.CCViewportPointer = cc_state.offset;
3079 }
3080 }
3081
3082 static void
3083 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3084 {
3085 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3086 const struct vk_dynamic_graphics_state *dyn =
3087 &cmd_buffer->vk.dynamic_graphics_state;
3088 uint32_t count = dyn->vp.scissor_count;
3089 const VkRect2D *scissors = dyn->vp.scissors;
3090 const VkViewport *viewports = dyn->vp.viewports;
3091
3092 /* Wa_1409725701:
3093 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3094 * stored as an array of up to 16 elements. The location of first
3095 * element of the array, as specified by Pointer to SCISSOR_RECT, should
3096 * be aligned to a 64-byte boundary."
3097 */
3098 uint32_t alignment = 64;
3099 struct anv_state scissor_state =
3100 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3101
3102 for (uint32_t i = 0; i < count; i++) {
3103 const VkRect2D *s = &scissors[i];
3104 const VkViewport *vp = &viewports[i];
3105
3106 /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3107 * ymax < ymin for empty clips. In case clip x, y, width height are all
3108 * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3109 * what we want. Just special case empty clips and produce a canonical
3110 * empty clip. */
3111 static const struct GENX(SCISSOR_RECT) empty_scissor = {
3112 .ScissorRectangleYMin = 1,
3113 .ScissorRectangleXMin = 1,
3114 .ScissorRectangleYMax = 0,
3115 .ScissorRectangleXMax = 0
3116 };
3117
3118 const int max = 0xffff;
3119
3120 uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3121 uint32_t x_min = MAX2(s->offset.x, vp->x);
3122 int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3123 MAX2(vp->y, vp->y + vp->height) - 1);
3124 int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3125 vp->x + vp->width - 1);
3126
3127 y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
3128 x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
3129
3130 /* Do this math using int64_t so overflow gets clamped correctly. */
3131 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3132 y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
3133 x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
3134 y_max = CLAMP((uint64_t) y_max, 0,
3135 gfx->render_area.offset.y +
3136 gfx->render_area.extent.height - 1);
3137 x_max = CLAMP((uint64_t) x_max, 0,
3138 gfx->render_area.offset.x +
3139 gfx->render_area.extent.width - 1);
3140 }
3141
3142 struct GENX(SCISSOR_RECT) scissor = {
3143 .ScissorRectangleYMin = y_min,
3144 .ScissorRectangleXMin = x_min,
3145 .ScissorRectangleYMax = y_max,
3146 .ScissorRectangleXMax = x_max
3147 };
3148
3149 if (s->extent.width <= 0 || s->extent.height <= 0) {
3150 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3151 &empty_scissor);
3152 } else {
3153 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3154 }
3155 }
3156
3157 anv_batch_emit(&cmd_buffer->batch,
3158 GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3159 ssp.ScissorRectPointer = scissor_state.offset;
3160 }
3161 }
3162
3163 static void
3164 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3165 {
3166 const struct vk_dynamic_graphics_state *dyn =
3167 &cmd_buffer->vk.dynamic_graphics_state;
3168 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3169
3170 #if GFX_VER == 7
3171 # define streamout_state_dw pipeline->gfx7.streamout_state
3172 #else
3173 # define streamout_state_dw pipeline->gfx8.streamout_state
3174 #endif
3175
3176 uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3177
3178 struct GENX(3DSTATE_STREAMOUT) so = {
3179 GENX(3DSTATE_STREAMOUT_header),
3180 .RenderingDisable = dyn->rs.rasterizer_discard_enable,
3181 };
3182 GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3183 anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3184 }
3185
3186 ALWAYS_INLINE static void
3187 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
3188 {
3189 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3190 const struct vk_dynamic_graphics_state *dyn =
3191 &cmd_buffer->vk.dynamic_graphics_state;
3192 uint32_t *p;
3193
3194 assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3195
3196 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3197
3198 genX(flush_pipeline_select_3d)(cmd_buffer);
3199
3200 /* Apply any pending pipeline flushes we may have. We want to apply them
3201 * now because, if any of those flushes are for things like push constants,
3202 * the GPU will read the state at weird times.
3203 */
3204 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3205
3206 uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3207 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3208 vb_emit |= pipeline->vb_used;
3209
3210 if (vb_emit) {
3211 const uint32_t num_buffers = __builtin_popcount(vb_emit);
3212 const uint32_t num_dwords = 1 + num_buffers * 4;
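      /* 3DSTATE_VERTEX_BUFFERS is one header DWord followed by a 4-DWord
       * VERTEX_BUFFER_STATE entry per buffer, which is where the
       * 1 + num_buffers * 4 count above comes from.
       */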
3213
3214 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3215 GENX(3DSTATE_VERTEX_BUFFERS));
3216 uint32_t i = 0;
3217 u_foreach_bit(vb, vb_emit) {
3218 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3219 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3220
3221 struct GENX(VERTEX_BUFFER_STATE) state;
3222 if (buffer) {
3223 uint32_t stride = dyn->vi_binding_strides[vb];
3224 UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
3225
3226 #if GFX_VER <= 7
3227 bool per_instance = pipeline->vb[vb].instanced;
3228 uint32_t divisor = pipeline->vb[vb].instance_divisor *
3229 pipeline->instance_multiplier;
3230 #endif
3231
3232 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3233 .VertexBufferIndex = vb,
3234
3235 .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3236 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3237 #if GFX_VER <= 7
3238 .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA,
3239 .InstanceDataStepRate = per_instance ? divisor : 1,
3240 #endif
3241 .AddressModifyEnable = true,
3242 .BufferPitch = stride,
3243 .BufferStartingAddress = anv_address_add(buffer->address, offset),
3244 .NullVertexBuffer = offset >= buffer->vk.size,
3245
3246 #if GFX_VER >= 8
3247 .BufferSize = size,
3248 #else
3249 /* XXX: to handle dynamic offset for older gens we might want
3250 * to modify Endaddress, but there are issues when doing so:
3251 *
3252 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3253 */
3254 .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1),
3255 #endif
3256 };
3257 } else {
3258 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3259 .VertexBufferIndex = vb,
3260 .NullVertexBuffer = true,
3261 .MOCS = anv_mocs(cmd_buffer->device, NULL,
3262 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3263 };
3264 }
3265
3266 #if GFX_VER == 8
3267 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3268 state.BufferStartingAddress,
3269 state.BufferSize);
3270 #endif
3271
3272 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3273 i++;
3274 }
3275 }
3276
3277 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3278
3279 uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3280 pipeline->active_stages;
3281 if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3282 !vk_dynamic_graphics_state_any_dirty(dyn) &&
3283 !cmd_buffer->state.push_constants_dirty)
3284 return;
3285
3286 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3287 (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3288 ANV_CMD_DIRTY_PIPELINE))) {
3289 /* Wa_16011411144:
3290 *
3291 * SW must insert a PIPE_CONTROL cmd before and after the
3292 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3293 * state is not combined with other state changes.
3294 */
3295 if (intel_device_info_is_dg2(cmd_buffer->device->info)) {
3296 anv_add_pending_pipe_bits(cmd_buffer,
3297 ANV_PIPE_CS_STALL_BIT,
3298 "before SO_BUFFER change WA");
3299 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3300 }
3301
3302 /* We don't need any per-buffer dirty tracking because you're not
3303 * allowed to bind different XFB buffers while XFB is enabled.
3304 */
3305 for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3306 struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3307 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3308 sob.SOBufferIndex = idx;
3309
3310 if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3311 sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
3312 ISL_SURF_USAGE_STREAM_OUT_BIT);
3313 sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3314 xfb->offset);
3315 #if GFX_VER >= 8
3316 sob.SOBufferEnable = true;
3317 sob.StreamOffsetWriteEnable = false;
3318 /* Size is in DWords - 1 */
3319 sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3320 #else
3321 /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3322 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3323 * default for an empty SO_BUFFER packet) to disable them.
3324 */
3325 sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3326 sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3327 xfb->offset + xfb->size);
3328 #endif
3329 } else {
3330 sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3331 }
3332 }
3333 }
3334 }
3335
3336 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3337 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3338
3339 /* If the pipeline changed, we may need to re-allocate push constant
3340 * space in the URB.
3341 */
3342 cmd_buffer_alloc_push_constants(cmd_buffer);
3343 }
3344
3345 #if GFX_VER <= 7
3346 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3347 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3348 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3349 *
3350 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3351 * stall needs to be sent just prior to any 3DSTATE_VS,
3352 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3353 * 3DSTATE_BINDING_TABLE_POINTER_VS,
3354 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one
3355 * PIPE_CONTROL needs to be sent before any combination of VS
3356 * associated 3DSTATE."
3357 */
3358 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3359 pc.DepthStallEnable = true;
3360 pc.PostSyncOperation = WriteImmediateData;
3361 pc.Address = cmd_buffer->device->workaround_address;
3362 anv_debug_dump_pc(pc);
3363 }
3364 }
3365 #endif
3366
3367 /* Render targets live in the same binding table as fragment descriptors */
3368 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3369 descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
3370
3371 /* We emit the binding tables and sampler tables first, then emit push
3372 * constants and then finally emit binding table and sampler table
3373 * pointers. It has to happen in this order, since emitting the binding
3374 * tables may change the push constants (in case of storage images). After
3375 * emitting push constants, on SKL+ we have to emit the corresponding
3376 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
3377 */
3378 uint32_t dirty = 0;
3379 if (descriptors_dirty) {
3380 dirty = flush_descriptor_sets(cmd_buffer,
3381 &cmd_buffer->state.gfx.base,
3382 descriptors_dirty,
3383 pipeline->shaders,
3384 ARRAY_SIZE(pipeline->shaders));
3385 cmd_buffer->state.descriptors_dirty &= ~dirty;
3386 }
3387
3388 if (dirty || cmd_buffer->state.push_constants_dirty) {
3389 /* Because we're pushing UBOs, we have to push whenever either
3390 * descriptors or push constants is dirty.
3391 */
3392 dirty |= cmd_buffer->state.push_constants_dirty;
3393 cmd_buffer_flush_push_constants(cmd_buffer,
3394 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3395 }
3396
3397 if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
3398 cmd_buffer_emit_descriptor_pointers(cmd_buffer,
3399 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3400 }
3401
3402 cmd_buffer_emit_clip(cmd_buffer);
3403
3404 if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3405 ANV_CMD_DIRTY_XFB_ENABLE)) ||
3406 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
3407 cmd_buffer_emit_streamout(cmd_buffer);
3408
3409 if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3410 ANV_CMD_DIRTY_RENDER_TARGETS)) ||
3411 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
3412 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
3413 cmd_buffer_emit_viewport(cmd_buffer);
3414 cmd_buffer_emit_depth_viewport(cmd_buffer,
3415 pipeline->depth_clamp_enable);
3416 cmd_buffer_emit_scissor(cmd_buffer);
3417 }
3418
3419 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
3420 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
3421 uint32_t topology;
3422 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
3423 topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points);
3424 else
3425 topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
3426
3427 cmd_buffer->state.gfx.primitive_topology = topology;
3428
3429 #if (GFX_VER >= 8)
3430 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
3431 vft.PrimitiveTopologyType = topology;
3432 }
3433 #endif
3434 }
3435
3436 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
3437 }
3438
3439 static void
3440 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
3441 struct anv_address addr,
3442 uint32_t size, uint32_t index)
3443 {
3444 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
3445 GENX(3DSTATE_VERTEX_BUFFERS));
3446
3447 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
3448 &(struct GENX(VERTEX_BUFFER_STATE)) {
3449 .VertexBufferIndex = index,
3450 .AddressModifyEnable = true,
3451 .BufferPitch = 0,
3452 .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
3453 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3454 .NullVertexBuffer = size == 0,
3455 #if (GFX_VER >= 8)
3456 .BufferStartingAddress = addr,
3457 .BufferSize = size
3458 #else
3459 .BufferStartingAddress = addr,
3460 .EndAddress = anv_address_add(addr, size),
3461 #endif
3462 });
3463
3464 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
3465 index, addr, size);
3466 }
3467
3468 static void
3469 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
3470 struct anv_address addr)
3471 {
3472 emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
3473 }
3474
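/* Explanatory note: the two DWords written here (base vertex, base instance)
 * are bound as a small vertex buffer at ANV_SVGS_VB_INDEX; the
 * compiler-generated vertex elements read them back as the
 * gl_BaseVertex/gl_BaseInstance system values in the vertex shader.
 */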
3475 static void
3476 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
3477 uint32_t base_vertex, uint32_t base_instance)
3478 {
3479 if (base_vertex == 0 && base_instance == 0) {
3480 emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
3481 } else {
3482 struct anv_state id_state =
3483 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
3484
3485 ((uint32_t *)id_state.map)[0] = base_vertex;
3486 ((uint32_t *)id_state.map)[1] = base_instance;
3487
3488 struct anv_address addr = {
3489 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3490 .offset = id_state.offset,
3491 };
3492
3493 emit_base_vertex_instance_bo(cmd_buffer, addr);
3494 }
3495 }
3496
3497 static void
3498 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
3499 {
3500 struct anv_state state =
3501 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
3502
3503 ((uint32_t *)state.map)[0] = draw_index;
3504
3505 struct anv_address addr = {
3506 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3507 .offset = state.offset,
3508 };
3509
3510 emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
3511 }
3512
3513 static void
3514 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
3515 uint32_t access_type)
3516 {
3517 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3518 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3519
3520 uint64_t vb_used = pipeline->vb_used;
3521 if (vs_prog_data->uses_firstvertex ||
3522 vs_prog_data->uses_baseinstance)
3523 vb_used |= 1ull << ANV_SVGS_VB_INDEX;
3524 if (vs_prog_data->uses_drawid)
3525 vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
3526
3527 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
3528 access_type == RANDOM,
3529 vb_used);
3530 }
3531
3532 ALWAYS_INLINE static void
3533 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
3534 const struct elk_vs_prog_data *vs_prog_data,
3535 uint32_t base_vertex,
3536 uint32_t base_instance,
3537 uint32_t draw_id,
3538 bool force_flush)
3539 {
3540 bool emitted = false;
3541 if (vs_prog_data->uses_firstvertex ||
3542 vs_prog_data->uses_baseinstance) {
3543 emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
3544 emitted = true;
3545 }
3546 if (vs_prog_data->uses_drawid) {
3547 emit_draw_index(cmd_buffer, draw_id);
3548 emitted = true;
3549 }
3550 /* Emitting draw index or vertex index BOs may result in needing
3551 * additional VF cache flushes.
3552 */
3553 if (emitted || force_flush)
3554 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3555 }
3556
3557 void genX(CmdDraw)(
3558 VkCommandBuffer commandBuffer,
3559 uint32_t vertexCount,
3560 uint32_t instanceCount,
3561 uint32_t firstVertex,
3562 uint32_t firstInstance)
3563 {
3564 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3565 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3566 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3567
3568 if (anv_batch_has_error(&cmd_buffer->batch))
3569 return;
3570
3571 const uint32_t count =
3572 vertexCount * instanceCount * pipeline->instance_multiplier;
3573 anv_measure_snapshot(cmd_buffer,
3574 INTEL_SNAPSHOT_DRAW,
3575 "draw", count);
3576 trace_intel_begin_draw(&cmd_buffer->trace);
3577
3578 /* Select pipeline here to allow
3579 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3580 * cmd_buffer_flush_gfx_state().
3581 */
3582 genX(flush_pipeline_select_3d)(cmd_buffer);
3583
3584 if (cmd_buffer->state.conditional_render_enabled)
3585 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3586
3587 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3588 firstVertex, firstInstance, 0,
3589 false /* force_flush */);
3590
3591 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3592
3593 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3594 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3595 prim.VertexAccessType = SEQUENTIAL;
3596 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3597 prim.VertexCountPerInstance = vertexCount;
3598 prim.StartVertexLocation = firstVertex;
3599 prim.InstanceCount = instanceCount *
3600 pipeline->instance_multiplier;
3601 prim.StartInstanceLocation = firstInstance;
3602 prim.BaseVertexLocation = 0;
3603 }
3604
3605 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3606
3607 trace_intel_end_draw(&cmd_buffer->trace, count);
3608 }
3609
3610 void genX(CmdDrawMultiEXT)(
3611 VkCommandBuffer commandBuffer,
3612 uint32_t drawCount,
3613 const VkMultiDrawInfoEXT *pVertexInfo,
3614 uint32_t instanceCount,
3615 uint32_t firstInstance,
3616 uint32_t stride)
3617 {
3618 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3619 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3620 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3621
3622 if (anv_batch_has_error(&cmd_buffer->batch))
3623 return;
3624
3625 const uint32_t count =
3626 drawCount * instanceCount * pipeline->instance_multiplier;
3627 anv_measure_snapshot(cmd_buffer,
3628 INTEL_SNAPSHOT_DRAW,
3629 "draw_multi", count);
3630 trace_intel_begin_draw_multi(&cmd_buffer->trace);
3631
3632 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3633
3634 if (cmd_buffer->state.conditional_render_enabled)
3635 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3636
3637 uint32_t i = 0;
3638 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
3639 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3640 draw->firstVertex,
3641 firstInstance, i, !i);
3642
3643 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3644 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3645 prim.VertexAccessType = SEQUENTIAL;
3646 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3647 prim.VertexCountPerInstance = draw->vertexCount;
3648 prim.StartVertexLocation = draw->firstVertex;
3649 prim.InstanceCount = instanceCount *
3650 pipeline->instance_multiplier;
3651 prim.StartInstanceLocation = firstInstance;
3652 prim.BaseVertexLocation = 0;
3653 }
3654 }
3655
3656 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3657
3658 trace_intel_end_draw_multi(&cmd_buffer->trace, count);
3659 }
3660
3661 void genX(CmdDrawIndexed)(
3662 VkCommandBuffer commandBuffer,
3663 uint32_t indexCount,
3664 uint32_t instanceCount,
3665 uint32_t firstIndex,
3666 int32_t vertexOffset,
3667 uint32_t firstInstance)
3668 {
3669 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3670 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3671 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3672
3673 if (anv_batch_has_error(&cmd_buffer->batch))
3674 return;
3675
3676 const uint32_t count =
3677 indexCount * instanceCount * pipeline->instance_multiplier;
3678 anv_measure_snapshot(cmd_buffer,
3679 INTEL_SNAPSHOT_DRAW,
3680 "draw indexed",
3681 count);
3682 trace_intel_begin_draw_indexed(&cmd_buffer->trace);
3683
3684 /* Select pipeline here to allow
3685 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3686 * cmd_buffer_flush_gfx_state().
3687 */
3688 genX(flush_pipeline_select_3d)(cmd_buffer);
3689
3690 if (cmd_buffer->state.conditional_render_enabled)
3691 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3692
3693 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3694 vertexOffset, firstInstance,
3695 0, false /* force_flush */);
3696
3697 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3698
3699 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3700 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3701 prim.VertexAccessType = RANDOM;
3702 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3703 prim.VertexCountPerInstance = indexCount;
3704 prim.StartVertexLocation = firstIndex;
3705 prim.InstanceCount = instanceCount *
3706 pipeline->instance_multiplier;
3707 prim.StartInstanceLocation = firstInstance;
3708 prim.BaseVertexLocation = vertexOffset;
3709 }
3710
3711 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3712
3713 trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
3714 }
3715
3716 void genX(CmdDrawMultiIndexedEXT)(
3717 VkCommandBuffer commandBuffer,
3718 uint32_t drawCount,
3719 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3720 uint32_t instanceCount,
3721 uint32_t firstInstance,
3722 uint32_t stride,
3723 const int32_t *pVertexOffset)
3724 {
3725 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3726 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3727 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3728
3729 if (anv_batch_has_error(&cmd_buffer->batch))
3730 return;
3731
3732 const uint32_t count =
3733 drawCount * instanceCount * pipeline->instance_multiplier;
3734 anv_measure_snapshot(cmd_buffer,
3735 INTEL_SNAPSHOT_DRAW,
3736 "draw indexed_multi",
3737 count);
3738 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
3739
3740 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3741
3742 if (cmd_buffer->state.conditional_render_enabled)
3743 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3744
3745 uint32_t i = 0;
3746 if (pVertexOffset) {
3747 if (vs_prog_data->uses_drawid) {
3748 bool emitted = true;
3749 if (vs_prog_data->uses_firstvertex ||
3750 vs_prog_data->uses_baseinstance) {
3751 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3752 emitted = true;
3753 }
3754 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3755 if (vs_prog_data->uses_drawid) {
3756 emit_draw_index(cmd_buffer, i);
3757 emitted = true;
3758 }
3759 /* Emitting draw index or vertex index BOs may result in needing
3760 * additional VF cache flushes.
3761 */
3762 if (emitted)
3763 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3764
3765 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3766 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3767 prim.VertexAccessType = RANDOM;
3768 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3769 prim.VertexCountPerInstance = draw->indexCount;
3770 prim.StartVertexLocation = draw->firstIndex;
3771 prim.InstanceCount = instanceCount *
3772 pipeline->instance_multiplier;
3773 prim.StartInstanceLocation = firstInstance;
3774 prim.BaseVertexLocation = *pVertexOffset;
3775 }
3776 emitted = false;
3777 }
3778 } else {
3779 if (vs_prog_data->uses_firstvertex ||
3780 vs_prog_data->uses_baseinstance) {
3781 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3782 /* Emitting draw index or vertex index BOs may result in needing
3783 * additional VF cache flushes.
3784 */
3785 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3786 }
3787 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3788 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3789 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3790 prim.VertexAccessType = RANDOM;
3791 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3792 prim.VertexCountPerInstance = draw->indexCount;
3793 prim.StartVertexLocation = draw->firstIndex;
3794 prim.InstanceCount = instanceCount *
3795 pipeline->instance_multiplier;
3796 prim.StartInstanceLocation = firstInstance;
3797 prim.BaseVertexLocation = *pVertexOffset;
3798 }
3799 }
3800 }
3801 } else {
3802 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3803 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3804 draw->vertexOffset,
3805 firstInstance, i, i != 0);
3806
3807 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3808 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3809 prim.VertexAccessType = RANDOM;
3810 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3811 prim.VertexCountPerInstance = draw->indexCount;
3812 prim.StartVertexLocation = draw->firstIndex;
3813 prim.InstanceCount = instanceCount *
3814 pipeline->instance_multiplier;
3815 prim.StartInstanceLocation = firstInstance;
3816 prim.BaseVertexLocation = draw->vertexOffset;
3817 }
3818 }
3819 }
3820
3821 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3822
3823 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
3824 }
3825
3826 /* Auto-Draw / Indirect Registers */
3827 #define GFX7_3DPRIM_END_OFFSET 0x2420
3828 #define GFX7_3DPRIM_START_VERTEX 0x2430
3829 #define GFX7_3DPRIM_VERTEX_COUNT 0x2434
3830 #define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
3831 #define GFX7_3DPRIM_START_INSTANCE 0x243C
3832 #define GFX7_3DPRIM_BASE_VERTEX 0x2440
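/* These MMIO registers hold the parameters of an indirect 3DPRIMITIVE: when
 * IndirectParameterEnable is set, the command streamer takes vertex count,
 * instance count, start vertex, start instance and base vertex from them
 * instead of from the packet itself. The indirect draw paths below fill them
 * either from memory (MI_LOAD_REGISTER_MEM via mi_store) or with immediates.
 */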
3833
3834 void genX(CmdDrawIndirectByteCountEXT)(
3835 VkCommandBuffer commandBuffer,
3836 uint32_t instanceCount,
3837 uint32_t firstInstance,
3838 VkBuffer counterBuffer,
3839 VkDeviceSize counterBufferOffset,
3840 uint32_t counterOffset,
3841 uint32_t vertexStride)
3842 {
3843 #if GFX_VERx10 >= 75
3844 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3845 ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
3846 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3847 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3848
3849 /* firstVertex is always zero for this draw function */
3850 const uint32_t firstVertex = 0;
3851
3852 if (anv_batch_has_error(&cmd_buffer->batch))
3853 return;
3854
3855 anv_measure_snapshot(cmd_buffer,
3856 INTEL_SNAPSHOT_DRAW,
3857 "draw indirect byte count",
3858 instanceCount * pipeline->instance_multiplier);
3859 trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
3860
3861 /* Select pipeline here to allow
3862 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3863 * emit_base_vertex_instance() & emit_draw_index().
3864 */
3865 genX(flush_pipeline_select_3d)(cmd_buffer);
3866
3867 if (cmd_buffer->state.conditional_render_enabled)
3868 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3869
3870 if (vs_prog_data->uses_firstvertex ||
3871 vs_prog_data->uses_baseinstance)
3872 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
3873 if (vs_prog_data->uses_drawid)
3874 emit_draw_index(cmd_buffer, 0);
3875
3876 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3877
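   /* Derive the vertex count from the transform feedback counter:
    * vertexCount = (counter - counterOffset) / vertexStride, which is what
    * the MI math below computes before handing the result to 3DPRIMITIVE.
    */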
3878 struct mi_builder b;
3879 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3880 struct mi_value count =
3881 mi_mem32(anv_address_add(counter_buffer->address,
3882 counterBufferOffset));
3883 if (counterOffset)
3884 count = mi_isub(&b, count, mi_imm(counterOffset));
3885 count = mi_udiv32_imm(&b, count, vertexStride);
3886 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
3887
3888 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
3889 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
3890 mi_imm(instanceCount * pipeline->instance_multiplier));
3891 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
3892 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3893
3894 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3895 prim.IndirectParameterEnable = true;
3896 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3897 prim.VertexAccessType = SEQUENTIAL;
3898 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3899 }
3900
3901 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3902
3903 trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
3904 instanceCount * pipeline->instance_multiplier);
3905 #endif /* GFX_VERx10 >= 75 */
3906 }
3907
3908 static void
3909 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
3910 struct anv_address addr,
3911 bool indexed)
3912 {
3913 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3914
3915 struct mi_builder b;
3916 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3917
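   /* The indirect buffer holds a VkDrawIndirectCommand
    * { vertexCount, instanceCount, firstVertex, firstInstance } or a
    * VkDrawIndexedIndirectCommand
    * { indexCount, instanceCount, firstIndex, vertexOffset, firstInstance },
    * which is where the byte offsets 0/4/8/12/16 used below come from.
    */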
3918 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
3919 mi_mem32(anv_address_add(addr, 0)));
3920
3921 struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
3922 if (pipeline->instance_multiplier > 1) {
3923 #if GFX_VERx10 >= 75
3924 instance_count = mi_imul_imm(&b, instance_count,
3925 pipeline->instance_multiplier);
3926 #else
3927 anv_finishme("Multiview + indirect draw requires MI_MATH; "
3928 "MI_MATH is not supported on Ivy Bridge");
3929 #endif
3930 }
3931 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
3932
3933 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
3934 mi_mem32(anv_address_add(addr, 8)));
3935
3936 if (indexed) {
3937 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
3938 mi_mem32(anv_address_add(addr, 12)));
3939 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3940 mi_mem32(anv_address_add(addr, 16)));
3941 } else {
3942 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3943 mi_mem32(anv_address_add(addr, 12)));
3944 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3945 }
3946 }
3947
3948 void genX(CmdDrawIndirect)(
3949 VkCommandBuffer commandBuffer,
3950 VkBuffer _buffer,
3951 VkDeviceSize offset,
3952 uint32_t drawCount,
3953 uint32_t stride)
3954 {
3955 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3956 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
3957 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3958 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3959
3960 if (anv_batch_has_error(&cmd_buffer->batch))
3961 return;
3962
3963 anv_measure_snapshot(cmd_buffer,
3964 INTEL_SNAPSHOT_DRAW,
3965 "draw indirect",
3966 drawCount);
3967 trace_intel_begin_draw_indirect(&cmd_buffer->trace);
3968
3969 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3970
3971 if (cmd_buffer->state.conditional_render_enabled)
3972 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3973
3974 for (uint32_t i = 0; i < drawCount; i++) {
3975 struct anv_address draw = anv_address_add(buffer->address, offset);
3976
3977 if (vs_prog_data->uses_firstvertex ||
3978 vs_prog_data->uses_baseinstance)
3979 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
3980 if (vs_prog_data->uses_drawid)
3981 emit_draw_index(cmd_buffer, i);
3982
3983 /* Emitting draw index or vertex index BOs may result in needing
3984 * additional VF cache flushes.
3985 */
3986 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3987
3988 load_indirect_parameters(cmd_buffer, draw, false);
3989
3990 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3991 prim.IndirectParameterEnable = true;
3992 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3993 prim.VertexAccessType = SEQUENTIAL;
3994 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3995 }
3996
3997 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3998
3999 offset += stride;
4000 }
4001
4002 trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
4003 }
4004
4005 void genX(CmdDrawIndexedIndirect)(
4006 VkCommandBuffer commandBuffer,
4007 VkBuffer _buffer,
4008 VkDeviceSize offset,
4009 uint32_t drawCount,
4010 uint32_t stride)
4011 {
4012 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4013 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4014 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4015 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4016
4017 if (anv_batch_has_error(&cmd_buffer->batch))
4018 return;
4019
4020 anv_measure_snapshot(cmd_buffer,
4021 INTEL_SNAPSHOT_DRAW,
4022 "draw indexed indirect",
4023 drawCount);
4024 trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
4025
4026 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4027
4028 if (cmd_buffer->state.conditional_render_enabled)
4029 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4030
4031 for (uint32_t i = 0; i < drawCount; i++) {
4032 struct anv_address draw = anv_address_add(buffer->address, offset);
4033
4034 /* TODO: We need to stomp base vertex to 0 somehow */
4035 if (vs_prog_data->uses_firstvertex ||
4036 vs_prog_data->uses_baseinstance)
4037 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4038 if (vs_prog_data->uses_drawid)
4039 emit_draw_index(cmd_buffer, i);
4040
4041 /* Emitting draw index or vertex index BOs may result in needing
4042 * additional VF cache flushes.
4043 */
4044 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4045
4046 load_indirect_parameters(cmd_buffer, draw, true);
4047
4048 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4049 prim.IndirectParameterEnable = true;
4050 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4051 prim.VertexAccessType = RANDOM;
4052 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4053 }
4054
4055 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4056
4057 offset += stride;
4058 }
4059
4060 trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
4061 }
4062
4063 static struct mi_value
4064 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4065 struct mi_builder *b,
4066 struct anv_address count_address)
4067 {
4068 struct mi_value ret = mi_imm(0);
4069
4070 if (cmd_buffer->state.conditional_render_enabled) {
4071 #if GFX_VERx10 >= 75
4072 ret = mi_new_gpr(b);
4073 mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4074 #endif
4075 } else {
4076 /* Upload the current draw count from the draw parameters buffer to
4077 * MI_PREDICATE_SRC0.
4078 */
4079 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
4080 mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
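      /* The upper DWord of MI_PREDICATE_SRC1 stays zero; the per-draw index
       * is loaded into its lower DWord later by emit_draw_count_predicate().
       */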
4081 }
4082
4083 return ret;
4084 }
4085
4086 static void
4087 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4088 struct mi_builder *b,
4089 uint32_t draw_index)
4090 {
4091 /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4092 mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4093
4094 if (draw_index == 0) {
4095 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4096 mip.LoadOperation = LOAD_LOADINV;
4097 mip.CombineOperation = COMBINE_SET;
4098 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4099 }
4100 } else {
4101 /* While draw_index < draw_count the predicate's result will be
4102 * (draw_index == draw_count) ^ TRUE = TRUE
4103 * When draw_index == draw_count the result is
4104 * (TRUE) ^ TRUE = FALSE
4105 * After this all results will be:
4106 * (FALSE) ^ FALSE = FALSE
4107 */
4108 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4109 mip.LoadOperation = LOAD_LOAD;
4110 mip.CombineOperation = COMBINE_XOR;
4111 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4112 }
4113 }
4114 }
4115
4116 #if GFX_VERx10 >= 75
4117 static void
4118 emit_draw_count_predicate_with_conditional_render(
4119 struct anv_cmd_buffer *cmd_buffer,
4120 struct mi_builder *b,
4121 uint32_t draw_index,
4122 struct mi_value max)
4123 {
4124 struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4125 pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
4126
4127 #if GFX_VER >= 8
4128 mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4129 #else
4130 /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
4131 * so we emit MI_PREDICATE to set it.
4132 */
4133
4134 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4135 mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4136
4137 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4138 mip.LoadOperation = LOAD_LOADINV;
4139 mip.CombineOperation = COMBINE_SET;
4140 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4141 }
4142 #endif
4143 }
4144 #endif
4145
4146 static void
4147 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4148 struct mi_builder *b,
4149 uint32_t draw_index,
4150 struct mi_value max)
4151 {
4152 #if GFX_VERx10 >= 75
4153 if (cmd_buffer->state.conditional_render_enabled) {
4154 emit_draw_count_predicate_with_conditional_render(
4155 cmd_buffer, b, draw_index, mi_value_ref(b, max));
4156 } else {
4157 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4158 }
4159 #else
4160 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4161 #endif
4162 }
4163
4164 void genX(CmdDrawIndirectCount)(
4165 VkCommandBuffer commandBuffer,
4166 VkBuffer _buffer,
4167 VkDeviceSize offset,
4168 VkBuffer _countBuffer,
4169 VkDeviceSize countBufferOffset,
4170 uint32_t maxDrawCount,
4171 uint32_t stride)
4172 {
4173 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4174 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4175 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4176 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4177 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4178 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4179
4180 if (anv_batch_has_error(&cmd_buffer->batch))
4181 return;
4182
4183 anv_measure_snapshot(cmd_buffer,
4184 INTEL_SNAPSHOT_DRAW,
4185 "draw indirect count",
4186 0);
4187 trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
4188
4189 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4190
4191 struct mi_builder b;
4192 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4193 struct anv_address count_address =
4194 anv_address_add(count_buffer->address, countBufferOffset);
4195 struct mi_value max =
4196 prepare_for_draw_count_predicate(cmd_buffer, &b, count_address);
4197
4198 for (uint32_t i = 0; i < maxDrawCount; i++) {
4199 struct anv_address draw = anv_address_add(buffer->address, offset);
4200
4201 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4202
4203 if (vs_prog_data->uses_firstvertex ||
4204 vs_prog_data->uses_baseinstance)
4205 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4206 if (vs_prog_data->uses_drawid)
4207 emit_draw_index(cmd_buffer, i);
4208
4209 /* Emitting draw index or vertex index BOs may result in needing
4210 * additional VF cache flushes.
4211 */
4212 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4213
4214 load_indirect_parameters(cmd_buffer, draw, false);
4215
4216 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4217 prim.IndirectParameterEnable = true;
4218 prim.PredicateEnable = true;
4219 prim.VertexAccessType = SEQUENTIAL;
4220 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4221 }
4222
4223 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4224
4225 offset += stride;
4226 }
4227
4228 mi_value_unref(&b, max);
4229
4230 trace_intel_end_draw_indirect_count(&cmd_buffer->trace,
4231 anv_address_utrace(count_address));
4232 }
4233
4234 void genX(CmdDrawIndexedIndirectCount)(
4235 VkCommandBuffer commandBuffer,
4236 VkBuffer _buffer,
4237 VkDeviceSize offset,
4238 VkBuffer _countBuffer,
4239 VkDeviceSize countBufferOffset,
4240 uint32_t maxDrawCount,
4241 uint32_t stride)
4242 {
4243 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4244 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4245 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4246 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4247 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4248 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4249
4250 if (anv_batch_has_error(&cmd_buffer->batch))
4251 return;
4252
4253 anv_measure_snapshot(cmd_buffer,
4254 INTEL_SNAPSHOT_DRAW,
4255 "draw indexed indirect count",
4256 0);
4257 trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
4258
4259 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4260
4261 struct mi_builder b;
4262 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4263 struct anv_address count_address =
4264 anv_address_add(count_buffer->address, countBufferOffset);
4265 struct mi_value max =
4266 prepare_for_draw_count_predicate(cmd_buffer, &b, count_address);
4267
4268 for (uint32_t i = 0; i < maxDrawCount; i++) {
4269 struct anv_address draw = anv_address_add(buffer->address, offset);
4270
4271 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4272
4273 /* TODO: We need to stomp base vertex to 0 somehow */
4274 if (vs_prog_data->uses_firstvertex ||
4275 vs_prog_data->uses_baseinstance)
4276 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4277 if (vs_prog_data->uses_drawid)
4278 emit_draw_index(cmd_buffer, i);
4279
4280 /* Emitting draw index or vertex index BOs may result in needing
4281 * additional VF cache flushes.
4282 */
4283 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4284
4285 load_indirect_parameters(cmd_buffer, draw, true);
4286
4287 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4288 prim.IndirectParameterEnable = true;
4289 prim.PredicateEnable = true;
4290 prim.VertexAccessType = RANDOM;
4291 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4292 }
4293
4294 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4295
4296 offset += stride;
4297 }
4298
4299 mi_value_unref(&b, max);
4300
4301 trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
4302 anv_address_utrace(count_address));
4303 }
4304
4305 void genX(CmdBeginTransformFeedbackEXT)(
4306 VkCommandBuffer commandBuffer,
4307 uint32_t firstCounterBuffer,
4308 uint32_t counterBufferCount,
4309 const VkBuffer* pCounterBuffers,
4310 const VkDeviceSize* pCounterBufferOffsets)
4311 {
4312 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4313
4314 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4315 assert(counterBufferCount <= MAX_XFB_BUFFERS);
4316 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4317
4318 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4319 *
4320 * "Ssoftware must ensure that no HW stream output operations can be in
4321 * process or otherwise pending at the point that the MI_LOAD/STORE
4322 * commands are processed. This will likely require a pipeline flush."
4323 */
4324 anv_add_pending_pipe_bits(cmd_buffer,
4325 ANV_PIPE_CS_STALL_BIT,
4326 "begin transform feedback");
4327 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4328
4329 for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4330 /* If we have a counter buffer, this is a resume so we need to load the
4331 * value into the streamout offset register. Otherwise, this is a begin
4332 * and we need to reset it to zero.
4333 */
4334 if (pCounterBuffers &&
4335 idx >= firstCounterBuffer &&
4336 idx - firstCounterBuffer < counterBufferCount &&
4337 pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4338 uint32_t cb_idx = idx - firstCounterBuffer;
4339 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4340 uint64_t offset = pCounterBufferOffsets ?
4341 pCounterBufferOffsets[cb_idx] : 0;
4342
4343 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4344 lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4345 lrm.MemoryAddress = anv_address_add(counter_buffer->address,
4346 offset);
4347 }
4348 } else {
4349 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4350 lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4351 lri.DataDWord = 0;
4352 }
4353 }
4354 }
4355
4356 cmd_buffer->state.xfb_enabled = true;
4357 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4358 }
4359
4360 void genX(CmdEndTransformFeedbackEXT)(
4361 VkCommandBuffer commandBuffer,
4362 uint32_t firstCounterBuffer,
4363 uint32_t counterBufferCount,
4364 const VkBuffer* pCounterBuffers,
4365 const VkDeviceSize* pCounterBufferOffsets)
4366 {
4367 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4368
4369 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4370 assert(counterBufferCount <= MAX_XFB_BUFFERS);
4371 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4372
4373 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4374 *
4375 * "Ssoftware must ensure that no HW stream output operations can be in
4376 * process or otherwise pending at the point that the MI_LOAD/STORE
4377 * commands are processed. This will likely require a pipeline flush."
4378 */
4379 anv_add_pending_pipe_bits(cmd_buffer,
4380 ANV_PIPE_CS_STALL_BIT,
4381 "end transform feedback");
4382 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4383
4384 for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
4385 unsigned idx = firstCounterBuffer + cb_idx;
4386
4387 /* If we have a counter buffer, this is a pause so we need to store the
4388 * current streamout offset into the counter buffer so that a later
4389 * resume can reload it. Otherwise, there is nothing to save.
4390 */
4391 if (pCounterBuffers &&
4392 cb_idx < counterBufferCount &&
4393 pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
4394 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4395 uint64_t offset = pCounterBufferOffsets ?
4396 pCounterBufferOffsets[cb_idx] : 0;
4397
4398 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
4399 srm.MemoryAddress = anv_address_add(counter_buffer->address,
4400 offset);
4401 srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4402 }
4403 }
4404 }
4405
4406 cmd_buffer->state.xfb_enabled = false;
4407 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4408 }
4409
4410 static void
4411 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
4412 {
4413 struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
4414 struct anv_compute_pipeline *pipeline = comp_state->pipeline;
4415
4416 assert(pipeline->cs);
4417
4418 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
4419
4420 genX(flush_pipeline_select_gpgpu)(cmd_buffer);
4421
4422 /* Apply any pending pipeline flushes we may have. We want to apply them
4423 * now because, if any of those flushes are for things like push constants,
4424 * the GPU will read the state at weird times.
4425 */
4426 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4427
4428 if (cmd_buffer->state.compute.pipeline_dirty) {
4429 /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4430 *
4431 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4432 * the only bits that are changed are scoreboard related: Scoreboard
4433 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
4434 * these scoreboard related states, a MEDIA_STATE_FLUSH is
4435 * sufficient."
4436 */
4437 anv_add_pending_pipe_bits(cmd_buffer,
4438 ANV_PIPE_CS_STALL_BIT,
4439 "flush compute state");
4440 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4441
4442 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
4443
4444 /* The workgroup size of the pipeline affects our push constant layout
4445 * so flag push constants as dirty if we change the pipeline.
4446 */
4447 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4448 }
4449
4450 if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
4451 cmd_buffer->state.compute.pipeline_dirty) {
4452 flush_descriptor_sets(cmd_buffer,
4453 &cmd_buffer->state.compute.base,
4454 VK_SHADER_STAGE_COMPUTE_BIT,
4455 &pipeline->cs, 1);
4456 cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4457
4458 uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
4459 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
4460 .BindingTablePointer =
4461 cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
4462 .SamplerStatePointer =
4463 cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
4464 };
4465 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
4466
4467 struct anv_state state =
4468 anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
4469 pipeline->interface_descriptor_data,
4470 GENX(INTERFACE_DESCRIPTOR_DATA_length),
4471 64);
4472
4473 uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4474 anv_batch_emit(&cmd_buffer->batch,
4475 GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
4476 mid.InterfaceDescriptorTotalLength = size;
4477 mid.InterfaceDescriptorDataStartAddress = state.offset;
4478 }
4479 }
4480
4481 if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
4482 comp_state->push_data =
4483 anv_cmd_buffer_cs_push_constants(cmd_buffer);
4484
4485 if (comp_state->push_data.alloc_size) {
4486 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
4487 curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
4488 curbe.CURBEDataStartAddress = comp_state->push_data.offset;
4489 }
4490 }
4491
4492 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4493 }
4494
4495 cmd_buffer->state.compute.pipeline_dirty = false;
4496
4497 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4498 }
4499
4500 #if GFX_VER == 7
4501
4502 static VkResult
4503 verify_cmd_parser(const struct anv_device *device,
4504 int required_version,
4505 const char *function)
4506 {
4507 if (device->physical->cmd_parser_version < required_version) {
4508 return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
4509 "cmd parser version %d is required for %s",
4510 required_version, function);
4511 } else {
4512 return VK_SUCCESS;
4513 }
4514 }
4515
4516 #endif
4517
4518 static void
4519 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
4520 uint32_t baseGroupX,
4521 uint32_t baseGroupY,
4522 uint32_t baseGroupZ)
4523 {
4524 if (anv_batch_has_error(&cmd_buffer->batch))
4525 return;
4526
4527 struct anv_push_constants *push =
4528 &cmd_buffer->state.compute.base.push_constants;
4529 if (push->cs.base_work_group_id[0] != baseGroupX ||
4530 push->cs.base_work_group_id[1] != baseGroupY ||
4531 push->cs.base_work_group_id[2] != baseGroupZ) {
4532 push->cs.base_work_group_id[0] = baseGroupX;
4533 push->cs.base_work_group_id[1] = baseGroupY;
4534 push->cs.base_work_group_id[2] = baseGroupZ;
4535
4536 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4537 }
4538 }
4539
4540 static inline void
4541 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
4542 const struct anv_compute_pipeline *pipeline, bool indirect,
4543 const struct elk_cs_prog_data *prog_data,
4544 uint32_t groupCountX, uint32_t groupCountY,
4545 uint32_t groupCountZ)
4546 {
4547 bool predicate = (GFX_VER <= 7 && indirect) ||
4548 cmd_buffer->state.conditional_render_enabled;
4549
4550 const struct intel_device_info *devinfo = pipeline->base.device->info;
4551 const struct intel_cs_dispatch_info dispatch =
4552 elk_cs_get_dispatch_info(devinfo, prog_data, NULL);
4553
4554 anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
4555 ggw.IndirectParameterEnable = indirect;
4556 ggw.PredicateEnable = predicate;
4557 ggw.SIMDSize = dispatch.simd_size / 16;
4558 ggw.ThreadDepthCounterMaximum = 0;
4559 ggw.ThreadHeightCounterMaximum = 0;
4560 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
4561 ggw.ThreadGroupIDXDimension = groupCountX;
4562 ggw.ThreadGroupIDYDimension = groupCountY;
4563 ggw.ThreadGroupIDZDimension = groupCountZ;
4564 ggw.RightExecutionMask = dispatch.right_mask;
4565 ggw.BottomExecutionMask = 0xffffffff;
4566 }
4567
4568 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
4569 }
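/* A worked example with hypothetical numbers: a local workgroup size of
 * 8x8x1 (64 invocations) compiled at SIMD16 gives dispatch.threads =
 * 64 / 16 = 4, so SIMDSize = 16 / 16 = 1, ThreadWidthCounterMaximum = 3,
 * and a right execution mask covering all 16 channels of the final thread.
 * The actual values always come from elk_cs_get_dispatch_info() above.
 */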
4570
4571 static inline void
4572 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
4573 const struct anv_compute_pipeline *pipeline, bool indirect,
4574 const struct elk_cs_prog_data *prog_data,
4575 uint32_t groupCountX, uint32_t groupCountY,
4576 uint32_t groupCountZ)
4577 {
4578 emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
4579 groupCountY, groupCountZ);
4580 }
4581
4582 void genX(CmdDispatchBase)(
4583 VkCommandBuffer commandBuffer,
4584 uint32_t baseGroupX,
4585 uint32_t baseGroupY,
4586 uint32_t baseGroupZ,
4587 uint32_t groupCountX,
4588 uint32_t groupCountY,
4589 uint32_t groupCountZ)
4590 {
4591 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4592 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4593 const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4594
4595 anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
4596 baseGroupY, baseGroupZ);
4597
4598 if (anv_batch_has_error(&cmd_buffer->batch))
4599 return;
4600
4601 anv_measure_snapshot(cmd_buffer,
4602 INTEL_SNAPSHOT_COMPUTE,
4603 "compute",
4604 groupCountX * groupCountY * groupCountZ *
4605 prog_data->local_size[0] * prog_data->local_size[1] *
4606 prog_data->local_size[2]);
4607
4608 trace_intel_begin_compute(&cmd_buffer->trace);
4609
4610 if (prog_data->uses_num_work_groups) {
4611 struct anv_state state =
4612 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
4613 uint32_t *sizes = state.map;
4614 sizes[0] = groupCountX;
4615 sizes[1] = groupCountY;
4616 sizes[2] = groupCountZ;
4617 cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
4618 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4619 .offset = state.offset,
4620 };
4621
4622 /* The num_workgroups buffer goes in the binding table */
4623 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4624 }
4625
4626 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4627
4628 if (cmd_buffer->state.conditional_render_enabled)
4629 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4630
4631 emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
4632 groupCountY, groupCountZ);
4633
4634 trace_intel_end_compute(&cmd_buffer->trace,
4635 groupCountX, groupCountY, groupCountZ);
4636 }
4637
4638 #define GPGPU_DISPATCHDIMX 0x2500
4639 #define GPGPU_DISPATCHDIMY 0x2504
4640 #define GPGPU_DISPATCHDIMZ 0x2508
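/* These MMIO registers receive the indirect dispatch dimensions. The
 * mi_mem32() loads below assume the standard VkDispatchIndirectCommand
 * layout: three packed uint32 values (x at +0, y at +4, z at +8) starting
 * at the supplied buffer offset.
 */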
4641
4642 void genX(CmdDispatchIndirect)(
4643 VkCommandBuffer commandBuffer,
4644 VkBuffer _buffer,
4645 VkDeviceSize offset)
4646 {
4647 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4648 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4649 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4650 const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4651 struct anv_address addr = anv_address_add(buffer->address, offset);
4652 UNUSED struct anv_batch *batch = &cmd_buffer->batch;
4653
4654 anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
4655
4656 #if GFX_VER == 7
4657 /* Linux 4.4 added command parser version 5 which allows the GPGPU
4658 * indirect dispatch registers to be written.
4659 */
4660 if (verify_cmd_parser(cmd_buffer->device, 5,
4661 "vkCmdDispatchIndirect") != VK_SUCCESS)
4662 return;
4663 #endif
4664
4665 anv_measure_snapshot(cmd_buffer,
4666 INTEL_SNAPSHOT_COMPUTE,
4667 "compute indirect",
4668 0);
4669 trace_intel_begin_compute(&cmd_buffer->trace);
4670
4671 if (prog_data->uses_num_work_groups) {
4672 cmd_buffer->state.compute.num_workgroups = addr;
4673
4674 /* The num_workgroups buffer goes in the binding table */
4675 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4676 }
4677
4678 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4679
4680 struct mi_builder b;
4681 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4682
4683 struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
4684 struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
4685 struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
4686
4687 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
4688 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
4689 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
4690
4691 #if GFX_VER <= 7
4692 /* predicate = (compute_dispatch_indirect_x_size == 0); */
4693 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
4694 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4695 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4696 mip.LoadOperation = LOAD_LOAD;
4697 mip.CombineOperation = COMBINE_SET;
4698 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4699 }
4700
4701 /* predicate |= (compute_dispatch_indirect_y_size == 0); */
4702 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
4703 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4704 mip.LoadOperation = LOAD_LOAD;
4705 mip.CombineOperation = COMBINE_OR;
4706 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4707 }
4708
4709 /* predicate |= (compute_dispatch_indirect_z_size == 0); */
4710 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
4711 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4712 mip.LoadOperation = LOAD_LOAD;
4713 mip.CombineOperation = COMBINE_OR;
4714 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4715 }
4716
4717 /* predicate = !predicate; */
4718 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4719 mip.LoadOperation = LOAD_LOADINV;
4720 mip.CombineOperation = COMBINE_OR;
4721 mip.CompareOperation = COMPARE_FALSE;
4722 }
4723
4724 #if GFX_VERx10 == 75
4725 if (cmd_buffer->state.conditional_render_enabled) {
4726 /* predicate &= !(conditional_rendering_predicate == 0); */
4727 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
4728 mi_reg32(ANV_PREDICATE_RESULT_REG));
4729 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4730 mip.LoadOperation = LOAD_LOADINV;
4731 mip.CombineOperation = COMBINE_AND;
4732 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4733 }
4734 }
4735 #endif
4736
4737 #else /* GFX_VER > 7 */
4738 if (cmd_buffer->state.conditional_render_enabled)
4739 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4740 #endif
4741
4742 emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
4743
4744 trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
4745 }
4746
4747 static void
4748 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
4749 uint32_t pipeline)
4750 {
4751 UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
4752
4753 if (cmd_buffer->state.current_pipeline == pipeline)
4754 return;
4755
4756 #if GFX_VER >= 8
4757 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
4758 *
4759 * Software must clear the COLOR_CALC_STATE Valid field in
4760 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
4761 * with Pipeline Select set to GPGPU.
4762 *
4763 * The internal hardware docs recommend the same workaround for Gfx9
4764 * hardware too.
4765 */
4766 if (pipeline == GPGPU)
4767 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
4768 #endif
4769
4770 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
4771 * PIPELINE_SELECT [DevBWR+]":
4772 *
4773 * Project: DEVSNB+
4774 *
4775 * Software must ensure all the write caches are flushed through a
4776 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
4777 * command to invalidate read only caches prior to programming
4778 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
4779 *
4780 * Note the cmd_buffer_apply_pipe_flushes will split this into two
4781 * PIPE_CONTROLs.
4782 */
4783 anv_add_pending_pipe_bits(cmd_buffer,
4784 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4785 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4786 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
4787 ANV_PIPE_CS_STALL_BIT |
4788 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4789 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
4790 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
4791 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
4792 ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT,
4793 "flush and invalidate for PIPELINE_SELECT");
4794 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4795
4796 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
4797 ps.PipelineSelection = pipeline;
4798 }
4799
4800 cmd_buffer->state.current_pipeline = pipeline;
4801 }
4802
4803 void
4804 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
4805 {
4806 genX(flush_pipeline_select)(cmd_buffer, _3D);
4807 }
4808
4809 void
4810 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
4811 {
4812 genX(flush_pipeline_select)(cmd_buffer, GPGPU);
4813 }
4814
4815 void
4816 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
4817 {
4818 if (GFX_VER >= 8)
4819 return;
4820
4821 /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
4822 *
4823 * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
4824 * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
4825 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
4826 * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
4827 * set), followed by a pipelined depth cache flush (PIPE_CONTROL with
4828 * Depth Flush Bit set, followed by another pipelined depth stall
4829 * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
4830 * guarantee that the pipeline from WM onwards is already flushed (e.g.,
4831 * via a preceding MI_FLUSH)."
4832 */
4833 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4834 pipe.DepthStallEnable = true;
4835 anv_debug_dump_pc(pipe);
4836 }
4837 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4838 pipe.DepthCacheFlushEnable = true;
4839 anv_debug_dump_pc(pipe);
4840 }
4841 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4842 pipe.DepthStallEnable = true;
4843 anv_debug_dump_pc(pipe);
4844 }
4845 }
4846
4847 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
4848 *
4849 * "The VF cache needs to be invalidated before binding and then using
4850 * Vertex Buffers that overlap with any previously bound Vertex Buffer
4851 * (at a 64B granularity) since the last invalidation. A VF cache
4852 * invalidate is performed by setting the "VF Cache Invalidation Enable"
4853 * bit in PIPE_CONTROL."
4854 *
4855 * This is implemented by carefully tracking all vertex and index buffer
4856 * bindings and flushing if the cache ever ends up with a range in the cache
4857 * that would exceed 4 GiB. This is implemented in three parts:
4858 *
4859 * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
4860 * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
4861 * tracking code of the new binding. If this new binding would cause
4862 * the cache to have a too-large range on the next draw call, a pipeline
4863 * stall and VF cache invalidate are added to pending_pipeline_bits.
4864 *
4865 * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
4866 * empty whenever we emit a VF invalidate.
4867 *
4868 * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
4869 * after every 3DPRIMITIVE and copies the bound range into the dirty
4870 * range for each used buffer. This has to be a separate step because
4871 * we don't always re-bind all buffers and so 1. can't know which
4872 * buffers are actually bound.
4873 */
4874 void
4875 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4876 int vb_index,
4877 struct anv_address vb_address,
4878 uint32_t vb_size)
4879 {
4880 if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4881 return;
4882
4883 struct anv_vb_cache_range *bound, *dirty;
4884 if (vb_index == -1) {
4885 bound = &cmd_buffer->state.gfx.ib_bound_range;
4886 dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4887 } else {
4888 assert(vb_index >= 0);
4889 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4890 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4891 bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4892 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4893 }
4894
4895 if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4896 vb_address,
4897 vb_size)) {
4898 anv_add_pending_pipe_bits(cmd_buffer,
4899 ANV_PIPE_CS_STALL_BIT |
4900 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4901 "vb > 32b range");
4902 }
4903 }
4904
4905 void
4906 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4907 uint32_t access_type,
4908 uint64_t vb_used)
4909 {
4910 if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4911 return;
4912
4913 if (access_type == RANDOM) {
4914 /* We have an index buffer */
4915 struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4916 struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4917
4918 anv_merge_vb_cache_range(dirty, bound);
4919 }
4920
4921 uint64_t mask = vb_used;
4922 while (mask) {
4923 int i = u_bit_scan64(&mask);
4924 assert(i >= 0);
4925 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4926 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4927
4928 struct anv_vb_cache_range *bound, *dirty;
4929 bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4930 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4931
4932 anv_merge_vb_cache_range(dirty, bound);
4933 }
4934 }
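/* A rough illustration of the tracking above, with hypothetical numbers:
 * binding VB 0 at address 0x10000 with size 0x2000 in step 1 records a
 * bound range of [0x10000, 0x12000); the next 3DPRIMITIVE folds that into
 * the dirty range in step 3. If a later binding would make the combined
 * bound+dirty span for any buffer exceed 4 GiB, step 1 queues a CS stall
 * plus VF cache invalidate, and step 2 resets the tracking once that
 * invalidate is actually emitted.
 */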
4935
4936 static void
4937 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4938 {
4939 struct anv_device *device = cmd_buffer->device;
4940 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4941
4942 /* FIXME: Width and Height are wrong */
4943
4944 genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
4945
4946 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4947 device->isl_dev.ds.size / 4);
4948 if (dw == NULL)
4949 return;
4950
4951 struct isl_view isl_view = {};
4952 struct isl_depth_stencil_hiz_emit_info info = {
4953 .view = &isl_view,
4954 .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4955 };
4956
4957 if (gfx->depth_att.iview != NULL) {
4958 isl_view = gfx->depth_att.iview->planes[0].isl;
4959 } else if (gfx->stencil_att.iview != NULL) {
4960 isl_view = gfx->stencil_att.iview->planes[0].isl;
4961 }
4962
4963 if (gfx->view_mask) {
4964 assert(isl_view.array_len == 0 ||
4965 isl_view.array_len >= util_last_bit(gfx->view_mask));
4966 isl_view.array_len = util_last_bit(gfx->view_mask);
4967 } else {
4968 assert(isl_view.array_len == 0 ||
4969 isl_view.array_len >= util_last_bit(gfx->layer_count));
4970 isl_view.array_len = gfx->layer_count;
4971 }
4972
4973 if (gfx->depth_att.iview != NULL) {
4974 const struct anv_image_view *iview = gfx->depth_att.iview;
4975 const struct anv_image *image = iview->image;
4976
4977 const uint32_t depth_plane =
4978 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4979 const struct anv_surface *depth_surface =
4980 &image->planes[depth_plane].primary_surface;
4981 const struct anv_address depth_address =
4982 anv_image_address(image, &depth_surface->memory_range);
4983
4984 info.depth_surf = &depth_surface->isl;
4985
4986 info.depth_address =
4987 anv_batch_emit_reloc(&cmd_buffer->batch,
4988 dw + device->isl_dev.ds.depth_offset / 4,
4989 depth_address.bo, depth_address.offset);
4990 info.mocs =
4991 anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
4992
4993 info.hiz_usage = gfx->depth_att.aux_usage;
4994 if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
4995 assert(isl_aux_usage_has_hiz(info.hiz_usage));
4996
4997 const struct anv_surface *hiz_surface =
4998 &image->planes[depth_plane].aux_surface;
4999 const struct anv_address hiz_address =
5000 anv_image_address(image, &hiz_surface->memory_range);
5001
5002 info.hiz_surf = &hiz_surface->isl;
5003
5004 info.hiz_address =
5005 anv_batch_emit_reloc(&cmd_buffer->batch,
5006 dw + device->isl_dev.ds.hiz_offset / 4,
5007 hiz_address.bo, hiz_address.offset);
5008
5009 info.depth_clear_value = ANV_HZ_FC_VAL;
5010 }
5011 }
5012
5013 if (gfx->stencil_att.iview != NULL) {
5014 const struct anv_image_view *iview = gfx->stencil_att.iview;
5015 const struct anv_image *image = iview->image;
5016
5017 const uint32_t stencil_plane =
5018 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5019 const struct anv_surface *stencil_surface =
5020 &image->planes[stencil_plane].primary_surface;
5021 const struct anv_address stencil_address =
5022 anv_image_address(image, &stencil_surface->memory_range);
5023
5024 info.stencil_surf = &stencil_surface->isl;
5025
5026 info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
5027 info.stencil_address =
5028 anv_batch_emit_reloc(&cmd_buffer->batch,
5029 dw + device->isl_dev.ds.stencil_offset / 4,
5030 stencil_address.bo, stencil_address.offset);
5031 info.mocs =
5032 anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
5033 }
5034
5035 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5036
5037 cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5038 }
5039
5040 static VkImageLayout
5041 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
5042 {
5043 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
5044 vk_find_struct_const(att->pNext,
5045 RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
5046 if (layout_info != NULL)
5047 return layout_info->initialLayout;
5048
5049 return att->imageLayout;
5050 }
5051
5052 void genX(CmdBeginRendering)(
5053 VkCommandBuffer commandBuffer,
5054 const VkRenderingInfo* pRenderingInfo)
5055 {
5056 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5057 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5058 VkResult result;
5059
5060 if (!is_render_queue_cmd_buffer(cmd_buffer)) {
5061 assert(!"Trying to start a render pass on non-render queue!");
5062 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
5063 return;
5064 }
5065
5066 anv_measure_beginrenderpass(cmd_buffer);
5067 trace_intel_begin_render_pass(&cmd_buffer->trace);
5068
5069 gfx->rendering_flags = pRenderingInfo->flags;
5070 gfx->render_area = pRenderingInfo->renderArea;
5071 gfx->view_mask = pRenderingInfo->viewMask;
5072 gfx->layer_count = pRenderingInfo->layerCount;
5073 gfx->samples = 0;
5074
5075 const bool is_multiview = gfx->view_mask != 0;
5076 const VkRect2D render_area = gfx->render_area;
5077 const uint32_t layers =
5078 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5079
5080 /* The framebuffer size is at least large enough to contain the render
5081 * area. Because a zero renderArea is possible, we MAX with 1.
5082 */
5083 struct isl_extent3d fb_size = {
5084 .w = MAX2(1, render_area.offset.x + render_area.extent.width),
5085 .h = MAX2(1, render_area.offset.y + render_area.extent.height),
5086 .d = layers,
5087 };
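/* For example, with hypothetical values, a renderArea with offset (64, 0)
 * and extent 256x128 at layerCount 1 yields fb_size = { 320, 128, 1 }; the
 * attachment loops below may still grow fb_size up to the full image-view
 * extents.
 */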
5088
5089 const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
5090 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
5091 if (result != VK_SUCCESS)
5092 return;
5093
5094 genX(flush_pipeline_select_3d)(cmd_buffer);
5095
5096 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5097 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
5098 continue;
5099
5100 const VkRenderingAttachmentInfo *att =
5101 &pRenderingInfo->pColorAttachments[i];
5102 ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
5103 const VkImageLayout initial_layout = attachment_initial_layout(att);
5104
5105 assert(render_area.offset.x + render_area.extent.width <=
5106 iview->vk.extent.width);
5107 assert(render_area.offset.y + render_area.extent.height <=
5108 iview->vk.extent.height);
5109 assert(layers <= iview->vk.layer_count);
5110
5111 fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
5112 fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
5113
5114 assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
5115 gfx->samples |= iview->vk.image->samples;
5116
5117 enum isl_aux_usage aux_usage =
5118 anv_layout_to_aux_usage(cmd_buffer->device->info,
5119 iview->image,
5120 VK_IMAGE_ASPECT_COLOR_BIT,
5121 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
5122 att->imageLayout);
5123
5124 union isl_color_value fast_clear_color = { .u32 = { 0, } };
5125
5126 if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5127 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
5128 const union isl_color_value clear_color =
5129 vk_to_isl_color_with_format(att->clearValue.color,
5130 iview->planes[0].isl.format);
5131
5132 /* We only support fast-clears on the first layer */
5133 const bool fast_clear =
5134 (!is_multiview || (gfx->view_mask & 1)) &&
5135 anv_can_fast_clear_color_view(cmd_buffer->device, iview,
5136 att->imageLayout, clear_color,
5137 layers, render_area);
5138
5139 if (att->imageLayout != initial_layout) {
5140 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5141 render_area.extent.width == iview->vk.extent.width &&
5142 render_area.extent.height == iview->vk.extent.height);
5143 if (is_multiview) {
5144 u_foreach_bit(view, gfx->view_mask) {
5145 transition_color_buffer(cmd_buffer, iview->image,
5146 VK_IMAGE_ASPECT_COLOR_BIT,
5147 iview->vk.base_mip_level, 1,
5148 iview->vk.base_array_layer + view,
5149 1, /* layer_count */
5150 initial_layout, att->imageLayout,
5151 VK_QUEUE_FAMILY_IGNORED,
5152 VK_QUEUE_FAMILY_IGNORED,
5153 fast_clear);
5154 }
5155 } else {
5156 transition_color_buffer(cmd_buffer, iview->image,
5157 VK_IMAGE_ASPECT_COLOR_BIT,
5158 iview->vk.base_mip_level, 1,
5159 iview->vk.base_array_layer,
5160 gfx->layer_count,
5161 initial_layout, att->imageLayout,
5162 VK_QUEUE_FAMILY_IGNORED,
5163 VK_QUEUE_FAMILY_IGNORED,
5164 fast_clear);
5165 }
5166 }
5167
5168 uint32_t clear_view_mask = pRenderingInfo->viewMask;
5169 uint32_t base_clear_layer = iview->vk.base_array_layer;
5170 uint32_t clear_layer_count = gfx->layer_count;
5171 if (fast_clear) {
5172 /* We only support fast-clears on the first layer */
5173 assert(iview->vk.base_mip_level == 0 &&
5174 iview->vk.base_array_layer == 0);
5175
5176 fast_clear_color = clear_color;
5177
5178 if (iview->image->vk.samples == 1) {
5179 anv_image_ccs_op(cmd_buffer, iview->image,
5180 iview->planes[0].isl.format,
5181 iview->planes[0].isl.swizzle,
5182 VK_IMAGE_ASPECT_COLOR_BIT,
5183 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
5184 &fast_clear_color,
5185 false);
5186 } else {
5187 anv_image_mcs_op(cmd_buffer, iview->image,
5188 iview->planes[0].isl.format,
5189 iview->planes[0].isl.swizzle,
5190 VK_IMAGE_ASPECT_COLOR_BIT,
5191 0, 1, ISL_AUX_OP_FAST_CLEAR,
5192 &fast_clear_color,
5193 false);
5194 }
5195 clear_view_mask &= ~1u;
5196 base_clear_layer++;
5197 clear_layer_count--;
5198
5199 set_image_clear_color(cmd_buffer, iview->image,
5200 VK_IMAGE_ASPECT_COLOR_BIT, clear_color);
5201
5202 if (isl_color_value_is_zero(clear_color,
5203 iview->planes[0].isl.format)) {
5204 /* This image has the auxiliary buffer enabled. We can mark the
5205 * subresource as not needing a resolve because the clear color
5206 * will match what's in every RENDER_SURFACE_STATE object when
5207 * it's being used for sampling.
5208 */
5209 set_image_fast_clear_state(cmd_buffer, iview->image,
5210 VK_IMAGE_ASPECT_COLOR_BIT,
5211 ANV_FAST_CLEAR_DEFAULT_VALUE);
5212 } else {
5213 set_image_fast_clear_state(cmd_buffer, iview->image,
5214 VK_IMAGE_ASPECT_COLOR_BIT,
5215 ANV_FAST_CLEAR_ANY);
5216 }
5217 }
5218
5219 if (is_multiview) {
5220 u_foreach_bit(view, clear_view_mask) {
5221 anv_image_clear_color(cmd_buffer, iview->image,
5222 VK_IMAGE_ASPECT_COLOR_BIT,
5223 aux_usage,
5224 iview->planes[0].isl.format,
5225 iview->planes[0].isl.swizzle,
5226 iview->vk.base_mip_level,
5227 iview->vk.base_array_layer + view, 1,
5228 render_area, clear_color);
5229 }
5230 } else {
5231 anv_image_clear_color(cmd_buffer, iview->image,
5232 VK_IMAGE_ASPECT_COLOR_BIT,
5233 aux_usage,
5234 iview->planes[0].isl.format,
5235 iview->planes[0].isl.swizzle,
5236 iview->vk.base_mip_level,
5237 base_clear_layer, clear_layer_count,
5238 render_area, clear_color);
5239 }
5240 } else {
5241 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5242 assert(att->imageLayout == initial_layout);
5243 }
5244
5245 gfx->color_att[i].vk_format = iview->vk.format;
5246 gfx->color_att[i].iview = iview;
5247 gfx->color_att[i].layout = att->imageLayout;
5248 gfx->color_att[i].aux_usage = aux_usage;
5249
5250 struct isl_view isl_view = iview->planes[0].isl;
5251 if (pRenderingInfo->viewMask) {
5252 assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
5253 isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
5254 } else {
5255 assert(isl_view.array_len >= pRenderingInfo->layerCount);
5256 isl_view.array_len = pRenderingInfo->layerCount;
5257 }
5258
5259 anv_image_fill_surface_state(cmd_buffer->device,
5260 iview->image,
5261 VK_IMAGE_ASPECT_COLOR_BIT,
5262 &isl_view,
5263 ISL_SURF_USAGE_RENDER_TARGET_BIT,
5264 aux_usage, &fast_clear_color,
5265 0, /* anv_image_view_state_flags */
5266 &gfx->color_att[i].surface_state,
5267 NULL);
5268
5269 add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
5270
5271 if ((att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
5272 (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
5273 iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
5274 iview->planes[0].isl.base_level == 0 &&
5275 iview->planes[0].isl.base_array_layer == 0) {
5276 genX(copy_fast_clear_dwords)(cmd_buffer,
5277 gfx->color_att[i].surface_state.state,
5278 iview->image,
5279 VK_IMAGE_ASPECT_COLOR_BIT,
5280 false /* copy to ss */);
5281 }
5282
5283 if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
5284 gfx->color_att[i].resolve_mode = att->resolveMode;
5285 gfx->color_att[i].resolve_iview =
5286 anv_image_view_from_handle(att->resolveImageView);
5287 gfx->color_att[i].resolve_layout = att->resolveImageLayout;
5288 }
5289 }
5290
5291 anv_cmd_graphic_state_update_has_uint_rt(gfx);
5292
5293 const struct anv_image_view *ds_iview = NULL;
5294 const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
5295 const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
5296 if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
5297 (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
5298 const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
5299 VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5300 VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5301 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5302 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5303 enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
5304 enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
5305 float depth_clear_value = 0;
5306 uint32_t stencil_clear_value = 0;
5307
5308 if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
5309 d_iview = anv_image_view_from_handle(d_att->imageView);
5310 initial_depth_layout = attachment_initial_layout(d_att);
5311 depth_layout = d_att->imageLayout;
5312 depth_aux_usage =
5313 anv_layout_to_aux_usage(cmd_buffer->device->info,
5314 d_iview->image,
5315 VK_IMAGE_ASPECT_DEPTH_BIT,
5316 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5317 depth_layout);
5318 depth_clear_value = d_att->clearValue.depthStencil.depth;
5319 }
5320
5321 if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
5322 s_iview = anv_image_view_from_handle(s_att->imageView);
5323 initial_stencil_layout = attachment_initial_layout(s_att);
5324 stencil_layout = s_att->imageLayout;
5325 stencil_aux_usage =
5326 anv_layout_to_aux_usage(cmd_buffer->device->info,
5327 s_iview->image,
5328 VK_IMAGE_ASPECT_STENCIL_BIT,
5329 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5330 stencil_layout);
5331 stencil_clear_value = s_att->clearValue.depthStencil.stencil;
5332 }
5333
5334 assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
5335 ds_iview = d_iview != NULL ? d_iview : s_iview;
5336 assert(ds_iview != NULL);
5337
5338 assert(render_area.offset.x + render_area.extent.width <=
5339 ds_iview->vk.extent.width);
5340 assert(render_area.offset.y + render_area.extent.height <=
5341 ds_iview->vk.extent.height);
5342 assert(layers <= ds_iview->vk.layer_count);
5343
5344 fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
5345 fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
5346
5347 assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
5348 gfx->samples |= ds_iview->vk.image->samples;
5349
5350 VkImageAspectFlags clear_aspects = 0;
5351 if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5352 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5353 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5354 if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5355 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5356 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5357
5358 if (clear_aspects != 0) {
5359 const bool hiz_clear =
5360 anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
5361 depth_layout, clear_aspects,
5362 depth_clear_value,
5363 render_area);
5364
5365 if (depth_layout != initial_depth_layout) {
5366 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5367 render_area.extent.width == d_iview->vk.extent.width &&
5368 render_area.extent.height == d_iview->vk.extent.height);
5369
5370 if (is_multiview) {
5371 u_foreach_bit(view, gfx->view_mask) {
5372 transition_depth_buffer(cmd_buffer, d_iview->image,
5373 d_iview->vk.base_array_layer + view,
5374 1 /* layer_count */,
5375 initial_depth_layout, depth_layout,
5376 hiz_clear);
5377 }
5378 } else {
5379 transition_depth_buffer(cmd_buffer, d_iview->image,
5380 d_iview->vk.base_array_layer,
5381 gfx->layer_count,
5382 initial_depth_layout, depth_layout,
5383 hiz_clear);
5384 }
5385 }
5386
5387 if (stencil_layout != initial_stencil_layout) {
5388 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5389 render_area.extent.width == s_iview->vk.extent.width &&
5390 render_area.extent.height == s_iview->vk.extent.height);
5391
5392 if (is_multiview) {
5393 u_foreach_bit(view, gfx->view_mask) {
5394 transition_stencil_buffer(cmd_buffer, s_iview->image,
5395 s_iview->vk.base_mip_level, 1,
5396 s_iview->vk.base_array_layer + view,
5397 1 /* layer_count */,
5398 initial_stencil_layout,
5399 stencil_layout,
5400 hiz_clear);
5401 }
5402 } else {
5403 transition_stencil_buffer(cmd_buffer, s_iview->image,
5404 s_iview->vk.base_mip_level, 1,
5405 s_iview->vk.base_array_layer,
5406 gfx->layer_count,
5407 initial_stencil_layout,
5408 stencil_layout,
5409 hiz_clear);
5410 }
5411 }
5412
5413 if (is_multiview) {
5414 uint32_t clear_view_mask = pRenderingInfo->viewMask;
5415 while (clear_view_mask) {
5416 int view = u_bit_scan(&clear_view_mask);
5417
5418 uint32_t level = ds_iview->vk.base_mip_level;
5419 uint32_t layer = ds_iview->vk.base_array_layer + view;
5420
5421 if (hiz_clear) {
5422 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5423 clear_aspects,
5424 level, layer, 1,
5425 render_area,
5426 stencil_clear_value);
5427 } else {
5428 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5429 clear_aspects,
5430 depth_aux_usage,
5431 level, layer, 1,
5432 render_area,
5433 depth_clear_value,
5434 stencil_clear_value);
5435 }
5436 }
5437 } else {
5438 uint32_t level = ds_iview->vk.base_mip_level;
5439 uint32_t base_layer = ds_iview->vk.base_array_layer;
5440 uint32_t layer_count = gfx->layer_count;
5441
5442 if (hiz_clear) {
5443 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5444 clear_aspects,
5445 level, base_layer, layer_count,
5446 render_area,
5447 stencil_clear_value);
5448 } else {
5449 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5450 clear_aspects,
5451 depth_aux_usage,
5452 level, base_layer, layer_count,
5453 render_area,
5454 depth_clear_value,
5455 stencil_clear_value);
5456 }
5457 }
5458 } else {
5459 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5460 assert(depth_layout == initial_depth_layout);
5461 assert(stencil_layout == initial_stencil_layout);
5462 }
5463
5464 if (d_iview != NULL) {
5465 gfx->depth_att.vk_format = d_iview->vk.format;
5466 gfx->depth_att.iview = d_iview;
5467 gfx->depth_att.layout = depth_layout;
5468 gfx->depth_att.aux_usage = depth_aux_usage;
5469 if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5470 assert(d_att->resolveImageView != VK_NULL_HANDLE);
5471 gfx->depth_att.resolve_mode = d_att->resolveMode;
5472 gfx->depth_att.resolve_iview =
5473 anv_image_view_from_handle(d_att->resolveImageView);
5474 gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
5475 }
5476 }
5477
5478 if (s_iview != NULL) {
5479 gfx->stencil_att.vk_format = s_iview->vk.format;
5480 gfx->stencil_att.iview = s_iview;
5481 gfx->stencil_att.layout = stencil_layout;
5482 gfx->stencil_att.aux_usage = stencil_aux_usage;
5483 if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5484 assert(s_att->resolveImageView != VK_NULL_HANDLE);
5485 gfx->stencil_att.resolve_mode = s_att->resolveMode;
5486 gfx->stencil_att.resolve_iview =
5487 anv_image_view_from_handle(s_att->resolveImageView);
5488 gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
5489 }
5490 }
5491 }
5492
5493 /* Finally, now that we know the right size, set up the null surface */
5494 assert(util_bitcount(gfx->samples) <= 1);
5495 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5496 gfx->null_surface_state.map,
5497 .size = fb_size);
5498
5499 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5500 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
5501 continue;
5502
5503 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5504 gfx->color_att[i].surface_state.state.map,
5505 .size = fb_size);
5506 }
5507
5508 /****** We can now start emitting code to begin the render pass ******/
5509
5510 gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5511
5512 /* Our implementation of VK_KHR_multiview uses instancing to draw the
5513 * different views. If the client asks for instancing, we need to use the
5514 * Instance Data Step Rate to ensure that we repeat the client's
5515 * per-instance data once for each view. Since this bit is in
5516 * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
5517 * of each subpass.
5518 */
5519 if (GFX_VER == 7)
5520 gfx->vb_dirty |= ~0;
5521
5522 /* It is possible to start a render pass with an old pipeline. Because the
5523 * render pass and subpass index are both baked into the pipeline, this is
5524 * highly unlikely. In order to do so, it requires that you have a render
5525 * pass with a single subpass and that you use that render pass twice
5526 * back-to-back and use the same pipeline at the start of the second render
5527 * pass as at the end of the first. In order to avoid unpredictable issues
5528 * with this edge case, we just dirty the pipeline at the start of every
5529 * subpass.
5530 */
5531 gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
5532
5533 cmd_buffer_emit_depth_stencil(cmd_buffer);
5534 }
5535
5536 static void
5537 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
5538 struct anv_attachment *att,
5539 VkImageAspectFlagBits aspect)
5540 {
5541 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5542 const struct anv_image_view *iview = att->iview;
5543
5544 if (iview == NULL)
5545 return;
5546
5547 if (gfx->view_mask == 0) {
5548 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5549 aspect, att->aux_usage,
5550 iview->planes[0].isl.base_level,
5551 iview->planes[0].isl.base_array_layer,
5552 gfx->layer_count);
5553 } else {
5554 uint32_t res_view_mask = gfx->view_mask;
5555 while (res_view_mask) {
5556 int i = u_bit_scan(&res_view_mask);
5557
5558 const uint32_t level = iview->planes[0].isl.base_level;
5559 const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
5560
5561 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5562 aspect, att->aux_usage,
5563 level, layer, 1);
5564 }
5565 }
5566 }
5567
5568 static enum blorp_filter
5569 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
5570 {
5571 switch (vk_mode) {
5572 case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
5573 return BLORP_FILTER_SAMPLE_0;
5574 case VK_RESOLVE_MODE_AVERAGE_BIT:
5575 return BLORP_FILTER_AVERAGE;
5576 case VK_RESOLVE_MODE_MIN_BIT:
5577 return BLORP_FILTER_MIN_SAMPLE;
5578 case VK_RESOLVE_MODE_MAX_BIT:
5579 return BLORP_FILTER_MAX_SAMPLE;
5580 default:
5581 return BLORP_FILTER_NONE;
5582 }
5583 }
5584
5585 static void
5586 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
5587 const struct anv_attachment *att,
5588 VkImageLayout layout,
5589 VkImageAspectFlagBits aspect)
5590 {
5591 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5592 const struct anv_image_view *src_iview = att->iview;
5593 const struct anv_image_view *dst_iview = att->resolve_iview;
5594
5595 enum isl_aux_usage src_aux_usage =
5596 anv_layout_to_aux_usage(cmd_buffer->device->info,
5597 src_iview->image, aspect,
5598 VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
5599 layout);
5600
5601 enum isl_aux_usage dst_aux_usage =
5602 anv_layout_to_aux_usage(cmd_buffer->device->info,
5603 dst_iview->image, aspect,
5604 VK_IMAGE_USAGE_TRANSFER_DST_BIT,
5605 att->resolve_layout);
5606
5607 enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
5608
5609 const VkRect2D render_area = gfx->render_area;
5610 if (gfx->view_mask == 0) {
5611 anv_image_msaa_resolve(cmd_buffer,
5612 src_iview->image, src_aux_usage,
5613 src_iview->planes[0].isl.base_level,
5614 src_iview->planes[0].isl.base_array_layer,
5615 dst_iview->image, dst_aux_usage,
5616 dst_iview->planes[0].isl.base_level,
5617 dst_iview->planes[0].isl.base_array_layer,
5618 aspect,
5619 render_area.offset.x, render_area.offset.y,
5620 render_area.offset.x, render_area.offset.y,
5621 render_area.extent.width,
5622 render_area.extent.height,
5623 gfx->layer_count, filter);
5624 } else {
5625 uint32_t res_view_mask = gfx->view_mask;
5626 while (res_view_mask) {
5627 int i = u_bit_scan(&res_view_mask);
5628
5629 anv_image_msaa_resolve(cmd_buffer,
5630 src_iview->image, src_aux_usage,
5631 src_iview->planes[0].isl.base_level,
5632 src_iview->planes[0].isl.base_array_layer + i,
5633 dst_iview->image, dst_aux_usage,
5634 dst_iview->planes[0].isl.base_level,
5635 dst_iview->planes[0].isl.base_array_layer + i,
5636 aspect,
5637 render_area.offset.x, render_area.offset.y,
5638 render_area.offset.x, render_area.offset.y,
5639 render_area.extent.width,
5640 render_area.extent.height,
5641 1, filter);
5642 }
5643 }
5644 }
5645
5646 void genX(CmdEndRendering)(
5647 VkCommandBuffer commandBuffer)
5648 {
5649 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5650 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5651
5652 if (anv_batch_has_error(&cmd_buffer->batch))
5653 return;
5654
5655 const bool is_multiview = gfx->view_mask != 0;
5656 const uint32_t layers =
5657 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5658
5659 bool has_color_resolve = false;
5660 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5661 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
5662 VK_IMAGE_ASPECT_COLOR_BIT);
5663
5664 /* Stash this off for later */
5665 if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
5666 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5667 has_color_resolve = true;
5668 }
5669
5670 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
5671 VK_IMAGE_ASPECT_DEPTH_BIT);
5672
5673 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
5674 VK_IMAGE_ASPECT_STENCIL_BIT);
5675
5676 if (has_color_resolve) {
5677 /* We are about to do some MSAA resolves. We need to flush so that the
5678 * result of writes to the MSAA color attachments show up in the sampler
5679 * when we blit to the single-sampled resolve target.
5680 */
5681 anv_add_pending_pipe_bits(cmd_buffer,
5682 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5683 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
5684 "MSAA resolve");
5685 }
5686
5687 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
5688 gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
5689 /* We are about to do some MSAA resolves. We need to flush so that the
5690 * result of writes to the MSAA depth attachments show up in the sampler
5691 * when we blit to the single-sampled resolve target.
5692 */
5693 anv_add_pending_pipe_bits(cmd_buffer,
5694 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5695 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5696 "MSAA resolve");
5697 }
5698
5699 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5700 const struct anv_attachment *att = &gfx->color_att[i];
5701 if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
5702 (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5703 continue;
5704
5705 cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
5706 VK_IMAGE_ASPECT_COLOR_BIT);
5707 }
5708
5709 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5710 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5711 const struct anv_image_view *src_iview = gfx->depth_att.iview;
5712
5713 /* MSAA resolves sample from the source attachment. Transition the
5714 * depth attachment first to get rid of any HiZ that we may not be
5715 * able to handle.
5716 */
5717 transition_depth_buffer(cmd_buffer, src_iview->image,
5718 src_iview->planes[0].isl.base_array_layer,
5719 layers,
5720 gfx->depth_att.layout,
5721 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5722 false /* will_full_fast_clear */);
5723
5724 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
5725 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5726 VK_IMAGE_ASPECT_DEPTH_BIT);
5727
5728 /* Transition the source back to the original layout. This seems a bit
5729 * inefficient but, since HiZ resolves aren't destructive, going from
5730 * less HiZ to more is generally a no-op.
5731 */
5732 transition_depth_buffer(cmd_buffer, src_iview->image,
5733 src_iview->planes[0].isl.base_array_layer,
5734 layers,
5735 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5736 gfx->depth_att.layout,
5737 false /* will_full_fast_clear */);
5738 }
5739
5740 if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5741 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5742 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
5743 gfx->stencil_att.layout,
5744 VK_IMAGE_ASPECT_STENCIL_BIT);
5745 }
5746
5747 #if GFX_VER == 7
5748 /* On gfx7, we have to store a texturable version of the stencil buffer in
5749 * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
5750 * forth at strategic points. Stencil writes are only allowed in following
5751 * layouts:
5752 *
5753 * - VK_IMAGE_LAYOUT_GENERAL
5754 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
5755 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
5756 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
5757 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
5758 * - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
5759 * - VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT
5760 *
5761 * For general, we have no nice opportunity to transition so we do the copy
5762 * to the shadow unconditionally at the end of the subpass. For transfer
5763 * destinations, we can update it as part of the transfer op. For the other
5764 * layouts, we delay the copy until a transition into some other layout.
5765 */
5766 if (gfx->stencil_att.iview != NULL) {
5767 const struct anv_image_view *iview = gfx->stencil_att.iview;
5768 const struct anv_image *image = iview->image;
5769 const uint32_t plane =
5770 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5771
5772 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
5773 (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL ||
5774 gfx->stencil_att.layout == VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT)) {
5775 anv_image_copy_to_shadow(cmd_buffer, image,
5776 VK_IMAGE_ASPECT_STENCIL_BIT,
5777 iview->planes[plane].isl.base_level, 1,
5778 iview->planes[plane].isl.base_array_layer,
5779 layers);
5780 }
5781 }
5782 #endif
5783
5784 anv_cmd_buffer_reset_rendering(cmd_buffer);
5785 }
5786
5787 void
5788 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5789 {
5790 #if GFX_VERx10 >= 75
5791 struct mi_builder b;
5792 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5793
5794 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5795 mi_reg32(ANV_PREDICATE_RESULT_REG));
5796 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5797
5798 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5799 mip.LoadOperation = LOAD_LOADINV;
5800 mip.CombineOperation = COMBINE_SET;
5801 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5802 }
5803 #endif
5804 }
5805
5806 #if GFX_VERx10 >= 75
5807 void genX(CmdBeginConditionalRenderingEXT)(
5808 VkCommandBuffer commandBuffer,
5809 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5810 {
5811 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5812 ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5813 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5814 struct anv_address value_address =
5815 anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5816
5817 const bool isInverted = pConditionalRenderingBegin->flags &
5818 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5819
5820 cmd_state->conditional_render_enabled = true;
5821
5822 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5823
5824 struct mi_builder b;
5825 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5826
5827 /* Section 19.4 of the Vulkan 1.1.85 spec says:
5828 *
5829 * If the value of the predicate in buffer memory changes
5830 * while conditional rendering is active, the rendering commands
5831 * may be discarded in an implementation-dependent way.
5832 * Some implementations may latch the value of the predicate
5833 * upon beginning conditional rendering while others
5834 * may read it before every rendering command.
5835 *
5836 * So it's perfectly fine to read a value from the buffer once.
5837 */
5838 struct mi_value value = mi_mem32(value_address);
5839
5840 /* Precompute predicate result, it is necessary to support secondary
5841 * command buffers since it is unknown if conditional rendering is
5842 * inverted when populating them.
5843 */
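/* Concretely (a sketch of the comparison used): mi_ult(&b, mi_imm(0), value)
 * is non-zero exactly when 0 < value, i.e. when the predicate value is
 * non-zero, while mi_uge(&b, mi_imm(0), value) is non-zero exactly when
 * 0 >= value, i.e. when it is zero, which gives the inverted behavior. The
 * stored ANV_PREDICATE_RESULT_REG is later consumed by
 * genX(cmd_emit_conditional_render_predicate)() above.
 */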
5844 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5845 isInverted ? mi_uge(&b, mi_imm(0), value) :
5846 mi_ult(&b, mi_imm(0), value));
5847 }
5848
5849 void genX(CmdEndConditionalRenderingEXT)(
5850 VkCommandBuffer commandBuffer)
5851 {
5852 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5853 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5854
5855 cmd_state->conditional_render_enabled = false;
5856 }
5857 #endif
5858
5859 /* Set of stage bits which are pipelined, i.e. they get queued
5860 * by the command streamer for later execution.
5861 */
5862 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5863 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5864 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5865 VK_PIPELINE_STAGE_2_HOST_BIT | \
5866 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
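/* Put differently, a stage is treated as pipelined unless it is one of the
 * four masked-out bits above, which are handled by the host or the command
 * streamer itself rather than queued down the pipeline. CmdSetEvent2 and
 * CmdResetEvent2 below only add the pixel-scoreboard / CS stall when the
 * relevant stage mask contains at least one pipelined bit.
 */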
5867
5868 void genX(CmdSetEvent2)(
5869 VkCommandBuffer commandBuffer,
5870 VkEvent _event,
5871 const VkDependencyInfo* pDependencyInfo)
5872 {
5873 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5874 ANV_FROM_HANDLE(anv_event, event, _event);
5875
5876 VkPipelineStageFlags2 src_stages = 0;
5877
5878 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5879 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5880 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5881 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5882 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5883 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5884
5885 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5886 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5887
5888 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5889 if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5890 pc.StallAtPixelScoreboard = true;
5891 pc.CommandStreamerStallEnable = true;
5892 }
5893
5894 pc.DestinationAddressType = DAT_PPGTT,
5895 pc.PostSyncOperation = WriteImmediateData,
5896 pc.Address = (struct anv_address) {
5897 cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5898 event->state.offset
5899 };
5900 pc.ImmediateData = VK_EVENT_SET;
5901 anv_debug_dump_pc(pc);
5902 }
5903 }
5904
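/* vkCmdResetEvent2: identical mechanism to genX(CmdSetEvent2) above, except
 * that the post-sync write stores VK_EVENT_RESET.
 */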
void genX(CmdResetEvent2)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags2                       stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData = VK_EVENT_RESET;
      anv_debug_dump_pc(pc);
   }
}

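/* vkCmdWaitEvents2: on gfx8+, the command streamer polls each event's status
 * dword with MI_SEMAPHORE_WAIT until it equals VK_EVENT_SET, then the
 * dependency itself is applied as a regular barrier.
 */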
void genX(CmdWaitEvents2)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    eventCount,
    const VkEvent*                              pEvents,
    const VkDependencyInfo*                     pDependencyInfos)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

#if GFX_VER >= 8
   for (uint32_t i = 0; i < eventCount; i++) {
      ANV_FROM_HANDLE(anv_event, event, pEvents[i]);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
         sem.WaitMode = PollingMode;
         sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
         sem.SemaphoreDataDword = VK_EVENT_SET;
         sem.SemaphoreAddress = (struct anv_address) {
            cmd_buffer->device->dynamic_state_pool.block_pool.bo,
            event->state.offset
         };
      }
   }
#else
   anv_finishme("Implement events on gfx7");
#endif

   cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
}

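/* Translate a VkIndexType into the hardware index element format
 * (INDEX_BYTE/WORD/DWORD) programmed into 3DSTATE_INDEX_BUFFER.
 */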
static uint32_t vk_to_intel_index_type(VkIndexType type)
{
   switch (type) {
   case VK_INDEX_TYPE_UINT8_EXT:
      return INDEX_BYTE;
   case VK_INDEX_TYPE_UINT16:
      return INDEX_WORD;
   case VK_INDEX_TYPE_UINT32:
      return INDEX_DWORD;
   default:
      unreachable("invalid index type");
   }
}

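/* vkCmdBindIndexBuffer only records the buffer, format and offset in the
 * command buffer state and marks the index buffer dirty; emission of the
 * actual 3DSTATE_INDEX_BUFFER is deferred to the draw-time state flush.
 */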
void genX(CmdBindIndexBuffer)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    VkIndexType                                 indexType)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);

   cmd_buffer->state.gfx.restart_index = vk_index_to_restart(indexType);
   cmd_buffer->state.gfx.index_buffer = buffer;
   cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
   cmd_buffer->state.gfx.index_offset = offset;

   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
}

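/* VK_INTEL_performance_query overrides: NULL_HARDWARE toggles the INSTPM
 * bits that discard 3D and media instructions, while FLUSH_GPU_CACHES
 * flushes and invalidates every cache we track so performance counters are
 * isolated from surrounding work.
 */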
VkResult genX(CmdSetPerformanceOverrideINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   switch (pOverrideInfo->type) {
   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
      anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
         instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
         instpm.MediaInstructionDisable = pOverrideInfo->enable;
         instpm._3DRenderingInstructionDisableMask = true;
         instpm.MediaInstructionDisableMask = true;
      }
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_FLUSH_BITS |
                                   ANV_PIPE_INVALIDATE_BITS,
                                   "perf counter isolation");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}

VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   /* TODO: Waiting on the register to write, might depend on generation. */

   return VK_SUCCESS;
}

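/* MMIO address of the command streamer's free-running TIMESTAMP register. */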
#define TIMESTAMP 0x2358

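/* Capture a timestamp into addr.  TOP_OF_PIPE stores the TIMESTAMP register
 * directly from the command streamer without waiting for outstanding work,
 * END_OF_PIPE uses a PIPE_CONTROL post-sync timestamp write, and AT_CS_STALL
 * additionally stalls the command streamer before the write.
 */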
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                              struct anv_device *device,
                              struct anv_address addr,
                              enum anv_timestamp_capture_type type)
{
   switch (type) {
   case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
      struct mi_builder b;
      mi_builder_init(&b, device->info, batch);
      mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
      break;
   }

   case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
      anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = addr;
         anv_debug_dump_pc(pc);
      }
      break;

   case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
      anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = addr;
         anv_debug_dump_pc(pc);
      }
      break;

   default:
      unreachable("invalid");
   }
}

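/* Copy size_B bytes from src_addr to dst_addr on the command streamer using
 * the MI builder's memcpy helper.
 */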
void genX(cmd_capture_data)(struct anv_batch *batch,
                            struct anv_device *device,
                            struct anv_address dst_addr,
                            struct anv_address src_addr,
                            uint32_t size_B)
{
   struct mi_builder b;
   mi_builder_init(&b, device->info, batch);
   mi_memcpy(&b, dst_addr, src_addr, size_B);
}
