1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include "anv_private.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34 #include "common/intel_genX_state_brw.h"
35 #include "common/intel_guardband.h"
36 #include "common/intel_tiled_render.h"
37 #include "compiler/brw_prim.h"
38
39 const uint32_t genX(vk_to_intel_blend)[] = {
40 [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
41 [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
42 [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
43 [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
44 [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
45 [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
46 [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
47 [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
48 [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
49 [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
50 [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
51 [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
52 [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
53 [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
54 [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
55 [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
56 [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
57 [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
58 [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
59 };
60
61 static const uint32_t genX(vk_to_intel_blend_op)[] = {
62 [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD,
63 [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT,
64 [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
65 [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN,
66 [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX,
67 };
68
69 static void
70 genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
71 {
72 #if INTEL_WA_16013994831_GFX_VER
73 /* Wa_16013994831 - Disable preemption during streamout and re-enable
74 * it if XFB is not used by the current pipeline.
75 *
76 * Although this workaround applies to Gfx12+, we already disable object
77 * level preemption for another reason in genX_state.c so we can skip this
78 * for Gfx12.
79 */
80 if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
81 return;
82
83 struct anv_graphics_pipeline *pipeline =
84 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
85 if (pipeline->uses_xfb) {
86 genX(cmd_buffer_set_preemption)(cmd_buffer, false);
87 return;
88 }
89
90 if (!cmd_buffer->state.gfx.object_preemption)
91 genX(cmd_buffer_set_preemption)(cmd_buffer, true);
92 #endif
93 }
94
95 #if GFX_VER >= 12
96 static uint32_t
97 get_cps_state_offset(struct anv_cmd_buffer *cmd_buffer, bool cps_enabled,
98 const struct vk_fragment_shading_rate_state *fsr)
99 {
100 struct anv_device *device = cmd_buffer->device;
101
102 if (!cps_enabled)
103 return device->cps_states.offset;
104
105 uint32_t offset;
106 static const uint32_t size_index[] = {
107 [1] = 0,
108 [2] = 1,
109 [4] = 2,
110 };
111
112 #if GFX_VERx10 >= 125
113 offset =
114 1 + /* skip disabled */
115 fsr->combiner_ops[0] * 5 * 3 * 3 +
116 fsr->combiner_ops[1] * 3 * 3 +
117 size_index[fsr->fragment_size.width] * 3 +
118 size_index[fsr->fragment_size.height];
119 #else
120 offset =
121 1 + /* skip disabled */
122 size_index[fsr->fragment_size.width] * 3 +
123 size_index[fsr->fragment_size.height];
124 #endif
125
126 offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;
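/* Worked example (a sketch of the indexing above, not a PRM quote): on
 * GFX_VERx10 >= 125, with combiner_ops[0] == 1, combiner_ops[1] == 2 and a
 * 4x2 fragment size, the entry index is
 *    1 + 1*45 + 2*9 + size_index[4]*3 + size_index[2]
 *      = 1 + 45 + 18 + 6 + 1 = 71,
 * which is then scaled by the size of one entry group
 * (MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4 bytes) to get the byte offset
 * into the pre-generated cps_states pool.
 */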
127
128 return device->cps_states.offset + offset;
129 }
130 #endif /* GFX_VER >= 12 */
131
132 static bool
133 has_ds_feedback_loop(const struct vk_dynamic_graphics_state *dyn)
134 {
135 return dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
136 VK_IMAGE_ASPECT_STENCIL_BIT);
137 }
138
139 UNUSED static bool
140 want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer,
141 const struct vk_dynamic_graphics_state *dyn,
142 const struct vk_depth_stencil_state *ds)
143 {
144 if (GFX_VER > 9)
145 return false;
146 assert(GFX_VER == 9);
147
148 /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
149 *
150 * Clearing this bit will force the STC cache to wait for pending
151 * retirement of pixels at the HZ-read stage and do the STC-test for
152 * Non-promoted, R-computed and Computed depth modes instead of
153 * postponing the STC-test to RCPFE.
154 *
155 * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
156 * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
157 *
158 * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
159 * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
160 * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
161 *
162 * COMP_STC_EN = STC_TEST_EN &&
163 * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil
164 *
165 * SW parses the pipeline states to generate the following logical
166 * signal indicating if PMA FIX can be enabled.
167 *
168 * STC_PMA_OPT =
169 * 3DSTATE_WM::ForceThreadDispatch != 1 &&
170 * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
171 * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
172 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
173 * !(3DSTATE_WM::EDSC_Mode == 2) &&
174 * 3DSTATE_PS_EXTRA::PixelShaderValid &&
175 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
176 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
177 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
178 * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
179 * (COMP_STC_EN || STC_WRITE_EN) &&
180 * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
181 * 3DSTATE_WM::ForceKillPix == ON ||
182 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
183 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
184 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
185 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
186 * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
187 */
188
189 /* These are always true:
190 * 3DSTATE_WM::ForceThreadDispatch != 1 &&
191 * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
192 */
193
194 /* We only enable the PMA fix if we know for certain that HiZ is enabled.
195 * If we don't know whether HiZ is enabled or not, we disable the PMA fix
196 * and there is no harm.
197 *
198 * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
199 * 3DSTATE_DEPTH_BUFFER::HIZ Enable
200 */
201 if (!cmd_buffer->state.hiz_enabled)
202 return false;
203
204 /* We can't possibly know if HiZ is enabled without the depth attachment */
205 ASSERTED const struct anv_image_view *d_iview =
206 cmd_buffer->state.gfx.depth_att.iview;
207 assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);
208
209 /* 3DSTATE_PS_EXTRA::PixelShaderValid */
210 struct anv_graphics_pipeline *pipeline =
211 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
212 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
213 return false;
214
215 /* !(3DSTATE_WM::EDSC_Mode == 2) */
216 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
217 if (wm_prog_data->early_fragment_tests)
218 return false;
219
220 /* We never use anv_pipeline for HiZ ops so this is trivially true:
221 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
222 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
223 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
224 * 3DSTATE_WM_HZ_OP::StencilBufferClear)
225 */
226
227 /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
228 * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
229 */
230 const bool stc_test_en = ds->stencil.test_enable;
231
232 /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
233 * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
234 * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
235 */
236 const bool stc_write_en = ds->stencil.write_enable;
237
238 /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
239 const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;
240
241 /* COMP_STC_EN || STC_WRITE_EN */
242 if (!(comp_stc_en || stc_write_en))
243 return false;
244
245 /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
246 * 3DSTATE_WM::ForceKillPix == ON ||
247 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
248 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
249 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
250 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
251 * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
252 */
253 return pipeline->kill_pixel ||
254 pipeline->rp_has_ds_self_dep ||
255 has_ds_feedback_loop(dyn) ||
256 wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
257 }
258
259 static void
260 genX(rasterization_mode)(VkPolygonMode raster_mode,
261 VkLineRasterizationModeKHR line_mode,
262 float line_width,
263 uint32_t *api_mode,
264 bool *msaa_rasterization_enable)
265 {
266 if (raster_mode == VK_POLYGON_MODE_LINE) {
267 /* Unfortunately, configuring our line rasterization hardware on gfx8
268 * and later is rather painful. Instead of giving us bits to tell the
269 * hardware what line mode to use like we had on gfx7, we now have an
270 * arcane combination of API Mode and MSAA enable bits which do things
271 * in a table which are expected to magically put the hardware into the
272 * right mode for your API. Sadly, Vulkan isn't any of the APIs the
273 * hardware people thought of so nothing works the way you want it to.
274 *
275 * Look at the table titled "Multisample Rasterization Modes" in Vol 7
276 * of the Skylake PRM for more details.
277 */
278 switch (line_mode) {
279 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
280 *api_mode = DX101;
281 #if GFX_VER <= 9
282 /* Prior to ICL, the algorithm the HW uses to draw wide lines
283 * doesn't quite match what the CTS expects, at least for rectangular
284 * lines, so for lines wider than one pixel we set this to false here,
285 * making it draw parallelograms instead, which work well enough.
286 */
287 *msaa_rasterization_enable = line_width < 1.0078125;
288 #else
289 *msaa_rasterization_enable = true;
290 #endif
291 break;
292
293 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
294 case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
295 *api_mode = DX9OGL;
296 *msaa_rasterization_enable = false;
297 break;
298
299 default:
300 unreachable("Unsupported line rasterization mode");
301 }
302 } else {
303 *api_mode = DX101;
304 *msaa_rasterization_enable = true;
305 }
306 }
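/* To summarize the mapping implemented above (derived from the code, not a
 * quote from the PRM table):
 *
 *   polygon mode     line mode                    API Mode   MSAA raster
 *   --------------   --------------------------   --------   -----------
 *   non-LINE         (any)                        DX101      true
 *   LINE             RECTANGULAR                  DX101      true*
 *   LINE             BRESENHAM / RECT_SMOOTH      DX9OGL     false
 *
 *   (*) on Gfx9 and earlier only for lines narrower than ~1.008 pixels.
 */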
307
308 static bool
309 is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor)
310 {
311 return factor == BLENDFACTOR_SRC1_COLOR ||
312 factor == BLENDFACTOR_SRC1_ALPHA ||
313 factor == BLENDFACTOR_INV_SRC1_COLOR ||
314 factor == BLENDFACTOR_INV_SRC1_ALPHA;
315 }
316
317 #if GFX_VERx10 == 125
318 /**
319 * Return the dimensions of the current rendering area, defined as the
320 * bounding box of all present color, depth and stencil attachments.
321 */
322 UNUSED static bool
323 calculate_render_area(struct anv_cmd_buffer *cmd_buffer,
324 unsigned *width, unsigned *height)
325 {
326 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
327
328 *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
329 *height = gfx->render_area.offset.y + gfx->render_area.extent.height;
330
331 for (unsigned i = 0; i < gfx->color_att_count; i++) {
332 struct anv_attachment *att = &gfx->color_att[i];
333 if (att->iview) {
334 *width = MAX2(*width, att->iview->vk.extent.width);
335 *height = MAX2(*height, att->iview->vk.extent.height);
336 }
337 }
338
339 const struct anv_image_view *const z_view = gfx->depth_att.iview;
340 if (z_view) {
341 *width = MAX2(*width, z_view->vk.extent.width);
342 *height = MAX2(*height, z_view->vk.extent.height);
343 }
344
345 const struct anv_image_view *const s_view = gfx->stencil_att.iview;
346 if (s_view) {
347 *width = MAX2(*width, s_view->vk.extent.width);
348 *height = MAX2(*height, s_view->vk.extent.height);
349 }
350
351 return *width && *height;
352 }
353
354 /* Calculate TBIMR tiling parameters adequate for the current pipeline
355 * setup. Return true if TBIMR should be enabled.
356 */
357 UNUSED static bool
358 calculate_tile_dimensions(struct anv_cmd_buffer *cmd_buffer,
359 unsigned fb_width, unsigned fb_height,
360 unsigned *tile_width, unsigned *tile_height)
361 {
362 const struct anv_device *device = cmd_buffer->device;
363 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
364
365 assert(GFX_VER == 12);
366 const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;
367
368 unsigned pixel_size = 0;
369
370 /* Perform a rough calculation of the tile cache footprint of the
371 * pixel pipeline, approximating it as the sum of the amount of
372 * memory used per pixel by every render target, depth, stencil and
373 * auxiliary surfaces bound to the pipeline.
374 */
375 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
376 struct anv_attachment *att = &gfx->color_att[i];
377
378 if (att->iview) {
379 const struct anv_image *image = att->iview->image;
380 const unsigned p = anv_image_aspect_to_plane(image,
381 VK_IMAGE_ASPECT_COLOR_BIT);
382 const struct anv_image_plane *plane = &image->planes[p];
383
384 pixel_size += intel_calculate_surface_pixel_size(
385 &plane->primary_surface.isl);
386
387 if (isl_aux_usage_has_mcs(att->aux_usage))
388 pixel_size += intel_calculate_surface_pixel_size(
389 &plane->aux_surface.isl);
390
391 if (isl_aux_usage_has_ccs(att->aux_usage))
392 pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
393 &plane->primary_surface.isl),
394 aux_scale);
395 }
396 }
397
398 const struct anv_image_view *const z_view = gfx->depth_att.iview;
399 if (z_view) {
400 const struct anv_image *image = z_view->image;
401 assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
402 const unsigned p = anv_image_aspect_to_plane(image,
403 VK_IMAGE_ASPECT_DEPTH_BIT);
404 const struct anv_image_plane *plane = &image->planes[p];
405
406 pixel_size += intel_calculate_surface_pixel_size(
407 &plane->primary_surface.isl);
408
409 if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
410 pixel_size += intel_calculate_surface_pixel_size(
411 &plane->aux_surface.isl);
412
413 if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
414 pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
415 &plane->primary_surface.isl),
416 aux_scale);
417 }
418
419 const struct anv_image_view *const s_view = gfx->stencil_att.iview;
420 if (s_view && s_view != z_view) {
421 const struct anv_image *image = s_view->image;
422 assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
423 const unsigned p = anv_image_aspect_to_plane(image,
424 VK_IMAGE_ASPECT_STENCIL_BIT);
425 const struct anv_image_plane *plane = &image->planes[p];
426
427 pixel_size += intel_calculate_surface_pixel_size(
428 &plane->primary_surface.isl);
429 }
430
431 if (!pixel_size)
432 return false;
433
434 /* Compute a tile layout that allows reasonable utilization of the
435 * tile cache based on the per-pixel cache footprint estimated
436 * above.
437 */
438 intel_calculate_tile_dimensions(device->info, cmd_buffer->state.current_l3_config,
439 32, 32, fb_width, fb_height,
440 pixel_size, tile_width, tile_height);
441
442 /* Perform TBIMR tile passes only if the framebuffer covers more
443 * than a single tile.
444 */
445 return *tile_width < fb_width || *tile_height < fb_height;
446 }
447 #endif
448
449 /**
450 * This function takes the vulkan runtime values & dirty states and updates
451 * the values in anv_gfx_dynamic_state, flagging HW instructions for
452 * reemission if the values are changing.
453 *
454 * Nothing is emitted in the batch buffer.
455 *
456 * Returns a mask for state that we want to leave dirty afterwards.
457 */
458 anv_cmd_dirty_mask_t
459 genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
460 {
461 UNUSED struct anv_device *device = cmd_buffer->device;
462 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
463 const struct anv_graphics_pipeline *pipeline =
464 anv_pipeline_to_graphics(gfx->base.pipeline);
465 const struct vk_dynamic_graphics_state *dyn =
466 &cmd_buffer->vk.dynamic_graphics_state;
467 struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
468 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
469 struct anv_instance *instance = cmd_buffer->device->physical->instance;
470 anv_cmd_dirty_mask_t dirty_state_mask = 0;
471
472 #define GET(field) hw_state->field
473 #define SET(bit, field, value) \
474 do { \
475 __typeof(hw_state->field) __v = value; \
476 if (hw_state->field != __v) { \
477 hw_state->field = __v; \
478 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
479 } \
480 } while (0)
481 #define SET_STAGE(bit, field, value, stage) \
482 do { \
483 __typeof(hw_state->field) __v = value; \
484 if (!anv_pipeline_has_stage(pipeline, \
485 MESA_SHADER_##stage)) { \
486 hw_state->field = __v; \
487 break; \
488 } \
489 if (hw_state->field != __v) { \
490 hw_state->field = __v; \
491 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
492 } \
493 } while (0)
494
495 #define SETUP_PROVOKING_VERTEX(bit, cmd, mode) \
496 switch (mode) { \
497 case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: \
498 SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0); \
499 SET(bit, cmd.LineStripListProvokingVertexSelect, 0); \
500 SET(bit, cmd.TriangleFanProvokingVertexSelect, 1); \
501 break; \
502 case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: \
503 SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2); \
504 SET(bit, cmd.LineStripListProvokingVertexSelect, 1); \
505 SET(bit, cmd.TriangleFanProvokingVertexSelect, 2); \
506 break; \
507 default: \
508 unreachable("Invalid provoking vertex mode"); \
509 } \
510
511 UNUSED bool fs_msaa_changed = false;
512 if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
513 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
514 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
515 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR)) {
516 enum intel_msaa_flags fs_msaa_flags = 0;
517
518 if (wm_prog_data) {
519 /* If we have any dynamic bits here, we might need to update the
520 * value in the push constant for the shader.
521 */
522 if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES ||
523 wm_prog_data->persample_dispatch == BRW_SOMETIMES ||
524 wm_prog_data->alpha_to_coverage == BRW_SOMETIMES) {
525 fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC;
526
527 if (dyn->ms.rasterization_samples > 1) {
528 fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO;
529
530 if (wm_prog_data->sample_shading) {
531 assert(wm_prog_data->persample_dispatch != BRW_NEVER);
532 fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH;
533 }
534 if ((pipeline->sample_shading_enable &&
535 (pipeline->min_sample_shading * dyn->ms.rasterization_samples) > 1) ||
536 wm_prog_data->sample_shading) {
537 fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH |
538 INTEL_MSAA_FLAG_PERSAMPLE_INTERP;
539 }
540 }
541
542 if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES &&
543 !(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)) {
544 fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG |
545 INTEL_MSAA_FLAG_COARSE_RT_WRITES;
546 }
547
548 if (wm_prog_data->alpha_to_coverage == BRW_SOMETIMES &&
549 dyn->ms.alpha_to_coverage_enable)
550 fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE;
551
552 /* Compare against the last push constant value and update it if it changed */
553
554 if (gfx->base.push_constants.gfx.fs_msaa_flags != fs_msaa_flags) {
555 gfx->base.push_constants.gfx.fs_msaa_flags = fs_msaa_flags;
556 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
557 gfx->base.push_constants_data_dirty = true;
558 }
559 }
560 }
561
562 if (fs_msaa_flags != gfx->fs_msaa_flags) {
563 gfx->fs_msaa_flags = fs_msaa_flags;
564 gfx->dirty |= ANV_CMD_DIRTY_FS_MSAA_FLAGS;
565 }
566 }
567
568 if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
569 (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) ||
570 (gfx->dirty & ANV_CMD_DIRTY_COARSE_PIXEL_ACTIVE)) {
571 if (wm_prog_data) {
572 const struct anv_shader_bin *fs_bin =
573 pipeline->base.shaders[MESA_SHADER_FRAGMENT];
574
575 struct GENX(3DSTATE_PS) ps = {};
576 intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
577 MAX2(dyn->ms.rasterization_samples, 1),
578 gfx->fs_msaa_flags);
579
580 SET(PS, ps.KernelStartPointer0,
581 fs_bin->kernel.offset +
582 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
583 SET(PS, ps.KernelStartPointer1,
584 fs_bin->kernel.offset +
585 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
586 #if GFX_VER < 20
587 SET(PS, ps.KernelStartPointer2,
588 fs_bin->kernel.offset +
589 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
590 #endif
591
592 SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0,
593 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0));
594 SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1,
595 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1));
596 #if GFX_VER < 20
597 SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2,
598 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2));
599 #endif
600
601 #if GFX_VER < 20
602 SET(PS, ps._8PixelDispatchEnable, ps._8PixelDispatchEnable);
603 SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable);
604 SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable);
605 #else
606 SET(PS, ps.Kernel0Enable, ps.Kernel0Enable);
607 SET(PS, ps.Kernel1Enable, ps.Kernel1Enable);
608 SET(PS, ps.Kernel0SIMDWidth, ps.Kernel0SIMDWidth);
609 SET(PS, ps.Kernel1SIMDWidth, ps.Kernel1SIMDWidth);
610 SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy);
611 #endif
612
613 SET(PS, ps.PositionXYOffsetSelect,
614 !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
615 brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags) ?
616 POSOFFSET_SAMPLE : POSOFFSET_CENTROID);
617
618 SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample,
619 brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags));
620 #if GFX_VER >= 11
621 const bool uses_coarse_pixel =
622 brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags);
623 SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel, uses_coarse_pixel);
624 #endif
625 #if GFX_VERx10 >= 125
626 enum anv_coarse_pixel_state cps_state = uses_coarse_pixel ?
627 ANV_COARSE_PIXEL_STATE_ENABLED : ANV_COARSE_PIXEL_STATE_DISABLED;
628 bool cps_state_toggled =
629 genX(cmd_buffer_set_coarse_pixel_active)(cmd_buffer, cps_state);
630 if (cps_state_toggled)
631 dirty_state_mask |= ANV_CMD_DIRTY_COARSE_PIXEL_ACTIVE;
632
633 const bool needs_ps_dependency =
634 /* TODO: We should only require this when the last geometry shader
635 * uses a fragment shading rate that is not constant.
636 */
637 uses_coarse_pixel || cps_state_toggled;
638 SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange, needs_ps_dependency);
639 #endif
640 SET(WM, wm.BarycentricInterpolationMode,
641 wm_prog_data_barycentric_modes(wm_prog_data, gfx->fs_msaa_flags));
642 } else {
643 #if GFX_VER < 20
644 SET(PS, ps._8PixelDispatchEnable, false);
645 SET(PS, ps._16PixelDispatchEnable, false);
646 SET(PS, ps._32PixelDispatchEnable, false);
647 #else
648 SET(PS, ps.Kernel0Enable, false);
649 SET(PS, ps.Kernel1Enable, false);
650 #endif
651 }
652 }
653
654 if ((gfx->dirty & (ANV_CMD_DIRTY_PIPELINE |
655 ANV_CMD_DIRTY_XFB_ENABLE |
656 ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) ||
657 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
658 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM) ||
659 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
660 SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
661 SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);
662
663 #if INTEL_NEEDS_WA_18022508906
664 /* Wa_18022508906 :
665 *
666 * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
667 *
668 * SOL_INT::Render_Enable =
669 * (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
670 * (
671 * (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
672 * !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
673 * !3DSTATE_STREAMOUT::API_Render_Disable &&
674 * (
675 * 3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
676 * 3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
677 * 3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
678 * 3DSTATE_PS_EXTRA::PS_Valid ||
679 * 3DSTATE_WM::Legacy Depth_Buffer_Clear ||
680 * 3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
681 * 3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
682 * )
683 * )
684 *
685 * If SOL_INT::Render_Enable is false, the SO stage will not forward any
686 * topologies down the pipeline, which is not what we want for occlusion
687 * queries.
688 *
689 * Here we force rendering to get SOL_INT::Render_Enable when occlusion
690 * queries are active.
691 */
692 SET(STREAMOUT, so.ForceRendering,
693 (!GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0) ?
694 Force_on : 0);
695 #endif
696
697 switch (dyn->rs.provoking_vertex) {
698 case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
699 SET(STREAMOUT, so.ReorderMode, LEADING);
700 SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY);
701 break;
702
703 case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
704 SET(STREAMOUT, so.ReorderMode, TRAILING);
705 SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY);
706 break;
707
708 default:
709 unreachable("Invalid provoking vertex mode");
710 }
711 }
712
713 if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
714 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
715 uint32_t topology;
716 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
717 topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points);
718 else
719 topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
720
721 gfx->primitive_topology = topology;
722
723 SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology);
724 }
725
726 if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
727 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
728 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
729 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
730 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
731
732 #if GFX_VER >= 11
733 if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate &&
734 ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
735 (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) ||
736 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))) {
737 const bool cps_enable = wm_prog_data &&
738 brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags);
739 #if GFX_VER == 11
740 SET(CPS, cps.CoarsePixelShadingMode,
741 cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE);
742 SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
743 SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
744 #elif GFX_VER >= 12
745 SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
746 get_cps_state_offset(cmd_buffer, cps_enable, &dyn->fsr));
747 #endif
748 }
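/* Note the split above: Gfx11 programs the coarse pixel size directly in
 * 3DSTATE_CPS, while Gfx12+ points the CPS state array pointer at one of the
 * pre-baked CPS_STATE entries selected by get_cps_state_offset().
 */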
749 #endif /* GFX_VER >= 11 */
750
751 if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
752 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
753 const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
754
755 if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
756 if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
757 SET(TE, te.OutputTopology, tes_prog_data->output_topology);
758 } else {
759 /* When the origin is upper-left, we have to flip the winding order */
760 if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
761 SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
762 } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
763 SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
764 } else {
765 SET(TE, te.OutputTopology, tes_prog_data->output_topology);
766 }
767 }
768 } else {
769 SET(TE, te.OutputTopology, OUTPUT_POINT);
770 }
771 }
772
773 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
774 SET(SF, sf.LineWidth, dyn->rs.line.width);
775
776 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
777 SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
778 SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
779 }
780
781 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
782 /**
783 * From the Vulkan Spec:
784 *
785 * "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth
786 * bias representation is a factor of constant r equal to 1."
787 *
788 * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
789 *
790 * "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
791 *
792 * Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
793 *
794 * Where r is the minimum representable value > 0 in the depth
795 * buffer format, converted to float32 (note: If state bit Legacy
796 * Global Depth Bias Enable is set, the r term will be forced to
797 * 1.0)"
798 *
799 * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
800 * LegacyGlobalDepthBiasEnable.
801 */
802 SET(SF, sf.LegacyGlobalDepthBiasEnable,
803 dyn->rs.depth_bias.representation ==
804 VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT);
805 }
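/* Illustrative example (not a PRM quote): with a 16-bit UNORM depth buffer,
 * r is roughly 2^-16, so depthBiasConstantFactor = 4 yields a bias of about
 * 4 * 2^-16 plus the slope term. With the FLOAT representation we set
 * LegacyGlobalDepthBiasEnable above, forcing r to 1.0 so the constant factor
 * is applied directly, as the Vulkan extension requires.
 */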
806
807 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
808 SET(CLIP, clip.APIMode, dyn->vp.depth_clip_negative_one_to_one ? APIMODE_OGL : APIMODE_D3D);
809
810 if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
811 (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
812 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
813 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
814 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
815 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
816 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
817 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
818 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
819 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
820 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
821 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
822 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE)) {
823 /* Take dynamic primitive topology into account with
824 * 3DSTATE_RASTER::APIMode
825 * 3DSTATE_RASTER::DXMultisampleRasterizationEnable
826 * 3DSTATE_RASTER::AntialiasingEnable
827 */
828 uint32_t api_mode = 0;
829 bool msaa_raster_enable = false;
830
831 const VkLineRasterizationModeKHR line_mode =
832 anv_line_rasterization_mode(dyn->rs.line.mode,
833 dyn->ms.rasterization_samples);
834
835 const VkPolygonMode dynamic_raster_mode =
836 genX(raster_polygon_mode)(pipeline,
837 dyn->rs.polygon_mode,
838 dyn->ia.primitive_topology);
839
840 genX(rasterization_mode)(dynamic_raster_mode,
841 line_mode, dyn->rs.line.width,
842 &api_mode, &msaa_raster_enable);
843
844 /* From the Broadwell PRM, Volume 2, documentation for
845 * 3DSTATE_RASTER, "Antialiasing Enable":
846 *
847 * "This field must be disabled if any of the render targets
848 * have integer (UINT or SINT) surface format."
849 *
850 * Additionally internal documentation for Gfx12+ states:
851 *
852 * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
853 * FORCED_SAMPLE_COUNT > 1."
854 */
855 const bool aa_enable =
856 anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) &&
857 !gfx->has_uint_rt &&
858 !(GFX_VER >= 12 && gfx->samples > 1);
859
860 const bool depth_clip_enable =
861 vk_rasterization_state_depth_clip_enable(&dyn->rs);
862
863 const bool xy_clip_test_enable =
864 (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
865
866 SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable);
867
868 SET(RASTER, raster.APIMode, api_mode);
869 SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable);
870 SET(RASTER, raster.AntialiasingEnable, aa_enable);
871 SET(RASTER, raster.CullMode, genX(vk_to_intel_cullmode)[dyn->rs.cull_mode]);
872 SET(RASTER, raster.FrontWinding, genX(vk_to_intel_front_face)[dyn->rs.front_face]);
873 SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
874 SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
875 SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
876 SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant);
877 SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope);
878 SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);
879 SET(RASTER, raster.FrontFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
880 SET(RASTER, raster.BackFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
881 SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);
882 SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
883 SET(RASTER, raster.ConservativeRasterizationEnable,
884 dyn->rs.conservative_mode !=
885 VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
886 }
887
888 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
889 SET(MULTISAMPLE, ms.NumberofMultisamples,
890 __builtin_ffs(MAX2(dyn->ms.rasterization_samples, 1)) - 1);
891 }
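/* The ffs-based encoding above maps the sample count to its log2:
 * 1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3, 16 -> 4, which is the encoding
 * programmed into NumberofMultisamples.
 */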
892
893 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
894 /* From the Vulkan 1.0 spec:
895 * If pSampleMask is NULL, it is treated as if the mask has all bits
896 * enabled, i.e. no coverage is removed from fragments.
897 *
898 * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
899 */
900 SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff);
901 }
902
903 if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
904 #if GFX_VER == 9
905 /* For the PMA fix */
906 (gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
907 #endif
908 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
909 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
910 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
911 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
912 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
913 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
914 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
915 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
916 VkImageAspectFlags ds_aspects = 0;
917 if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED)
918 ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
919 if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED)
920 ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
921
922 struct vk_depth_stencil_state opt_ds = dyn->ds;
923 vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);
924
925 SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true);
926
927 SET(WM_DEPTH_STENCIL, ds.StencilTestMask,
928 opt_ds.stencil.front.compare_mask & 0xff);
929 SET(WM_DEPTH_STENCIL, ds.StencilWriteMask,
930 opt_ds.stencil.front.write_mask & 0xff);
931
932 SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
933 SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);
934
935 SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue,
936 opt_ds.stencil.front.reference & 0xff);
937 SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue,
938 opt_ds.stencil.back.reference & 0xff);
939
940 SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable);
941 SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
942 SET(WM_DEPTH_STENCIL, ds.DepthTestFunction,
943 genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]);
944 SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable);
945 SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable, opt_ds.stencil.write_enable);
946 SET(WM_DEPTH_STENCIL, ds.StencilFailOp,
947 genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]);
948 SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp,
949 genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]);
950 SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp,
951 genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]);
952 SET(WM_DEPTH_STENCIL, ds.StencilTestFunction,
953 genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]);
954 SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp,
955 genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]);
956 SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp,
957 genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]);
958 SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp,
959 genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]);
960 SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction,
961 genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]);
962
963 #if GFX_VER == 9
964 const bool pma = want_stencil_pma_fix(cmd_buffer, dyn, &opt_ds);
965 SET(PMA_FIX, pma_fix, pma);
966 #endif
967
968 #if INTEL_WA_18019816803_GFX_VER
969 if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
970 bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
971 SET(WA_18019816803, ds_write_state, ds_write_state);
972 }
973 #endif
974 }
975
976 #if INTEL_WA_14018283232_GFX_VER
977 if (intel_needs_workaround(cmd_buffer->device->info, 14018283232) &&
978 ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
979 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE))) {
980 SET(WA_14018283232, wa_14018283232_toggle,
981 dyn->ds.depth.bounds_test.enable &&
982 wm_prog_data &&
983 wm_prog_data->uses_kill);
984 }
985 #endif
986
987 #if GFX_VER >= 12
988 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
989 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
990 SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable);
991 /* Only look at updating the bounds if testing is enabled */
992 if (dyn->ds.depth.bounds_test.enable) {
993 SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
994 SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
995 }
996 }
997 #endif
998
999 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
1000 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE)) {
1001 SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
1002 SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount,
1003 1.0f / MAX2(1, dyn->rs.line.stipple.factor));
1004 SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);
1005
1006 SET(WM, wm.LineStippleEnable, dyn->rs.line.stipple.enable);
1007 }
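/* Example: a stipple factor of 3 with pattern 0x00ff repeats each pattern bit
 * for 3 pixels along the line; the hardware also wants the reciprocal (1/3)
 * pre-computed in LineStippleInverseRepeatCount, and the MAX2() above simply
 * avoids a division by zero for a zero factor.
 */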
1008
1009 if ((gfx->dirty & ANV_CMD_DIRTY_RESTART_INDEX) ||
1010 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1011 SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable);
1012 SET(VF, vf.CutIndex, gfx->restart_index);
1013 }
1014
1015 if (gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER)
1016 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER);
1017
1018 #if GFX_VERx10 >= 125
1019 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
1020 SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable);
1021 #endif
1022
1023 if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
1024 (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
1025 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
1026 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN);
1027
1028 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
1029 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) {
1030 SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
1031 wm_prog_data && (pipeline->rp_has_ds_self_dep ||
1032 has_ds_feedback_loop(dyn) ||
1033 wm_prog_data->uses_kill),
1034 FRAGMENT);
1035 }
1036
1037 #if GFX_VERx10 >= 125
1038 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
1039 SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV,
1040 wm_prog_data && wm_prog_data->has_side_effects,
1041 FRAGMENT);
1042 }
1043 #else
1044 if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
1045 ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) {
1046 /* Prior to Gfx12.5 the HW seems to avoid spawning fragment shaders even
1047 * if 3DSTATE_PS_EXTRA::PixelShaderKillsPixel=true when
1048 * 3DSTATE_PS_BLEND::HasWriteableRT=false. This is causing problems with
1049 * occlusion queries with 0 attachments. There are no CTS tests
1050 * exercising this but zink+anv fails a bunch of tests like piglit
1051 * arb_framebuffer_no_attachments-query.
1052 *
1053 * Here we choose to tweak the PixelShaderHasUAV to make sure the
1054 * fragment shaders are run properly.
1055 */
1056 SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV,
1057 wm_prog_data && (wm_prog_data->has_side_effects ||
1058 (gfx->color_att_count == 0 &&
1059 gfx->n_occlusion_queries > 0)),
1060 FRAGMENT);
1061 }
1062 #endif
1063
1064 if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1065 (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
1066 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
1067 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
1068 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
1069 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
1070 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
1071 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
1072 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
1073 const uint8_t color_writes = dyn->cb.color_write_enables;
1074 bool has_writeable_rt =
1075 anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
1076 !anv_cmd_buffer_all_color_write_masked(cmd_buffer);
1077
1078 SET(BLEND_STATE, blend.AlphaToCoverageEnable,
1079 dyn->ms.alpha_to_coverage_enable);
1080 SET(BLEND_STATE, blend.AlphaToOneEnable,
1081 dyn->ms.alpha_to_one_enable);
1082 SET(BLEND_STATE, blend.ColorDitherEnable,
1083 cmd_buffer->state.gfx.rendering_flags & VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT);
1084
1085 bool independent_alpha_blend = false;
1086 /* Wa_14018912822, check if we set these during RT setup. */
1087 bool color_blend_zero = false;
1088 bool alpha_blend_zero = false;
1089 for (uint32_t i = 0; i < MAX_RTS; i++) {
1090 /* Disable anything above the current number of color attachments. */
1091 bool write_disabled = i >= gfx->color_att_count ||
1092 (color_writes & BITFIELD_BIT(i)) == 0;
1093
1094 SET(BLEND_STATE, blend.rts[i].WriteDisableAlpha,
1095 write_disabled ||
1096 (dyn->cb.attachments[i].write_mask &
1097 VK_COLOR_COMPONENT_A_BIT) == 0);
1098 SET(BLEND_STATE, blend.rts[i].WriteDisableRed,
1099 write_disabled ||
1100 (dyn->cb.attachments[i].write_mask &
1101 VK_COLOR_COMPONENT_R_BIT) == 0);
1102 SET(BLEND_STATE, blend.rts[i].WriteDisableGreen,
1103 write_disabled ||
1104 (dyn->cb.attachments[i].write_mask &
1105 VK_COLOR_COMPONENT_G_BIT) == 0);
1106 SET(BLEND_STATE, blend.rts[i].WriteDisableBlue,
1107 write_disabled ||
1108 (dyn->cb.attachments[i].write_mask &
1109 VK_COLOR_COMPONENT_B_BIT) == 0);
1110 /* Vulkan specification 1.2.168, VkLogicOp:
1111 *
1112 * "Logical operations are controlled by the logicOpEnable and
1113 * logicOp members of VkPipelineColorBlendStateCreateInfo. If
1114 * logicOpEnable is VK_TRUE, then a logical operation selected by
1115 * logicOp is applied between each color attachment and the
1116 * fragment’s corresponding output value, and blending of all
1117 * attachments is treated as if it were disabled."
1118 *
1119 * From the Broadwell PRM Volume 2d: Command Reference: Structures:
1120 * BLEND_STATE_ENTRY:
1121 *
1122 * "Enabling LogicOp and Color Buffer Blending at the same time is
1123 * UNDEFINED"
1124 */
1125 SET(BLEND_STATE, blend.rts[i].LogicOpFunction,
1126 genX(vk_to_intel_logic_op)[dyn->cb.logic_op]);
1127 SET(BLEND_STATE, blend.rts[i].LogicOpEnable, dyn->cb.logic_op_enable);
1128
1129 SET(BLEND_STATE, blend.rts[i].ColorClampRange, COLORCLAMP_RTFORMAT);
1130 SET(BLEND_STATE, blend.rts[i].PreBlendColorClampEnable, true);
1131 SET(BLEND_STATE, blend.rts[i].PostBlendColorClampEnable, true);
1132
1133 /* Setup blend equation. */
1134 SET(BLEND_STATE, blend.rts[i].ColorBlendFunction,
1135 genX(vk_to_intel_blend_op)[
1136 dyn->cb.attachments[i].color_blend_op]);
1137 SET(BLEND_STATE, blend.rts[i].AlphaBlendFunction,
1138 genX(vk_to_intel_blend_op)[
1139 dyn->cb.attachments[i].alpha_blend_op]);
1140
1141 if (dyn->cb.attachments[i].src_color_blend_factor !=
1142 dyn->cb.attachments[i].src_alpha_blend_factor ||
1143 dyn->cb.attachments[i].dst_color_blend_factor !=
1144 dyn->cb.attachments[i].dst_alpha_blend_factor ||
1145 dyn->cb.attachments[i].color_blend_op !=
1146 dyn->cb.attachments[i].alpha_blend_op) {
1147 independent_alpha_blend = true;
1148 }
1149
1150 /* The Dual Source Blending documentation says:
1151 *
1152 * "If SRC1 is included in a src/dst blend factor and
1153 * a DualSource RT Write message is not used, results
1154 * are UNDEFINED. (This reflects the same restriction in DX APIs,
1155 * where undefined results are produced if “o1” is not written
1156 * by a PS – there are no default values defined)."
1157 *
1158 * There is no way to gracefully fix this undefined situation
1159 * so we just disable the blending to prevent possible issues.
1160 */
1161 if (wm_prog_data && !wm_prog_data->dual_src_blend &&
1162 anv_is_dual_src_blend_equation(&dyn->cb.attachments[i])) {
1163 SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable, false);
1164 } else {
1165 SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable,
1166 !dyn->cb.logic_op_enable &&
1167 dyn->cb.attachments[i].blend_enable);
1168 }
1169
1170 /* Our hardware applies the blend factor prior to the blend function
1171 * regardless of what function is used. Technically, this means the
1172 * hardware can do MORE than GL or Vulkan specify. However, it also
1173 * means that, for MIN and MAX, we have to stomp the blend factor to
1174 * ONE to make it a no-op.
1175 */
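/* e.g. VK_BLEND_OP_MIN expects min(Cs, Cd) regardless of the factors; since
 * the HW computes min(Cs * SrcFactor, Cd * DstFactor), forcing both factors
 * to BLENDFACTOR_ONE below gives the required result.
 */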
1176 uint32_t SourceBlendFactor;
1177 uint32_t DestinationBlendFactor;
1178 uint32_t SourceAlphaBlendFactor;
1179 uint32_t DestinationAlphaBlendFactor;
1180 if (dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MIN ||
1181 dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MAX) {
1182 SourceBlendFactor = BLENDFACTOR_ONE;
1183 DestinationBlendFactor = BLENDFACTOR_ONE;
1184 } else {
1185 SourceBlendFactor = genX(vk_to_intel_blend)[
1186 dyn->cb.attachments[i].src_color_blend_factor];
1187 DestinationBlendFactor = genX(vk_to_intel_blend)[
1188 dyn->cb.attachments[i].dst_color_blend_factor];
1189 }
1190
1191 if (dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MIN ||
1192 dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MAX) {
1193 SourceAlphaBlendFactor = BLENDFACTOR_ONE;
1194 DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
1195 } else {
1196 SourceAlphaBlendFactor = genX(vk_to_intel_blend)[
1197 dyn->cb.attachments[i].src_alpha_blend_factor];
1198 DestinationAlphaBlendFactor = genX(vk_to_intel_blend)[
1199 dyn->cb.attachments[i].dst_alpha_blend_factor];
1200 }
1201
1202 /* Replace any Src1 value with 1.0 if dual-source blending is not
1203 * enabled.
1204 */
1205 if (wm_prog_data && !wm_prog_data->dual_src_blend) {
1206 if (is_src1_blend_factor(SourceBlendFactor))
1207 SourceBlendFactor = BLENDFACTOR_ONE;
1208 if (is_src1_blend_factor(DestinationBlendFactor))
1209 DestinationBlendFactor = BLENDFACTOR_ONE;
1210 }
1211
1212 if (instance->intel_enable_wa_14018912822 &&
1213 intel_needs_workaround(cmd_buffer->device->info, 14018912822) &&
1214 dyn->ms.rasterization_samples > 1) {
1215 if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
1216 DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
1217 color_blend_zero = true;
1218 }
1219 if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
1220 DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
1221 alpha_blend_zero = true;
1222 }
1223 }
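/* Note: the substituted CONST_COLOR/CONST_ALPHA factors still behave like
 * ZERO because the blend constants are forced to 0.0 further down when
 * color_blend_zero/alpha_blend_zero are set (see the CC_STATE block below).
 */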
1224
1225 SET(BLEND_STATE, blend.rts[i].SourceBlendFactor, SourceBlendFactor);
1226 SET(BLEND_STATE, blend.rts[i].DestinationBlendFactor, DestinationBlendFactor);
1227 SET(BLEND_STATE, blend.rts[i].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
1228 SET(BLEND_STATE, blend.rts[i].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
1229 }
1230 gfx->color_blend_zero = color_blend_zero;
1231 gfx->alpha_blend_zero = alpha_blend_zero;
1232
1233 SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);
1234
1235 /* Program 3DSTATE_PS_BLEND to be consistent with the rest of the
1236 * BLEND_STATE_ENTRY.
1237 */
1238 SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
1239 SET(PS_BLEND, ps_blend.ColorBufferBlendEnable, GET(blend.rts[0].ColorBufferBlendEnable));
1240 SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor, GET(blend.rts[0].SourceAlphaBlendFactor));
1241 SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor, gfx->alpha_blend_zero ?
1242 BLENDFACTOR_CONST_ALPHA :
1243 GET(blend.rts[0].DestinationAlphaBlendFactor));
1244 SET(PS_BLEND, ps_blend.SourceBlendFactor, GET(blend.rts[0].SourceBlendFactor));
1245 SET(PS_BLEND, ps_blend.DestinationBlendFactor, gfx->color_blend_zero ?
1246 BLENDFACTOR_CONST_COLOR :
1247 GET(blend.rts[0].DestinationBlendFactor));
1248 SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
1249 SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable, GET(blend.IndependentAlphaBlendEnable));
1250 SET(PS_BLEND, ps_blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable);
1251 }
1252
1253 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
1254 SET(CC_STATE, cc.BlendConstantColorRed,
1255 gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]);
1256 SET(CC_STATE, cc.BlendConstantColorGreen,
1257 gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]);
1258 SET(CC_STATE, cc.BlendConstantColorBlue,
1259 gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]);
1260 SET(CC_STATE, cc.BlendConstantColorAlpha,
1261 gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]);
1262 }
1263
1264 if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
1265 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1266 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
1267 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
1268 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1269 struct anv_instance *instance = cmd_buffer->device->physical->instance;
1270 const VkViewport *viewports = dyn->vp.viewports;
1271
1272 const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;
1273
1274 /* From the Vulkan 1.0.45 spec:
1275 *
1276 * "If the last active vertex processing stage shader entry point's
1277 * interface does not include a variable decorated with
1278 * ViewportIndex, then the first viewport is used."
1279 *
1280 * This could mean that we might need to set the MaximumVPIndex based on
1281 * the pipeline's last stage, but if the last shader doesn't write the
1282 * viewport index and the VUE header is used, the compiler will force
1283 * the value to 0 (which is what the spec requires above). Otherwise it
1284 * seems like the HW should be pulling 0 if the VUE header is not
1285 * present.
1286 *
1287 * Avoiding a check on the pipeline seems to prevent additional
1288 * emissions of 3DSTATE_CLIP which appear to impact performance on
1289 * Assassin's Creed Valhalla.
1290 */
1291 SET(CLIP, clip.MaximumVPIndex, dyn->vp.viewport_count > 0 ?
1292 dyn->vp.viewport_count - 1 : 0);
1293
1294 for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1295 const VkViewport *vp = &viewports[i];
1296
1297 /* The gfx7 state struct has just the matrix and guardband fields; the
1298 * gfx8 struct adds the min/max viewport fields. */
1299 struct GENX(SF_CLIP_VIEWPORT) sfv = {
1300 .ViewportMatrixElementm00 = vp->width / 2,
1301 .ViewportMatrixElementm11 = vp->height / 2,
1302 .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
1303 .ViewportMatrixElementm30 = vp->x + vp->width / 2,
1304 .ViewportMatrixElementm31 = vp->y + vp->height / 2,
1305 .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
1306 (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
1307 .XMinClipGuardband = -1.0f,
1308 .XMaxClipGuardband = 1.0f,
1309 .YMinClipGuardband = -1.0f,
1310 .YMaxClipGuardband = 1.0f,
1311 .XMinViewPort = vp->x,
1312 .XMaxViewPort = vp->x + vp->width - 1,
1313 .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
1314 .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
1315 };
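/* Sketch of the mapping above: for a viewport of x=0, y=0, width=1920,
 * height=1080, minDepth=0, maxDepth=1 (and the default zero-to-one depth
 * mode), m00=960, m11=540, m22=1, m30=960, m31=540, m32=0, i.e. NDC [-1,1]
 * maps to pixels [0,1920)x[0,1080) and depth [0,1].
 */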
1316
1317 /* Fix depth test misrenderings by lowering translated depth range */
1318 if (instance->lower_depth_range_rate != 1.0f)
1319 sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
1320
1321 const uint32_t fb_size_max = 1 << 14;
1322 uint32_t x_min = 0, x_max = fb_size_max;
1323 uint32_t y_min = 0, y_max = fb_size_max;
1324
1325 /* If we have a valid renderArea, include that */
1326 if (gfx->render_area.extent.width > 0 &&
1327 gfx->render_area.extent.height > 0) {
1328 x_min = MAX2(x_min, gfx->render_area.offset.x);
1329 x_max = MIN2(x_max, gfx->render_area.offset.x +
1330 gfx->render_area.extent.width);
1331 y_min = MAX2(y_min, gfx->render_area.offset.y);
1332 y_max = MIN2(y_max, gfx->render_area.offset.y +
1333 gfx->render_area.extent.height);
1334 }
1335
1336 /* The client is required to have enough scissors for whatever it
1337 * sets as ViewportIndex but it's possible that they've got more
1338 * viewports set from a previous command. Also, from the Vulkan
1339 * 1.3.207:
1340 *
1341 * "The application must ensure (using scissor if necessary) that
1342 * all rendering is contained within the render area."
1343 *
1344 * If the client doesn't set a scissor, that basically means it
1345 * guarantees everything is in-bounds already. If we end up using a
1346 * guardband of [-1, 1] in that case, there shouldn't be much loss.
1347 * It's theoretically possible that they could do all their clipping
1348 * with clip planes but that'd be a bit odd.
1349 */
1350 if (i < dyn->vp.scissor_count) {
1351 const VkRect2D *scissor = &dyn->vp.scissors[i];
1352 x_min = MAX2(x_min, scissor->offset.x);
1353 x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
1354 y_min = MAX2(y_min, scissor->offset.y);
1355 y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
1356 }
1357
1358 /* Only bother calculating the guardband if our known render area is
1359 * less than the maximum size. Otherwise, it will calculate [-1, 1]
1360 * anyway but possibly with precision loss.
1361 */
1362 if (x_min > 0 || x_max < fb_size_max ||
1363 y_min > 0 || y_max < fb_size_max) {
1364 intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
1365 sfv.ViewportMatrixElementm00,
1366 sfv.ViewportMatrixElementm11,
1367 sfv.ViewportMatrixElementm30,
1368 sfv.ViewportMatrixElementm31,
1369 &sfv.XMinClipGuardband,
1370 &sfv.XMaxClipGuardband,
1371 &sfv.YMinClipGuardband,
1372 &sfv.YMaxClipGuardband);
1373 }
1374
1375 #define SET_VP(bit, state, field) \
1376 do { \
1377 if (hw_state->state.field != sfv.field) { \
1378 hw_state->state.field = sfv.field; \
1379 BITSET_SET(hw_state->dirty, \
1380 ANV_GFX_STATE_##bit); \
1381 } \
1382 } while (0)
1383 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
1384 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
1385 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
1386 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
1387 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
1388 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
1389 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
1390 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
1391 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
1392 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
1393 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
1394 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
1395 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
1396 SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
1397 #undef SET_VP
1398
1399 const bool depth_range_unrestricted =
1400 cmd_buffer->device->vk.enabled_extensions.EXT_depth_range_unrestricted;
1401
1402 float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0;
1403 float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0;
1404
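/* With depth clamping enabled, clamp fragment depth to the viewport depth
 * range; otherwise use the API limits ([0,1], or unbounded when
 * VK_EXT_depth_range_unrestricted is enabled) so no extra clamping occurs.
 */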
1405 float min_depth = dyn->rs.depth_clamp_enable ?
1406 MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
1407 float max_depth = dyn->rs.depth_clamp_enable ?
1408 MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;
1409
1410 SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
1411 SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
1412 }
1413
1414 /* If the HW state is already considered dirty or the previously
1415 * programmed viewport count is smaller than what we need, update the
1416 * viewport count and ensure the HW state is dirty. Otherwise, if the
1417 * number of viewports programmed previously was larger than what we
1418 * need now, there is no need to reemit; we can keep the old values.
1419 */
1420 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
1421 hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
1422 hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
1423 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
1424 }
1425 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
1426 hw_state->vp_cc.count < dyn->vp.viewport_count) {
1427 hw_state->vp_cc.count = dyn->vp.viewport_count;
1428 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
1429 }
1430 }
1431
1432 if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
1433 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
1434 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
1435 const VkRect2D *scissors = dyn->vp.scissors;
1436 const VkViewport *viewports = dyn->vp.viewports;
1437
1438 for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
1439 const VkRect2D *s = &scissors[i];
1440 const VkViewport *vp = &viewports[i];
1441
1442 const int max = 0xffff;
1443
1444 uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
1445 uint32_t x_min = MAX2(s->offset.x, vp->x);
1446 int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
1447 MAX2(vp->y, vp->y + vp->height) - 1);
1448 int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
1449 vp->x + vp->width - 1);
1450
1451 y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
1452 x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
1453
1454 /* Do this math using int64_t so overflow gets clamped correctly. */
1455 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1456 y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
1457 x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
1458 y_max = CLAMP((uint64_t) y_max, 0,
1459 gfx->render_area.offset.y +
1460 gfx->render_area.extent.height - 1);
1461 x_max = CLAMP((uint64_t) x_max, 0,
1462 gfx->render_area.offset.x +
1463 gfx->render_area.extent.width - 1);
1464 }
1465
1466 if (s->extent.width <= 0 || s->extent.height <= 0) {
1467 /* Since xmax and ymax are inclusive, we have to have xmax < xmin
1468 * or ymax < ymin for empty clips. In case clip x, y, width and
1469 * height are all 0, the clamps above produce 0 for xmin, ymin,
1470 * xmax and ymax, which isn't what we want. Just special case empty
1471 * clips and produce a canonical empty clip.
1472 */
1473 SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
1474 SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
1475 SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
1476 SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
1477 } else {
1478 SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
1479 SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
1480 SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
1481 SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
1482 }
1483 }
1484
1485 /* If the HW state is already considered dirty or the previously
1486 * programmed scissor count is smaller than what we need, update the
1487 * scissor count and ensure the HW state is dirty. Otherwise, if the
1488 * number of scissors programmed previously was larger than what we
1489 * need now, there is no need to reemit; we can keep the old values.
1490 */
1491 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
1492 hw_state->scissor.count < dyn->vp.scissor_count) {
1493 hw_state->scissor.count = dyn->vp.scissor_count;
1494 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
1495 }
1496 }
1497
1498 #if GFX_VERx10 == 125
1499 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)) {
1500 unsigned fb_width, fb_height, tile_width, tile_height;
1501
1502 if (cmd_buffer->device->physical->instance->enable_tbimr &&
1503 calculate_render_area(cmd_buffer, &fb_width, &fb_height) &&
1504 calculate_tile_dimensions(cmd_buffer, fb_width, fb_height,
1505 &tile_width, &tile_height)) {
1506 /* Use a batch size of 128 polygons per slice as recommended
1507 * by BSpec 68436 "TBIMR Programming".
1508 */
1509 const unsigned num_slices = cmd_buffer->device->info->num_slices;
1510 const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
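/* For example, with 2 slices this is DIV_ROUND_UP(2, 2) * 256 = 256
 * polygons, which TBIMRBatchSize below encodes as log2(256) - 5 = 3.
 */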
1511
1512 SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
1513 SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
1514 SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount,
1515 DIV_ROUND_UP(fb_height, tile_height));
1516 SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount,
1517 DIV_ROUND_UP(fb_width, tile_width));
1518 SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
1519 util_logbase2(batch_size) - 5);
1520 SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
1521 SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
1522 } else {
1523 hw_state->use_tbimr = false;
1524 }
1525 }
1526 #endif
1527
1528 struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants;
1529
1530 /* If the pipeline uses a dynamic value for patch_control_points and
1531 * either the pipeline or the dynamic value changed, check the value
1532 * and reemit if needed.
1533 */
1534 if (pipeline->dynamic_patch_control_points &&
1535 ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1536 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) &&
1537 push->gfx.tcs_input_vertices != dyn->ts.patch_control_points) {
1538 push->gfx.tcs_input_vertices = dyn->ts.patch_control_points;
1539 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
1540 gfx->base.push_constants_data_dirty = true;
1541 }
1542
1543 #undef GET
1544 #undef SET
1545 #undef SET_STAGE
1546
1547 vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
1548
1549 return dirty_state_mask;
1550 }
1551
1552 static void
1553 emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
1554 {
1555 #if GFX_VERx10 >= 125
1556 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
1557 vfg.DistributionMode = RR_STRICT;
1558 }
1559 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
1560 vf.GeometryDistributionEnable = true;
1561 }
1562 #endif
1563
1564 #if GFX_VER >= 12
1565 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
1566 pr.ReplicaMask = 1;
1567 }
1568 #endif
1569
1570 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
1571 rr.CullMode = CULLMODE_NONE;
1572 rr.FrontFaceFillMode = FILL_MODE_SOLID;
1573 rr.BackFaceFillMode = FILL_MODE_SOLID;
1574 }
1575
1576 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
1577 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);
1578
1579 #if GFX_VER >= 11
1580 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
1581 #endif
1582
1583 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
1584 clip.ClipEnable = true;
1585 clip.ClipMode = CLIPMODE_REJECT_ALL;
1586 }
1587
1588 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
1589 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
1590 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
1591 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
1592 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
1593 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);
1594
1595 uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2,
1596 GENX(3DSTATE_VERTEX_ELEMENTS));
1597 uint32_t *ve_pack_dest = &vertex_elements[1];
1598
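/* Two dummy vertex elements: element 0 stores (0, 0, 0, 0) and element 1
 * stores (0, 0, 1, 1), enough to feed the dummy triangle-list draws below.
 */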
1599 for (int i = 0; i < 2; i++) {
1600 struct GENX(VERTEX_ELEMENT_STATE) element = {
1601 .Valid = true,
1602 .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
1603 .Component0Control = VFCOMP_STORE_0,
1604 .Component1Control = VFCOMP_STORE_0,
1605 .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
1606 .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
1607 };
1608 GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
1609 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
1610 }
1611
1612 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
1613 topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
1614 }
1615
1616 /* Emit dummy draw per slice. */
1617 for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
1618 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1619 prim.VertexCountPerInstance = 3;
1620 prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
1621 prim.InstanceCount = 1;
1622 prim.VertexAccessType = SEQUENTIAL;
1623 }
1624 }
1625 }
1626
1627 #if INTEL_WA_14018283232_GFX_VER
1628 void
1629 genX(batch_emit_wa_14018283232)(struct anv_batch *batch)
1630 {
1631 anv_batch_emit(batch, GENX(RESOURCE_BARRIER), barrier) {
1632 barrier.ResourceBarrierBody = (struct GENX(RESOURCE_BARRIER_BODY)) {
1633 .BarrierType = RESOURCE_BARRIER_TYPE_IMMEDIATE,
1634 .SignalStage = RESOURCE_BARRIER_STAGE_COLOR,
1635 .WaitStage = RESOURCE_BARRIER_STAGE_PIXEL,
1636 };
1637 }
1638 }
1639 #endif
1640
1641 /**
1642 * This function handles dirty state emission to the batch buffer.
1643 */
1644 static void
1645 cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
1646 {
1647 struct anv_device *device = cmd_buffer->device;
1648 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1649 struct anv_graphics_pipeline *pipeline =
1650 anv_pipeline_to_graphics(gfx->base.pipeline);
1651 const struct vk_dynamic_graphics_state *dyn =
1652 &cmd_buffer->vk.dynamic_graphics_state;
1653 struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
1654 const bool protected = cmd_buffer->vk.pool->flags &
1655 VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
1656
1657 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) {
1658 genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg);
1659
1660 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb);
1661
1662 memcpy(&gfx->urb_cfg, &pipeline->urb_cfg,
1663 sizeof(struct intel_urb_config));
1664 }
1665
1666 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION))
1667 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.primitive_replication);
1668
1669 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING))
1670 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_instancing);
1671
1672 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS))
1673 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs);
1674
1675 #if GFX_VER >= 11
1676 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2))
1677 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_2);
1678 #endif
1679
1680 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS)) {
1681 anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1682 final.vs, protected);
1683 }
1684
1685 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS)) {
1686 anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1687 final.hs, protected);
1688 }
1689
1690 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS)) {
1691 anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1692 final.ds, protected);
1693 }
1694
1695 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS))
1696 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_statistics);
1697
1698 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE))
1699 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe);
1700
1701 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ))
1702 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_swiz);
1703
1704 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
1705 /* Wa_16011773973:
1706 * If SOL is enabled and SO_DECL state has to be programmed,
1707 * 1. Send 3D State SOL state with SOL disabled
1708 * 2. Send SO_DECL NP state
1709 * 3. Send 3D State SOL with SOL Enabled
1710 */
1711 if (intel_needs_workaround(device->info, 16011773973) &&
1712 pipeline->uses_xfb)
1713 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so);
1714
1715 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
1716 final.so_decl_list);
1717
1718 #if GFX_VER >= 11 && GFX_VER < 20
1719 /* ICL PRMs, Volume 2a - Command Reference: Instructions,
1720 * 3DSTATE_SO_DECL_LIST:
1721 *
1722 * "Workaround: This command must be followed by a PIPE_CONTROL with
1723 * CS Stall bit set."
1724 *
1725 * On DG2+ also known as Wa_1509820217.
1726 */
1727 genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
1728 cmd_buffer->state.current_pipeline,
1729 ANV_PIPE_CS_STALL_BIT);
1730 #endif
1731 }
1732
1733 if (device->vk.enabled_extensions.EXT_mesh_shader) {
1734 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL)) {
1735 anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1736 final.mesh_control, protected);
1737 }
1738
1739 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER))
1740 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_shader);
1741
1742 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB))
1743 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_distrib);
1744
1745 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL)) {
1746 anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1747 final.task_control, protected);
1748 }
1749
1750 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER))
1751 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_shader);
1752
1753 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB))
1754 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_redistrib);
1755
1756 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH))
1757 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_mesh);
1758
1759 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH))
1760 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.clip_mesh);
1761 } else {
1762 assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) &&
1763 !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) &&
1764 !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) &&
1765 !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) &&
1766 !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) &&
1767 !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB) &&
1768 !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH) &&
1769 !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH));
1770 }
1771
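/* INIT() is used inside designated initializers to copy a field from the
 * tracked HW state, while SET() assigns the same field into an already
 * declared packed-instruction variable.
 */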
1772 #define INIT(category, name) \
1773 .name = hw_state->category.name
1774 #define SET(s, category, name) \
1775 s.name = hw_state->category.name
1776
1777 /* Now the potentially dynamic instructions */
1778
1779 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS)) {
1780 anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_PS),
1781 pipeline, partial.ps, ps, protected) {
1782 SET(ps, ps, KernelStartPointer0);
1783 SET(ps, ps, KernelStartPointer1);
1784 SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
1785 SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
1786
1787 #if GFX_VER < 20
1788 SET(ps, ps, KernelStartPointer2);
1789 SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
1790
1791 SET(ps, ps, _8PixelDispatchEnable);
1792 SET(ps, ps, _16PixelDispatchEnable);
1793 SET(ps, ps, _32PixelDispatchEnable);
1794 #else
1795 SET(ps, ps, Kernel0Enable);
1796 SET(ps, ps, Kernel1Enable);
1797 SET(ps, ps, Kernel0SIMDWidth);
1798 SET(ps, ps, Kernel1SIMDWidth);
1799 SET(ps, ps, Kernel0PolyPackingPolicy);
1800 #endif
1801 SET(ps, ps, PositionXYOffsetSelect);
1802 }
1803 }
1804
1805 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA)) {
1806 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS_EXTRA),
1807 pipeline, partial.ps_extra, pse) {
1808 SET(pse, ps_extra, PixelShaderHasUAV);
1809 SET(pse, ps_extra, PixelShaderIsPerSample);
1810 #if GFX_VER >= 11
1811 SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
1812 #endif
1813 #if GFX_VERx10 >= 125
1814 SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
1815 #endif
1816 SET(pse, ps_extra, PixelShaderKillsPixel);
1817 }
1818 }
1819
1820 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) {
1821 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
1822 pipeline, partial.clip, clip) {
1823 SET(clip, clip, APIMode);
1824 SET(clip, clip, ViewportXYClipTestEnable);
1825 SET(clip, clip, TriangleStripListProvokingVertexSelect);
1826 SET(clip, clip, LineStripListProvokingVertexSelect);
1827 SET(clip, clip, TriangleFanProvokingVertexSelect);
1828 SET(clip, clip, MaximumVPIndex);
1829 }
1830 }
1831
1832 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) {
1833 genX(streamout_prologue)(cmd_buffer);
1834
1835 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
1836 pipeline, partial.so, so) {
1837 SET(so, so, RenderingDisable);
1838 SET(so, so, RenderStreamSelect);
1839 SET(so, so, ReorderMode);
1840 SET(so, so, ForceRendering);
1841 }
1842 }
1843
1844 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) {
1845 struct anv_state sf_clip_state =
1846 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1847 hw_state->vp_sf_clip.count * 64, 64);
1848
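/* Each SF_CLIP_VIEWPORT entry is 64 bytes, hence the i * 64 stride used
 * when packing below.
 */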
1849 for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
1850 struct GENX(SF_CLIP_VIEWPORT) sfv = {
1851 INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
1852 INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
1853 INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
1854 INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
1855 INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
1856 INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
1857 INIT(vp_sf_clip.elem[i], XMinClipGuardband),
1858 INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
1859 INIT(vp_sf_clip.elem[i], YMinClipGuardband),
1860 INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
1861 INIT(vp_sf_clip.elem[i], XMinViewPort),
1862 INIT(vp_sf_clip.elem[i], XMaxViewPort),
1863 INIT(vp_sf_clip.elem[i], YMinViewPort),
1864 INIT(vp_sf_clip.elem[i], YMaxViewPort),
1865 };
1866 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
1867 }
1868
1869 anv_batch_emit(&cmd_buffer->batch,
1870 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
1871 clip.SFClipViewportPointer = sf_clip_state.offset;
1872 }
1873 }
1874
1875 /* Force CC_VIEWPORT reallocation on Gfx9 when reprogramming
1876 * 3DSTATE_VIEWPORT_STATE_POINTERS_CC:
1877 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
1878 */
1879 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
1880 (GFX_VER == 9 &&
1881 BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR))) {
1882 hw_state->vp_cc.state =
1883 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1884 hw_state->vp_cc.count * 8, 32);
1885
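/* Each CC_VIEWPORT entry is 8 bytes (2 DWords), hence the i * 8 stride. */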
1886 for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
1887 struct GENX(CC_VIEWPORT) cc_viewport = {
1888 INIT(vp_cc.elem[i], MinimumDepth),
1889 INIT(vp_cc.elem[i], MaximumDepth),
1890 };
1891 GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
1892 &cc_viewport);
1893 }
1894
1895 /* Dirty the pointers to reemit 3DSTATE_VIEWPORT_STATE_POINTERS_CC below
1896 */
1897 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
1898 }
1899
1900 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) {
1901 anv_batch_emit(&cmd_buffer->batch,
1902 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
1903 cc.CCViewportPointer = hw_state->vp_cc.state.offset;
1904 }
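/* Remember that a CC viewport pointer has been programmed; the
 * Wa_18020335297 handling in the flush path checks this flag.
 */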
1905 cmd_buffer->state.gfx.viewport_set = true;
1906 }
1907
1908 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) {
1909 /* Wa_1409725701:
1910 *
1911 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
1912 * stored as an array of up to 16 elements. The location of first
1913 * element of the array, as specified by Pointer to SCISSOR_RECT,
1914 * should be aligned to a 64-byte boundary."
1915 */
1916 struct anv_state scissor_state =
1917 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1918 hw_state->scissor.count * 8, 64);
1919
1920 for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
1921 struct GENX(SCISSOR_RECT) scissor = {
1922 INIT(scissor.elem[i], ScissorRectangleYMin),
1923 INIT(scissor.elem[i], ScissorRectangleXMin),
1924 INIT(scissor.elem[i], ScissorRectangleYMax),
1925 INIT(scissor.elem[i], ScissorRectangleXMax),
1926 };
1927 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
1928 }
1929
1930 anv_batch_emit(&cmd_buffer->batch,
1931 GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
1932 ssp.ScissorRectPointer = scissor_state.offset;
1933 }
1934 }
1935
1936 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) {
1937 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
1938 SET(vft, vft, PrimitiveTopologyType);
1939 }
1940 }
1941
1942 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) {
1943 const uint32_t ve_count =
1944 pipeline->vs_input_elements + pipeline->svgs_count;
1945 const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
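/* 1 DWord of 3DSTATE_VERTEX_ELEMENTS header plus 2 DWords per
 * VERTEX_ELEMENT_STATE entry (at least one entry is always emitted).
 */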
1946 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
1947 GENX(3DSTATE_VERTEX_ELEMENTS));
1948
1949 if (p) {
1950 if (ve_count == 0) {
1951 memcpy(p + 1, cmd_buffer->device->physical->empty_vs_input,
1952 sizeof(cmd_buffer->device->physical->empty_vs_input));
1953 } else if (ve_count == pipeline->vertex_input_elems) {
1954 /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so
1955 * everything is in pipeline->vertex_input_data and we can just
1956 * memcpy
1957 */
1958 memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
1959 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
1960 final.vf_instancing);
1961 } else {
1962 assert(pipeline->final.vf_instancing.len == 0);
1963 /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
1964 genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
1965 pipeline, dyn->vi, false /* emit_in_pipeline */);
1966 /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
1967 memcpy(p + 1 + 2 * pipeline->vs_input_elements,
1968 pipeline->vertex_input_data,
1969 4 * 2 * pipeline->vertex_input_elems);
1970 }
1971 }
1972 }
1973
1974 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) {
1975 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
1976 pipeline, partial.te, te) {
1977 SET(te, te, OutputTopology);
1978 }
1979 }
1980
1981 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) {
1982 anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_GS),
1983 pipeline, partial.gs, gs, protected) {
1984 SET(gs, gs, ReorderMode);
1985 }
1986 }
1987
1988 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) {
1989 #if GFX_VER == 11
1990 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) {
1991 SET(cps, cps, CoarsePixelShadingMode);
1992 SET(cps, cps, MinCPSizeX);
1993 SET(cps, cps, MinCPSizeY);
1994 }
1995 #elif GFX_VER >= 12
1996 /* TODO: we can optimize this flush in the following cases:
1997 *
1998 * In the case where the last geometry shader emits a value that is
1999 * not constant, we can avoid this stall because we can synchronize
2000 * the pixel shader internally with
2001 * 3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
2002 *
2003 * If we know that the previous pipeline and the current one are
2004 * using the same fragment shading rate.
2005 */
2006 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2007 #if GFX_VERx10 >= 125
2008 pc.PSSStallSyncEnable = true;
2009 #else
2010 pc.PSDSyncEnable = true;
2011 #endif
2012 }
2013
2014 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) {
2015 SET(cps, cps, CoarsePixelShadingStateArrayPointer);
2016 }
2017 #endif
2018 }
2019
2020 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) {
2021 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
2022 pipeline, partial.sf, sf) {
2023 SET(sf, sf, LineWidth);
2024 SET(sf, sf, TriangleStripListProvokingVertexSelect);
2025 SET(sf, sf, LineStripListProvokingVertexSelect);
2026 SET(sf, sf, TriangleFanProvokingVertexSelect);
2027 SET(sf, sf, LegacyGlobalDepthBiasEnable);
2028 }
2029 }
2030
2031 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) {
2032 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
2033 pipeline, partial.raster, raster) {
2034 SET(raster, raster, APIMode);
2035 SET(raster, raster, DXMultisampleRasterizationEnable);
2036 SET(raster, raster, AntialiasingEnable);
2037 SET(raster, raster, CullMode);
2038 SET(raster, raster, FrontWinding);
2039 SET(raster, raster, GlobalDepthOffsetEnableSolid);
2040 SET(raster, raster, GlobalDepthOffsetEnableWireframe);
2041 SET(raster, raster, GlobalDepthOffsetEnablePoint);
2042 SET(raster, raster, GlobalDepthOffsetConstant);
2043 SET(raster, raster, GlobalDepthOffsetScale);
2044 SET(raster, raster, GlobalDepthOffsetClamp);
2045 SET(raster, raster, FrontFaceFillMode);
2046 SET(raster, raster, BackFaceFillMode);
2047 SET(raster, raster, ViewportZFarClipTestEnable);
2048 SET(raster, raster, ViewportZNearClipTestEnable);
2049 SET(raster, raster, ConservativeRasterizationEnable);
2050 }
2051 }
2052
2053 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) {
2054 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_MULTISAMPLE),
2055 pipeline, partial.ms, ms) {
2056 SET(ms, ms, NumberofMultisamples);
2057 }
2058 }
2059
2060 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) {
2061 hw_state->cc.state =
2062 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2063 GENX(COLOR_CALC_STATE_length) * 4,
2064 64);
2065 struct GENX(COLOR_CALC_STATE) cc = {
2066 INIT(cc, BlendConstantColorRed),
2067 INIT(cc, BlendConstantColorGreen),
2068 INIT(cc, BlendConstantColorBlue),
2069 INIT(cc, BlendConstantColorAlpha),
2070 };
2071 GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);
2072
2073 /* Dirty the pointers to reemit 3DSTATE_CC_STATE_POINTERS below
2074 */
2075 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
2076 }
2077
2078 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR)) {
2079 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
2080 ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
2081 ccp.ColorCalcStatePointerValid = true;
2082 }
2083 }
2084
2085 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) {
2086 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
2087 SET(sm, sm, SampleMask);
2088 }
2089 }
2090
2091 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) {
2092 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
2093 SET(ds, ds, DoubleSidedStencilEnable);
2094 SET(ds, ds, StencilTestMask);
2095 SET(ds, ds, StencilWriteMask);
2096 SET(ds, ds, BackfaceStencilTestMask);
2097 SET(ds, ds, BackfaceStencilWriteMask);
2098 SET(ds, ds, StencilReferenceValue);
2099 SET(ds, ds, BackfaceStencilReferenceValue);
2100 SET(ds, ds, DepthTestEnable);
2101 SET(ds, ds, DepthBufferWriteEnable);
2102 SET(ds, ds, DepthTestFunction);
2103 SET(ds, ds, StencilTestEnable);
2104 SET(ds, ds, StencilBufferWriteEnable);
2105 SET(ds, ds, StencilFailOp);
2106 SET(ds, ds, StencilPassDepthPassOp);
2107 SET(ds, ds, StencilPassDepthFailOp);
2108 SET(ds, ds, StencilTestFunction);
2109 SET(ds, ds, BackfaceStencilFailOp);
2110 SET(ds, ds, BackfaceStencilPassDepthPassOp);
2111 SET(ds, ds, BackfaceStencilPassDepthFailOp);
2112 SET(ds, ds, BackfaceStencilTestFunction);
2113 }
2114 }
2115
2116 #if GFX_VER >= 12
2117 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) {
2118 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
2119 SET(db, db, DepthBoundsTestEnable);
2120 SET(db, db, DepthBoundsTestMinValue);
2121 SET(db, db, DepthBoundsTestMaxValue);
2122 }
2123 }
2124 #endif
2125
2126 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) {
2127 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
2128 SET(ls, ls, LineStipplePattern);
2129 SET(ls, ls, LineStippleInverseRepeatCount);
2130 SET(ls, ls, LineStippleRepeatCount);
2131 }
2132 #if GFX_VER >= 11
2133 /* ICL PRMs, Volume 2a - Command Reference: Instructions,
2134 * 3DSTATE_LINE_STIPPLE:
2135 *
2136 * "Workaround: This command must be followed by a PIPE_CONTROL with
2137 * CS Stall bit set."
2138 */
2139 genx_batch_emit_pipe_control(&cmd_buffer->batch,
2140 cmd_buffer->device->info,
2141 cmd_buffer->state.current_pipeline,
2142 ANV_PIPE_CS_STALL_BIT);
2143 #endif
2144 }
2145
2146 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) {
2147 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
2148 #if GFX_VERx10 >= 125
2149 vf.GeometryDistributionEnable = true;
2150 #endif
2151 SET(vf, vf, IndexedDrawCutIndexEnable);
2152 SET(vf, vf, CutIndex);
2153 }
2154 }
2155
2156 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) {
2157 struct anv_buffer *buffer = gfx->index_buffer;
2158 uint32_t offset = gfx->index_offset;
2159 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
2160 ib.IndexFormat = gfx->index_type;
2161 ib.MOCS = anv_mocs(cmd_buffer->device,
2162 buffer ? buffer->address.bo : NULL,
2163 ISL_SURF_USAGE_INDEX_BUFFER_BIT);
2164 #if GFX_VER >= 12
2165 ib.L3BypassDisable = true;
2166 #endif
2167 if (buffer) {
2168 ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
2169 ib.BufferSize = gfx->index_size;
2170 }
2171 }
2172 }
2173
2174 #if GFX_VERx10 >= 125
2175 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) {
2176 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
2177 pipeline, partial.vfg, vfg) {
2178 SET(vfg, vfg, ListCutIndexEnable);
2179 }
2180 }
2181 #endif
2182
2183 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) {
2184 genX(emit_sample_pattern)(&cmd_buffer->batch,
2185 dyn->ms.sample_locations_enable ?
2186 dyn->ms.sample_locations : NULL);
2187 }
2188
2189 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) {
2190 anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
2191 pipeline, partial.wm, wm) {
2192 SET(wm, wm, LineStippleEnable);
2193 SET(wm, wm, BarycentricInterpolationMode);
2194 }
2195 }
2196
2197 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) {
2198 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) {
2199 SET(blend, ps_blend, HasWriteableRT);
2200 SET(blend, ps_blend, ColorBufferBlendEnable);
2201 SET(blend, ps_blend, SourceAlphaBlendFactor);
2202 SET(blend, ps_blend, DestinationAlphaBlendFactor);
2203 SET(blend, ps_blend, SourceBlendFactor);
2204 SET(blend, ps_blend, DestinationBlendFactor);
2205 SET(blend, ps_blend, AlphaTestEnable);
2206 SET(blend, ps_blend, IndependentAlphaBlendEnable);
2207 SET(blend, ps_blend, AlphaToCoverageEnable);
2208 }
2209 }
2210
2211 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE)) {
2212 const uint32_t num_dwords = GENX(BLEND_STATE_length) +
2213 GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
2214 hw_state->blend.state =
2215 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2216 num_dwords * 4,
2217 64);
2218
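/* The blend state is a BLEND_STATE header immediately followed by one
 * BLEND_STATE_ENTRY per render target; dws walks through that layout.
 */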
2219 uint32_t *dws = hw_state->blend.state.map;
2220
2221 struct GENX(BLEND_STATE) blend_state = {
2222 INIT(blend, AlphaToCoverageEnable),
2223 INIT(blend, AlphaToOneEnable),
2224 INIT(blend, IndependentAlphaBlendEnable),
2225 INIT(blend, ColorDitherEnable),
2226 };
2227 GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);
2228
2229 /* Jump to blend entries. */
2230 dws += GENX(BLEND_STATE_length);
2231 for (uint32_t i = 0; i < MAX_RTS; i++) {
2232 struct GENX(BLEND_STATE_ENTRY) entry = {
2233 INIT(blend.rts[i], WriteDisableAlpha),
2234 INIT(blend.rts[i], WriteDisableRed),
2235 INIT(blend.rts[i], WriteDisableGreen),
2236 INIT(blend.rts[i], WriteDisableBlue),
2237 INIT(blend.rts[i], LogicOpFunction),
2238 INIT(blend.rts[i], LogicOpEnable),
2239 INIT(blend.rts[i], ColorBufferBlendEnable),
2240 INIT(blend.rts[i], ColorClampRange),
2241 INIT(blend.rts[i], PreBlendColorClampEnable),
2242 INIT(blend.rts[i], PostBlendColorClampEnable),
2243 INIT(blend.rts[i], SourceBlendFactor),
2244 INIT(blend.rts[i], DestinationBlendFactor),
2245 INIT(blend.rts[i], ColorBlendFunction),
2246 INIT(blend.rts[i], SourceAlphaBlendFactor),
2247 INIT(blend.rts[i], DestinationAlphaBlendFactor),
2248 INIT(blend.rts[i], AlphaBlendFunction),
2249 };
2250
2251 GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
2252 dws += GENX(BLEND_STATE_ENTRY_length);
2253 }
2254
2255 /* Dirty the pointers to reemit 3DSTATE_BLEND_STATE_POINTERS below */
2256 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
2257 }
2258
2259 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR)) {
2260 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
2261 bsp.BlendStatePointer = hw_state->blend.state.offset;
2262 bsp.BlendStatePointerValid = true;
2263 }
2264 }
2265
2266 #if INTEL_WA_18019816803_GFX_VER
2267 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) {
2268 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
2269 cmd_buffer->state.current_pipeline,
2270 ANV_PIPE_PSS_STALL_SYNC_BIT);
2271 }
2272 #endif
2273
2274 #if INTEL_WA_14018283232_GFX_VER
2275 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_14018283232))
2276 genX(batch_emit_wa_14018283232)(&cmd_buffer->batch);
2277 #endif
2278
2279 #if GFX_VER == 9
2280 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX))
2281 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
2282 #endif
2283
2284 #if GFX_VERx10 >= 125
2285 if (hw_state->use_tbimr &&
2286 BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TBIMR_TILE_PASS_INFO)) {
2287 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO),
2288 tbimr) {
2289 SET(tbimr, tbimr, TileRectangleHeight);
2290 SET(tbimr, tbimr, TileRectangleWidth);
2291 SET(tbimr, tbimr, VerticalTileCount);
2292 SET(tbimr, tbimr, HorizontalTileCount);
2293 SET(tbimr, tbimr, TBIMRBatchSize);
2294 SET(tbimr, tbimr, TileBoxCheck);
2295 }
2296 }
2297 #endif
2298
2299 #undef INIT
2300 #undef SET
2301
2302 BITSET_ZERO(hw_state->dirty);
2303 }
2304
2305 /**
2306 * This function handles possible state workarounds and emits the dirty
2307 * instructions to the batch buffer.
2308 */
2309 void
2310 genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
2311 {
2312 struct anv_device *device = cmd_buffer->device;
2313 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2314 struct anv_graphics_pipeline *pipeline =
2315 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2316 struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
2317
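/* With the INTEL_DEBUG reemit option, OR in every gfx instruction bit so
 * the full state is reemitted on each flush.
 */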
2318 if (INTEL_DEBUG(DEBUG_REEMIT)) {
2319 BITSET_OR(gfx->dyn_state.dirty, gfx->dyn_state.dirty,
2320 device->gfx_dirty_state);
2321 }
2322
2323 /**
2324 * Put potential workarounds here if you need to reemit an instruction
2325 * because another one is changing.
2326 */
2327
2328 /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
2329 * it afterwards.
2330 */
2331 if (intel_needs_workaround(device->info, 16011773973) &&
2332 pipeline->uses_xfb &&
2333 BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
2334 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2335 }
2336
2337 /* Gfx11 undocumented issue:
2338 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
2339 */
2340 #if GFX_VER == 11
2341 if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE))
2342 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
2343 #endif
2344
2345 /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
2346 if (intel_needs_workaround(device->info, 18020335297) &&
2347 (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
2348 BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) &&
2349 cmd_buffer->state.gfx.viewport_set) {
2350 /* For mesh, we implement the WA using CS stall. This is for
2351 * simplicity and takes care of possible interaction with Wa_16014390852.
2352 */
2353 if (anv_pipeline_is_mesh(pipeline)) {
2354 genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
2355 _3D, ANV_PIPE_CS_STALL_BIT);
2356 } else {
2357 /* Mask off all instructions that the dummy draw below will program. */
2358 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VFG);
2359 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF);
2360 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2361 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_RASTER);
2362 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2363 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2364 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2365 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_CLIP);
2366 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2367 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2368 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2369
2370 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VS);
2371 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_GS);
2372 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
2373 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_TE);
2374 BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);
2375
2376 cmd_buffer_gfx_state_emission(cmd_buffer);
2377
2378 emit_wa_18020335297_dummy_draw(cmd_buffer);
2379
2380 /* Dirty all state emitted by the WA to make sure that the current
2381 * real state is restored afterwards.
2382 */
2383 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VFG);
2384 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
2385 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2386 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
2387 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2388 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2389 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2390 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
2391 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2392 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2393 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2394
2395 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
2396 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
2397 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
2398 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
2399 BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
2400 }
2401 }
2402
2403 cmd_buffer_gfx_state_emission(cmd_buffer);
2404 }
2405
2406 void
2407 genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
2408 {
2409 if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
2410 return;
2411
2412 if (cmd_buffer->state.pma_fix_enabled == enable)
2413 return;
2414
2415 cmd_buffer->state.pma_fix_enabled = enable;
2416
2417 /* According to the Broadwell PIPE_CONTROL documentation, software should
2418 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2419 * prior to the LRI. If stencil buffer writes are enabled, then a Render
2420 * Cache Flush is also necessary.
2421 *
2422 * The Skylake docs say to use a depth stall rather than a command
2423 * streamer stall. However, the hardware seems to violently disagree.
2424 * A full command streamer stall seems to be needed in both cases.
2425 */
2426 genx_batch_emit_pipe_control
2427 (&cmd_buffer->batch, cmd_buffer->device->info,
2428 cmd_buffer->state.current_pipeline,
2429 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2430 ANV_PIPE_CS_STALL_BIT |
2431 #if GFX_VER >= 12
2432 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2433 #endif
2434 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2435
2436 #if GFX_VER == 9
2437 uint32_t cache_mode;
2438 anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
2439 .STCPMAOptimizationEnable = enable,
2440 .STCPMAOptimizationEnableMask = true);
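/* CACHE_MODE_0 is a masked register: the Mask bit set above is what lets
 * the LRI update just the STC PMA optimization bit.
 */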
2441 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2442 lri.RegisterOffset = GENX(CACHE_MODE_0_num);
2443 lri.DataDWord = cache_mode;
2444 }
2445
2446 #endif /* GFX_VER == 9 */
2447
2448 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2449 * Flush bits is often necessary. We do it regardless because it's easier.
2450 * The render cache flush is also necessary if stencil writes are enabled.
2451 *
2452 * Again, the Skylake docs give a different set of flushes but the BDW
2453 * flushes seem to work just as well.
2454 */
2455 genx_batch_emit_pipe_control
2456 (&cmd_buffer->batch, cmd_buffer->device->info,
2457 cmd_buffer->state.current_pipeline,
2458 ANV_PIPE_DEPTH_STALL_BIT |
2459 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2460 #if GFX_VER >= 12
2461 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2462 #endif
2463 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2464 }
2465